docs: description of supported formats and backends (#788)
* chore: remove type-ignore marks for attaching text to non GroupItems After commit b74208 of docling-core, text items can be attached to any NodeItem and therefore the ignore[arg-type] type marks can be removed. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * test: remove unnecessary imports Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * docs: add documentation on supported formats and backends Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * docs: add notebook example with XML backends Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
3be2fb581f
commit
c2ae1cc4ca
@ -389,7 +389,7 @@ class PatentUsptoIce(PatentUspto):
|
||||
if name == self.Element.TITLE.value:
|
||||
if text:
|
||||
self.parents[self.level + 1] = self.doc.add_title(
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
text=text,
|
||||
)
|
||||
self.level += 1
|
||||
@ -406,7 +406,7 @@ class PatentUsptoIce(PatentUspto):
|
||||
abstract_item = self.doc.add_heading(
|
||||
heading_text,
|
||||
level=heading_level,
|
||||
parent=self.parents[heading_level], # type: ignore[arg-type]
|
||||
parent=self.parents[heading_level],
|
||||
)
|
||||
self.doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
@ -434,7 +434,7 @@ class PatentUsptoIce(PatentUspto):
|
||||
claims_item = self.doc.add_heading(
|
||||
heading_text,
|
||||
level=heading_level,
|
||||
parent=self.parents[heading_level], # type: ignore[arg-type]
|
||||
parent=self.parents[heading_level],
|
||||
)
|
||||
for text in self.claims:
|
||||
self.doc.add_text(
|
||||
@ -452,7 +452,7 @@ class PatentUsptoIce(PatentUspto):
|
||||
self.doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
text=text,
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
self.text = ""
|
||||
|
||||
@ -460,7 +460,7 @@ class PatentUsptoIce(PatentUspto):
|
||||
self.parents[self.level + 1] = self.doc.add_heading(
|
||||
text=text,
|
||||
level=self.level,
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
self.level += 1
|
||||
self.text = ""
|
||||
@ -470,7 +470,7 @@ class PatentUsptoIce(PatentUspto):
|
||||
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
|
||||
self.doc.add_table(
|
||||
data=empty_table,
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
|
||||
def _apply_style(self, text: str, style_tag: str) -> str:
|
||||
@ -721,7 +721,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
||||
if self.Element.TITLE.value in self.property and text.strip():
|
||||
title = text.strip()
|
||||
self.parents[self.level + 1] = self.doc.add_title(
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
text=title,
|
||||
)
|
||||
self.level += 1
|
||||
@ -749,7 +749,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
||||
self.parents[self.level + 1] = self.doc.add_heading(
|
||||
text=text.strip(),
|
||||
level=self.level,
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
self.level += 1
|
||||
|
||||
@ -769,7 +769,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
||||
claims_item = self.doc.add_heading(
|
||||
heading_text,
|
||||
level=heading_level,
|
||||
parent=self.parents[heading_level], # type: ignore[arg-type]
|
||||
parent=self.parents[heading_level],
|
||||
)
|
||||
for text in self.claims:
|
||||
self.doc.add_text(
|
||||
@ -787,7 +787,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
||||
abstract_item = self.doc.add_heading(
|
||||
heading_text,
|
||||
level=heading_level,
|
||||
parent=self.parents[heading_level], # type: ignore[arg-type]
|
||||
parent=self.parents[heading_level],
|
||||
)
|
||||
self.doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
|
||||
@ -799,7 +799,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
||||
self.doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
text=paragraph,
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
elif self.Element.CLAIM.value in self.property:
|
||||
# we may need a space after a paragraph in claim text
|
||||
@ -811,7 +811,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
||||
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
|
||||
self.doc.add_table(
|
||||
data=empty_table,
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
|
||||
def _apply_style(self, text: str, style_tag: str) -> str:
|
||||
@ -938,7 +938,7 @@ class PatentUsptoGrantAps(PatentUspto):
|
||||
self.parents[self.level + 1] = self.doc.add_heading(
|
||||
heading.value,
|
||||
level=self.level,
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
self.level += 1
|
||||
|
||||
@ -959,7 +959,7 @@ class PatentUsptoGrantAps(PatentUspto):
|
||||
|
||||
if field == self.Field.TITLE.value:
|
||||
self.parents[self.level + 1] = self.doc.add_title(
|
||||
parent=self.parents[self.level], text=value # type: ignore[arg-type]
|
||||
parent=self.parents[self.level], text=value
|
||||
)
|
||||
self.level += 1
|
||||
|
||||
@ -971,14 +971,14 @@ class PatentUsptoGrantAps(PatentUspto):
|
||||
self.doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
text=value,
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
|
||||
elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
|
||||
self.doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
text="",
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
|
||||
elif (
|
||||
@ -996,7 +996,7 @@ class PatentUsptoGrantAps(PatentUspto):
|
||||
last_claim = self.doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
text="",
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
|
||||
last_claim.text += f" {value}" if last_claim.text else value
|
||||
@ -1012,7 +1012,7 @@ class PatentUsptoGrantAps(PatentUspto):
|
||||
self.parents[self.level + 1] = self.doc.add_heading(
|
||||
value,
|
||||
level=self.level,
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
self.level += 1
|
||||
|
||||
@ -1029,7 +1029,7 @@ class PatentUsptoGrantAps(PatentUspto):
|
||||
self.doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
text=value,
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
|
||||
def parse(self, patent_content: str) -> Optional[DoclingDocument]:
|
||||
@ -1283,7 +1283,7 @@ class PatentUsptoAppV1(PatentUspto):
|
||||
title = text.strip()
|
||||
if title:
|
||||
self.parents[self.level + 1] = self.doc.add_text(
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
label=DocItemLabel.TITLE,
|
||||
text=title,
|
||||
)
|
||||
@ -1301,7 +1301,7 @@ class PatentUsptoAppV1(PatentUspto):
|
||||
abstract_item = self.doc.add_heading(
|
||||
heading_text,
|
||||
level=heading_level,
|
||||
parent=self.parents[heading_level], # type: ignore[arg-type]
|
||||
parent=self.parents[heading_level],
|
||||
)
|
||||
self.doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
@ -1331,7 +1331,7 @@ class PatentUsptoAppV1(PatentUspto):
|
||||
claims_item = self.doc.add_heading(
|
||||
heading_text,
|
||||
level=heading_level,
|
||||
parent=self.parents[heading_level], # type: ignore[arg-type]
|
||||
parent=self.parents[heading_level],
|
||||
)
|
||||
for text in self.claims:
|
||||
self.doc.add_text(
|
||||
@ -1350,14 +1350,14 @@ class PatentUsptoAppV1(PatentUspto):
|
||||
self.parents[self.level + 1] = self.doc.add_heading(
|
||||
text=text,
|
||||
level=self.level,
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
self.level += 1
|
||||
else:
|
||||
self.doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
text=text,
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
self.text = ""
|
||||
|
||||
@ -1366,7 +1366,7 @@ class PatentUsptoAppV1(PatentUspto):
|
||||
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
|
||||
self.doc.add_table(
|
||||
data=empty_table,
|
||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
|
||||
def _apply_style(self, text: str, style_tag: str) -> str:
|
||||
|
1078
docs/examples/backend_xml_rag.ipynb
Normal file
1078
docs/examples/backend_xml_rag.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
@ -24,6 +24,20 @@ docling https://arxiv.org/pdf/2206.01062
|
||||
|
||||
To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md).
|
||||
|
||||
### Supported formats
|
||||
|
||||
The document conversion in Docling supports several popular formats, including:
|
||||
|
||||
- **PDF** (Portable Document Format): the format developed by Adobe to present documents consistently across application software, hardware, and operating systems.
|
||||
- **.docx**, **.xlsx**, **.pptx** (Word, Excel, and PowerPoint): the Open XML formats supported by Microsoft Office.
|
||||
- **Markdown**: a lightweight markup language to add formatting elements to plain text documents.
|
||||
- **AsciiDoc**: a plain text markup language for writing technical content.
|
||||
- **HTML** (Hypertext Markup Language): the standard markup language for creating web pages.
|
||||
- **XHTML** (Extensible Hypertext Markup Language): the XML-based version of HTML.
|
||||
- **XML** (Extensible Markup Language): a markup format for storing and transmitting data. Due to its flexibility, Docling requires custom implementations to identify the
|
||||
semantics of the data. Currently, Docling supports the parsing of [USPTO](https://www.uspto.gov/patents) patents and [PubMed Central® (PMC)](https://pmc.ncbi.nlm.nih.gov/) articles.
|
||||
|
||||
|
||||
### Advanced options
|
||||
|
||||
#### Adjust pipeline features
|
||||
@ -126,6 +140,32 @@ result = converter.convert(source)
|
||||
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
|
||||
|
||||
|
||||
#### Use specific backend converters
|
||||
|
||||
By default, Docling will try to identify the document format to apply the appropriate conversion backend (see the list of [supported formats](#supported-formats)).
|
||||
You can restrict the `DocumentConverter` to a set of allowed document formats, as shown in the [Multi-format conversion](./examples/run_with_formats.py) example.
|
||||
Alternatively, you can also use the specific backend that matches your document content. For instance, you can use `HTMLDocumentBackend` for HTML pages:
|
||||
|
||||
```python
|
||||
import urllib.request
|
||||
from io import BytesIO
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
url = "https://en.wikipedia.org/wiki/Duck"
|
||||
text = urllib.request.urlopen(url).read()
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=BytesIO(text),
|
||||
format=InputFormat.HTML,
|
||||
backend=HTMLDocumentBackend,
|
||||
filename="duck.html",
|
||||
)
|
||||
backend = HTMLDocumentBackend(in_doc=in_doc, path_or_stream=BytesIO(text))
|
||||
result = backend.convert()
|
||||
print(result.export_to_markdown())
|
||||
```
|
||||
|
||||
## Chunking
|
||||
|
||||
You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a
|
||||
|
@ -77,7 +77,8 @@ nav:
|
||||
- "Force full page OCR": examples/full_page_ocr.py
|
||||
- "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
|
||||
- "Accelerator options": examples/run_with_accelerator.py
|
||||
- "Simple translation": examples/translate.py
|
||||
- "Simple translation": examples/translate.py
|
||||
- examples/backend_xml_rag.ipynb
|
||||
- ✂️ Chunking:
|
||||
- examples/hybrid_chunking.ipynb
|
||||
- 🤖 RAG with AI dev frameworks:
|
||||
|
@ -2,13 +2,8 @@ import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
InputDocument,
|
||||
SectionHeaderItem,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
GENERATE = False
|
||||
|
@ -3,23 +3,16 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from tempfile import NamedTemporaryFile
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
from docling_core.types import DoclingDocument
|
||||
from docling_core.types.doc import DocItemLabel, TableData, TextItem
|
||||
|
||||
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
InputDocument,
|
||||
SectionHeaderItem,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
GENERATE: bool = True
|
||||
DATA_PATH: Path = Path("./tests/data/uspto/")
|
||||
|
@ -1,5 +1,4 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
Loading…
Reference in New Issue
Block a user