docs: description of supported formats and backends (#788)
* chore: remove type-ignore marks for attaching text to non GroupItems After commit b74208 of docling-core, text items can be attached to any NodeItem and therefore the ignore[arg-type] type marks can be removed. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * test: remove unnecessary imports Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * docs: add documentation on supported formats and backends Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * docs: add notebook example with XML backends Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
3be2fb581f
commit
c2ae1cc4ca
@ -389,7 +389,7 @@ class PatentUsptoIce(PatentUspto):
|
|||||||
if name == self.Element.TITLE.value:
|
if name == self.Element.TITLE.value:
|
||||||
if text:
|
if text:
|
||||||
self.parents[self.level + 1] = self.doc.add_title(
|
self.parents[self.level + 1] = self.doc.add_title(
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
text=text,
|
text=text,
|
||||||
)
|
)
|
||||||
self.level += 1
|
self.level += 1
|
||||||
@ -406,7 +406,7 @@ class PatentUsptoIce(PatentUspto):
|
|||||||
abstract_item = self.doc.add_heading(
|
abstract_item = self.doc.add_heading(
|
||||||
heading_text,
|
heading_text,
|
||||||
level=heading_level,
|
level=heading_level,
|
||||||
parent=self.parents[heading_level], # type: ignore[arg-type]
|
parent=self.parents[heading_level],
|
||||||
)
|
)
|
||||||
self.doc.add_text(
|
self.doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
@ -434,7 +434,7 @@ class PatentUsptoIce(PatentUspto):
|
|||||||
claims_item = self.doc.add_heading(
|
claims_item = self.doc.add_heading(
|
||||||
heading_text,
|
heading_text,
|
||||||
level=heading_level,
|
level=heading_level,
|
||||||
parent=self.parents[heading_level], # type: ignore[arg-type]
|
parent=self.parents[heading_level],
|
||||||
)
|
)
|
||||||
for text in self.claims:
|
for text in self.claims:
|
||||||
self.doc.add_text(
|
self.doc.add_text(
|
||||||
@ -452,7 +452,7 @@ class PatentUsptoIce(PatentUspto):
|
|||||||
self.doc.add_text(
|
self.doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
text=text,
|
text=text,
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
)
|
)
|
||||||
self.text = ""
|
self.text = ""
|
||||||
|
|
||||||
@ -460,7 +460,7 @@ class PatentUsptoIce(PatentUspto):
|
|||||||
self.parents[self.level + 1] = self.doc.add_heading(
|
self.parents[self.level + 1] = self.doc.add_heading(
|
||||||
text=text,
|
text=text,
|
||||||
level=self.level,
|
level=self.level,
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
)
|
)
|
||||||
self.level += 1
|
self.level += 1
|
||||||
self.text = ""
|
self.text = ""
|
||||||
@ -470,7 +470,7 @@ class PatentUsptoIce(PatentUspto):
|
|||||||
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
|
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
|
||||||
self.doc.add_table(
|
self.doc.add_table(
|
||||||
data=empty_table,
|
data=empty_table,
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
)
|
)
|
||||||
|
|
||||||
def _apply_style(self, text: str, style_tag: str) -> str:
|
def _apply_style(self, text: str, style_tag: str) -> str:
|
||||||
@ -721,7 +721,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|||||||
if self.Element.TITLE.value in self.property and text.strip():
|
if self.Element.TITLE.value in self.property and text.strip():
|
||||||
title = text.strip()
|
title = text.strip()
|
||||||
self.parents[self.level + 1] = self.doc.add_title(
|
self.parents[self.level + 1] = self.doc.add_title(
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
text=title,
|
text=title,
|
||||||
)
|
)
|
||||||
self.level += 1
|
self.level += 1
|
||||||
@ -749,7 +749,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|||||||
self.parents[self.level + 1] = self.doc.add_heading(
|
self.parents[self.level + 1] = self.doc.add_heading(
|
||||||
text=text.strip(),
|
text=text.strip(),
|
||||||
level=self.level,
|
level=self.level,
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
)
|
)
|
||||||
self.level += 1
|
self.level += 1
|
||||||
|
|
||||||
@ -769,7 +769,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|||||||
claims_item = self.doc.add_heading(
|
claims_item = self.doc.add_heading(
|
||||||
heading_text,
|
heading_text,
|
||||||
level=heading_level,
|
level=heading_level,
|
||||||
parent=self.parents[heading_level], # type: ignore[arg-type]
|
parent=self.parents[heading_level],
|
||||||
)
|
)
|
||||||
for text in self.claims:
|
for text in self.claims:
|
||||||
self.doc.add_text(
|
self.doc.add_text(
|
||||||
@ -787,7 +787,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|||||||
abstract_item = self.doc.add_heading(
|
abstract_item = self.doc.add_heading(
|
||||||
heading_text,
|
heading_text,
|
||||||
level=heading_level,
|
level=heading_level,
|
||||||
parent=self.parents[heading_level], # type: ignore[arg-type]
|
parent=self.parents[heading_level],
|
||||||
)
|
)
|
||||||
self.doc.add_text(
|
self.doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
|
label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
|
||||||
@ -799,7 +799,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|||||||
self.doc.add_text(
|
self.doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
text=paragraph,
|
text=paragraph,
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
)
|
)
|
||||||
elif self.Element.CLAIM.value in self.property:
|
elif self.Element.CLAIM.value in self.property:
|
||||||
# we may need a space after a paragraph in claim text
|
# we may need a space after a paragraph in claim text
|
||||||
@ -811,7 +811,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|||||||
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
|
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
|
||||||
self.doc.add_table(
|
self.doc.add_table(
|
||||||
data=empty_table,
|
data=empty_table,
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
)
|
)
|
||||||
|
|
||||||
def _apply_style(self, text: str, style_tag: str) -> str:
|
def _apply_style(self, text: str, style_tag: str) -> str:
|
||||||
@ -938,7 +938,7 @@ class PatentUsptoGrantAps(PatentUspto):
|
|||||||
self.parents[self.level + 1] = self.doc.add_heading(
|
self.parents[self.level + 1] = self.doc.add_heading(
|
||||||
heading.value,
|
heading.value,
|
||||||
level=self.level,
|
level=self.level,
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
)
|
)
|
||||||
self.level += 1
|
self.level += 1
|
||||||
|
|
||||||
@ -959,7 +959,7 @@ class PatentUsptoGrantAps(PatentUspto):
|
|||||||
|
|
||||||
if field == self.Field.TITLE.value:
|
if field == self.Field.TITLE.value:
|
||||||
self.parents[self.level + 1] = self.doc.add_title(
|
self.parents[self.level + 1] = self.doc.add_title(
|
||||||
parent=self.parents[self.level], text=value # type: ignore[arg-type]
|
parent=self.parents[self.level], text=value
|
||||||
)
|
)
|
||||||
self.level += 1
|
self.level += 1
|
||||||
|
|
||||||
@ -971,14 +971,14 @@ class PatentUsptoGrantAps(PatentUspto):
|
|||||||
self.doc.add_text(
|
self.doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
text=value,
|
text=value,
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
)
|
)
|
||||||
|
|
||||||
elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
|
elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
|
||||||
self.doc.add_text(
|
self.doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
text="",
|
text="",
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
)
|
)
|
||||||
|
|
||||||
elif (
|
elif (
|
||||||
@ -996,7 +996,7 @@ class PatentUsptoGrantAps(PatentUspto):
|
|||||||
last_claim = self.doc.add_text(
|
last_claim = self.doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
text="",
|
text="",
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
)
|
)
|
||||||
|
|
||||||
last_claim.text += f" {value}" if last_claim.text else value
|
last_claim.text += f" {value}" if last_claim.text else value
|
||||||
@ -1012,7 +1012,7 @@ class PatentUsptoGrantAps(PatentUspto):
|
|||||||
self.parents[self.level + 1] = self.doc.add_heading(
|
self.parents[self.level + 1] = self.doc.add_heading(
|
||||||
value,
|
value,
|
||||||
level=self.level,
|
level=self.level,
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
)
|
)
|
||||||
self.level += 1
|
self.level += 1
|
||||||
|
|
||||||
@ -1029,7 +1029,7 @@ class PatentUsptoGrantAps(PatentUspto):
|
|||||||
self.doc.add_text(
|
self.doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
text=value,
|
text=value,
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
)
|
)
|
||||||
|
|
||||||
def parse(self, patent_content: str) -> Optional[DoclingDocument]:
|
def parse(self, patent_content: str) -> Optional[DoclingDocument]:
|
||||||
@ -1283,7 +1283,7 @@ class PatentUsptoAppV1(PatentUspto):
|
|||||||
title = text.strip()
|
title = text.strip()
|
||||||
if title:
|
if title:
|
||||||
self.parents[self.level + 1] = self.doc.add_text(
|
self.parents[self.level + 1] = self.doc.add_text(
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
label=DocItemLabel.TITLE,
|
label=DocItemLabel.TITLE,
|
||||||
text=title,
|
text=title,
|
||||||
)
|
)
|
||||||
@ -1301,7 +1301,7 @@ class PatentUsptoAppV1(PatentUspto):
|
|||||||
abstract_item = self.doc.add_heading(
|
abstract_item = self.doc.add_heading(
|
||||||
heading_text,
|
heading_text,
|
||||||
level=heading_level,
|
level=heading_level,
|
||||||
parent=self.parents[heading_level], # type: ignore[arg-type]
|
parent=self.parents[heading_level],
|
||||||
)
|
)
|
||||||
self.doc.add_text(
|
self.doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
@ -1331,7 +1331,7 @@ class PatentUsptoAppV1(PatentUspto):
|
|||||||
claims_item = self.doc.add_heading(
|
claims_item = self.doc.add_heading(
|
||||||
heading_text,
|
heading_text,
|
||||||
level=heading_level,
|
level=heading_level,
|
||||||
parent=self.parents[heading_level], # type: ignore[arg-type]
|
parent=self.parents[heading_level],
|
||||||
)
|
)
|
||||||
for text in self.claims:
|
for text in self.claims:
|
||||||
self.doc.add_text(
|
self.doc.add_text(
|
||||||
@ -1350,14 +1350,14 @@ class PatentUsptoAppV1(PatentUspto):
|
|||||||
self.parents[self.level + 1] = self.doc.add_heading(
|
self.parents[self.level + 1] = self.doc.add_heading(
|
||||||
text=text,
|
text=text,
|
||||||
level=self.level,
|
level=self.level,
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
)
|
)
|
||||||
self.level += 1
|
self.level += 1
|
||||||
else:
|
else:
|
||||||
self.doc.add_text(
|
self.doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
text=text,
|
text=text,
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
)
|
)
|
||||||
self.text = ""
|
self.text = ""
|
||||||
|
|
||||||
@ -1366,7 +1366,7 @@ class PatentUsptoAppV1(PatentUspto):
|
|||||||
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
|
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
|
||||||
self.doc.add_table(
|
self.doc.add_table(
|
||||||
data=empty_table,
|
data=empty_table,
|
||||||
parent=self.parents[self.level], # type: ignore[arg-type]
|
parent=self.parents[self.level],
|
||||||
)
|
)
|
||||||
|
|
||||||
def _apply_style(self, text: str, style_tag: str) -> str:
|
def _apply_style(self, text: str, style_tag: str) -> str:
|
||||||
|
1078
docs/examples/backend_xml_rag.ipynb
Normal file
1078
docs/examples/backend_xml_rag.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
@ -24,6 +24,20 @@ docling https://arxiv.org/pdf/2206.01062
|
|||||||
|
|
||||||
To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md).
|
To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md).
|
||||||
|
|
||||||
|
### Supported formats
|
||||||
|
|
||||||
|
The document conversion in Docling supports several popular formats, including:
|
||||||
|
|
||||||
|
- **PDF** (Portable Document Format): the format developed by Adobe to present documents consistently across application software, hardware, and operating systems.
|
||||||
|
- **.docx**, **.xlsx**, **.pptx** (Word, Excel, and PowerPoint): the Open XML formats supported by Microsoft Office.
|
||||||
|
- **Markdown**: a lightweight markup language to add formatting elements to plain text documents.
|
||||||
|
- **AsciiDoc**: a plain text markup language for writing technical content.
|
||||||
|
- **HTML** (Hypertext Markup Language): the standard markup language for creating web pages.
|
||||||
|
- **XHTML** (Extensible Hypertext Markup Language): the XML-based version of HTML.
|
||||||
|
- **XML** (Extensible Markup Language): a markup format for storing and transmitting data. Because XML is so flexible, Docling requires custom implementations to identify the
|
||||||
|
semantics of the data. Currently, Docling supports the parsing of [USPTO](https://www.uspto.gov/patents) patents and [PubMed Central® (PMC)](https://pmc.ncbi.nlm.nih.gov/) articles.
|
||||||
|
|
||||||
|
|
||||||
### Advanced options
|
### Advanced options
|
||||||
|
|
||||||
#### Adjust pipeline features
|
#### Adjust pipeline features
|
||||||
@ -126,6 +140,32 @@ result = converter.convert(source)
|
|||||||
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
|
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
|
||||||
|
|
||||||
|
|
||||||
|
#### Use specific backend converters
|
||||||
|
|
||||||
|
By default, Docling will try to identify the document format to apply the appropriate conversion backend (see the list of [supported formats](#supported-formats)).
|
||||||
|
You can restrict the `DocumentConverter` to a set of allowed document formats, as shown in the [Multi-format conversion](./examples/run_with_formats.py) example.
|
||||||
|
Alternatively, you can also use the specific backend that matches your document content. For instance, you can use `HTMLDocumentBackend` for HTML pages:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import urllib.request
|
||||||
|
from io import BytesIO
|
||||||
|
from docling.backend.html_backend import HTMLDocumentBackend
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
url = "https://en.wikipedia.org/wiki/Duck"
|
||||||
|
text = urllib.request.urlopen(url).read()
|
||||||
|
in_doc = InputDocument(
|
||||||
|
path_or_stream=BytesIO(text),
|
||||||
|
format=InputFormat.HTML,
|
||||||
|
backend=HTMLDocumentBackend,
|
||||||
|
filename="duck.html",
|
||||||
|
)
|
||||||
|
backend = HTMLDocumentBackend(in_doc=in_doc, path_or_stream=BytesIO(text))
|
||||||
|
result = backend.convert()
|
||||||
|
print(result.export_to_markdown())
|
||||||
|
```
|
||||||
|
|
||||||
## Chunking
|
## Chunking
|
||||||
|
|
||||||
You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a
|
You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a
|
||||||
|
@ -77,7 +77,8 @@ nav:
|
|||||||
- "Force full page OCR": examples/full_page_ocr.py
|
- "Force full page OCR": examples/full_page_ocr.py
|
||||||
- "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
|
- "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
|
||||||
- "Accelerator options": examples/run_with_accelerator.py
|
- "Accelerator options": examples/run_with_accelerator.py
|
||||||
- "Simple translation": examples/translate.py
|
- "Simple translation": examples/translate.py
|
||||||
|
- examples/backend_xml_rag.ipynb
|
||||||
- ✂️ Chunking:
|
- ✂️ Chunking:
|
||||||
- examples/hybrid_chunking.ipynb
|
- examples/hybrid_chunking.ipynb
|
||||||
- 🤖 RAG with AI dev frameworks:
|
- 🤖 RAG with AI dev frameworks:
|
||||||
|
@ -2,13 +2,8 @@ import json
|
|||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import (
|
from docling.datamodel.document import ConversionResult
|
||||||
ConversionResult,
|
|
||||||
InputDocument,
|
|
||||||
SectionHeaderItem,
|
|
||||||
)
|
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
GENERATE = False
|
GENERATE = False
|
||||||
|
@ -3,23 +3,16 @@
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import unittest
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from tempfile import NamedTemporaryFile
|
from tempfile import NamedTemporaryFile
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import yaml
|
|
||||||
from docling_core.types import DoclingDocument
|
from docling_core.types import DoclingDocument
|
||||||
from docling_core.types.doc import DocItemLabel, TableData, TextItem
|
from docling_core.types.doc import DocItemLabel, TableData, TextItem
|
||||||
|
|
||||||
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable
|
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import (
|
from docling.datamodel.document import InputDocument
|
||||||
ConversionResult,
|
|
||||||
InputDocument,
|
|
||||||
SectionHeaderItem,
|
|
||||||
)
|
|
||||||
from docling.document_converter import DocumentConverter
|
|
||||||
|
|
||||||
GENERATE: bool = True
|
GENERATE: bool = True
|
||||||
DATA_PATH: Path = Path("./tests/data/uspto/")
|
DATA_PATH: Path = Path("./tests/data/uspto/")
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
import json
|
import json
|
||||||
import logging
|
|
||||||
import os
|
import os
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
Loading…
Reference in New Issue
Block a user