docs: description of supported formats and backends (#788)

* chore: remove type-ignore marks for attaching text to non GroupItems

After commit b74208 of docling-core, text items can be attached to any NodeItem
and therefore the ignore[arg-type] type marks can be removed.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* test: remove unnecessary imports

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* docs: add documentation on supported formats and backends

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* docs: add notebook example with XML backends

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-01-26 08:10:33 +01:00 committed by GitHub
parent 3be2fb581f
commit c2ae1cc4ca
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 1147 additions and 41 deletions

View File

@ -389,7 +389,7 @@ class PatentUsptoIce(PatentUspto):
if name == self.Element.TITLE.value: if name == self.Element.TITLE.value:
if text: if text:
self.parents[self.level + 1] = self.doc.add_title( self.parents[self.level + 1] = self.doc.add_title(
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
text=text, text=text,
) )
self.level += 1 self.level += 1
@ -406,7 +406,7 @@ class PatentUsptoIce(PatentUspto):
abstract_item = self.doc.add_heading( abstract_item = self.doc.add_heading(
heading_text, heading_text,
level=heading_level, level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type] parent=self.parents[heading_level],
) )
self.doc.add_text( self.doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.PARAGRAPH,
@ -434,7 +434,7 @@ class PatentUsptoIce(PatentUspto):
claims_item = self.doc.add_heading( claims_item = self.doc.add_heading(
heading_text, heading_text,
level=heading_level, level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type] parent=self.parents[heading_level],
) )
for text in self.claims: for text in self.claims:
self.doc.add_text( self.doc.add_text(
@ -452,7 +452,7 @@ class PatentUsptoIce(PatentUspto):
self.doc.add_text( self.doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.PARAGRAPH,
text=text, text=text,
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
) )
self.text = "" self.text = ""
@ -460,7 +460,7 @@ class PatentUsptoIce(PatentUspto):
self.parents[self.level + 1] = self.doc.add_heading( self.parents[self.level + 1] = self.doc.add_heading(
text=text, text=text,
level=self.level, level=self.level,
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
) )
self.level += 1 self.level += 1
self.text = "" self.text = ""
@ -470,7 +470,7 @@ class PatentUsptoIce(PatentUspto):
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[]) empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
self.doc.add_table( self.doc.add_table(
data=empty_table, data=empty_table,
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
) )
def _apply_style(self, text: str, style_tag: str) -> str: def _apply_style(self, text: str, style_tag: str) -> str:
@ -721,7 +721,7 @@ class PatentUsptoGrantV2(PatentUspto):
if self.Element.TITLE.value in self.property and text.strip(): if self.Element.TITLE.value in self.property and text.strip():
title = text.strip() title = text.strip()
self.parents[self.level + 1] = self.doc.add_title( self.parents[self.level + 1] = self.doc.add_title(
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
text=title, text=title,
) )
self.level += 1 self.level += 1
@ -749,7 +749,7 @@ class PatentUsptoGrantV2(PatentUspto):
self.parents[self.level + 1] = self.doc.add_heading( self.parents[self.level + 1] = self.doc.add_heading(
text=text.strip(), text=text.strip(),
level=self.level, level=self.level,
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
) )
self.level += 1 self.level += 1
@ -769,7 +769,7 @@ class PatentUsptoGrantV2(PatentUspto):
claims_item = self.doc.add_heading( claims_item = self.doc.add_heading(
heading_text, heading_text,
level=heading_level, level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type] parent=self.parents[heading_level],
) )
for text in self.claims: for text in self.claims:
self.doc.add_text( self.doc.add_text(
@ -787,7 +787,7 @@ class PatentUsptoGrantV2(PatentUspto):
abstract_item = self.doc.add_heading( abstract_item = self.doc.add_heading(
heading_text, heading_text,
level=heading_level, level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type] parent=self.parents[heading_level],
) )
self.doc.add_text( self.doc.add_text(
label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
@ -799,7 +799,7 @@ class PatentUsptoGrantV2(PatentUspto):
self.doc.add_text( self.doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.PARAGRAPH,
text=paragraph, text=paragraph,
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
) )
elif self.Element.CLAIM.value in self.property: elif self.Element.CLAIM.value in self.property:
# we may need a space after a paragraph in claim text # we may need a space after a paragraph in claim text
@ -811,7 +811,7 @@ class PatentUsptoGrantV2(PatentUspto):
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[]) empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
self.doc.add_table( self.doc.add_table(
data=empty_table, data=empty_table,
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
) )
def _apply_style(self, text: str, style_tag: str) -> str: def _apply_style(self, text: str, style_tag: str) -> str:
@ -938,7 +938,7 @@ class PatentUsptoGrantAps(PatentUspto):
self.parents[self.level + 1] = self.doc.add_heading( self.parents[self.level + 1] = self.doc.add_heading(
heading.value, heading.value,
level=self.level, level=self.level,
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
) )
self.level += 1 self.level += 1
@ -959,7 +959,7 @@ class PatentUsptoGrantAps(PatentUspto):
if field == self.Field.TITLE.value: if field == self.Field.TITLE.value:
self.parents[self.level + 1] = self.doc.add_title( self.parents[self.level + 1] = self.doc.add_title(
parent=self.parents[self.level], text=value # type: ignore[arg-type] parent=self.parents[self.level], text=value
) )
self.level += 1 self.level += 1
@ -971,14 +971,14 @@ class PatentUsptoGrantAps(PatentUspto):
self.doc.add_text( self.doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.PARAGRAPH,
text=value, text=value,
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
) )
elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value: elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
self.doc.add_text( self.doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.PARAGRAPH,
text="", text="",
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
) )
elif ( elif (
@ -996,7 +996,7 @@ class PatentUsptoGrantAps(PatentUspto):
last_claim = self.doc.add_text( last_claim = self.doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.PARAGRAPH,
text="", text="",
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
) )
last_claim.text += f" {value}" if last_claim.text else value last_claim.text += f" {value}" if last_claim.text else value
@ -1012,7 +1012,7 @@ class PatentUsptoGrantAps(PatentUspto):
self.parents[self.level + 1] = self.doc.add_heading( self.parents[self.level + 1] = self.doc.add_heading(
value, value,
level=self.level, level=self.level,
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
) )
self.level += 1 self.level += 1
@ -1029,7 +1029,7 @@ class PatentUsptoGrantAps(PatentUspto):
self.doc.add_text( self.doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.PARAGRAPH,
text=value, text=value,
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
) )
def parse(self, patent_content: str) -> Optional[DoclingDocument]: def parse(self, patent_content: str) -> Optional[DoclingDocument]:
@ -1283,7 +1283,7 @@ class PatentUsptoAppV1(PatentUspto):
title = text.strip() title = text.strip()
if title: if title:
self.parents[self.level + 1] = self.doc.add_text( self.parents[self.level + 1] = self.doc.add_text(
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
label=DocItemLabel.TITLE, label=DocItemLabel.TITLE,
text=title, text=title,
) )
@ -1301,7 +1301,7 @@ class PatentUsptoAppV1(PatentUspto):
abstract_item = self.doc.add_heading( abstract_item = self.doc.add_heading(
heading_text, heading_text,
level=heading_level, level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type] parent=self.parents[heading_level],
) )
self.doc.add_text( self.doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.PARAGRAPH,
@ -1331,7 +1331,7 @@ class PatentUsptoAppV1(PatentUspto):
claims_item = self.doc.add_heading( claims_item = self.doc.add_heading(
heading_text, heading_text,
level=heading_level, level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type] parent=self.parents[heading_level],
) )
for text in self.claims: for text in self.claims:
self.doc.add_text( self.doc.add_text(
@ -1350,14 +1350,14 @@ class PatentUsptoAppV1(PatentUspto):
self.parents[self.level + 1] = self.doc.add_heading( self.parents[self.level + 1] = self.doc.add_heading(
text=text, text=text,
level=self.level, level=self.level,
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
) )
self.level += 1 self.level += 1
else: else:
self.doc.add_text( self.doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.PARAGRAPH,
text=text, text=text,
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
) )
self.text = "" self.text = ""
@ -1366,7 +1366,7 @@ class PatentUsptoAppV1(PatentUspto):
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[]) empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
self.doc.add_table( self.doc.add_table(
data=empty_table, data=empty_table,
parent=self.parents[self.level], # type: ignore[arg-type] parent=self.parents[self.level],
) )
def _apply_style(self, text: str, style_tag: str) -> str: def _apply_style(self, text: str, style_tag: str) -> str:

File diff suppressed because it is too large Load Diff

View File

@ -24,6 +24,20 @@ docling https://arxiv.org/pdf/2206.01062
To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md). To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md).
### Supported formats
The document conversion in Docling supports several popular formats, including:
- **PDF** (Portable Document Format): the format developed by Adobe to present documents consistently across application software, hardware, and operating systems.
- **.docx**, **.xlsx**, **.pptx** (Word, Excel, and PowerPoint): the Open XML formats supported by Microsoft Office.
- **Markdown**: a lightweight markup language to add formatting elements to plain text documents.
- **AsciiDoc**: a plain text markup language for writing technical content.
- **HTML** (Hypertext Markup Language): the standard markup language for creating web pages.
- **XHTML** (Extensible Hypertext Markup Language): the XML-based version of HTML.
- **XML** (Extensible Markup Language): a markup format for storing and transmitting data. Due to its flexibility, Docling requires custom implementations to identify the
semantics of the data. Currently, Docling supports the parsing of [USPTO](https://www.uspto.gov/patents) patents and [PubMed Central® (PMC)](https://pmc.ncbi.nlm.nih.gov/) articles.
### Advanced options ### Advanced options
#### Adjust pipeline features #### Adjust pipeline features
@ -126,6 +140,32 @@ result = converter.convert(source)
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads. You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
#### Use specific backend converters
By default, Docling will try to identify the document format to apply the appropriate conversion backend (see the list of [supported formats](#supported-formats)).
You can restrict the `DocumentConverter` to a set of allowed document formats, as shown in the [Multi-format conversion](./examples/run_with_formats.py) example.
Alternatively, you can also use the specific backend that matches your document content. For instance, you can use `HTMLDocumentBackend` for HTML pages:
```python
import urllib.request
from io import BytesIO
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
url = "https://en.wikipedia.org/wiki/Duck"
text = urllib.request.urlopen(url).read()
in_doc = InputDocument(
path_or_stream=BytesIO(text),
format=InputFormat.HTML,
backend=HTMLDocumentBackend,
filename="duck.html",
)
backend = HTMLDocumentBackend(in_doc=in_doc, path_or_stream=BytesIO(text))
result = backend.convert()
print(result.export_to_markdown())
```
## Chunking ## Chunking
You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a

View File

@ -77,7 +77,8 @@ nav:
- "Force full page OCR": examples/full_page_ocr.py - "Force full page OCR": examples/full_page_ocr.py
- "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py - "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
- "Accelerator options": examples/run_with_accelerator.py - "Accelerator options": examples/run_with_accelerator.py
- "Simple translation": examples/translate.py - "Simple translation": examples/translate.py
- examples/backend_xml_rag.ipynb
- ✂️ Chunking: - ✂️ Chunking:
- examples/hybrid_chunking.ipynb - examples/hybrid_chunking.ipynb
- 🤖 RAG with AI dev frameworks: - 🤖 RAG with AI dev frameworks:

View File

@ -2,13 +2,8 @@ import json
import os import os
from pathlib import Path from pathlib import Path
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ( from docling.datamodel.document import ConversionResult
ConversionResult,
InputDocument,
SectionHeaderItem,
)
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
GENERATE = False GENERATE = False

View File

@ -3,23 +3,16 @@
import json import json
import logging import logging
import os import os
import unittest
from pathlib import Path from pathlib import Path
from tempfile import NamedTemporaryFile from tempfile import NamedTemporaryFile
import pytest import pytest
import yaml
from docling_core.types import DoclingDocument from docling_core.types import DoclingDocument
from docling_core.types.doc import DocItemLabel, TableData, TextItem from docling_core.types.doc import DocItemLabel, TableData, TextItem
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ( from docling.datamodel.document import InputDocument
ConversionResult,
InputDocument,
SectionHeaderItem,
)
from docling.document_converter import DocumentConverter
GENERATE: bool = True GENERATE: bool = True
DATA_PATH: Path = Path("./tests/data/uspto/") DATA_PATH: Path = Path("./tests/data/uspto/")

View File

@ -1,5 +1,4 @@
import json import json
import logging
import os import os
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path