docs: description of supported formats and backends (#788)

* chore: remove type-ignore marks for attaching text to non GroupItems After commit b74208 of docling-core, text items can be attached to any NodeItem and therefore the ignore[arg-type] type marks can be removed. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * test: remove unnecessary imports Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * docs: add documentation on supported formats and backends Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * docs: add notebook example with XML backends Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-01-26 08:10:33 +01:00
parent 3be2fb581f
commit c2ae1cc4ca
7 changed files with 1147 additions and 41 deletions
@@ -389,7 +389,7 @@ class PatentUsptoIce(PatentUspto):
            if name == self.Element.TITLE.value:
                if text:
                    self.parents[self.level + 1] = self.doc.add_title(
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                        text=text,
                    )
                    self.level += 1
@@ -406,7 +406,7 @@ class PatentUsptoIce(PatentUspto):
                    abstract_item = self.doc.add_heading(
                        heading_text,
                        level=heading_level,
-                        parent=self.parents[heading_level],  # type: ignore[arg-type]
+                        parent=self.parents[heading_level],
                    )
                    self.doc.add_text(
                        label=DocItemLabel.PARAGRAPH,
@@ -434,7 +434,7 @@ class PatentUsptoIce(PatentUspto):
                claims_item = self.doc.add_heading(
                    heading_text,
                    level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                )
                for text in self.claims:
                    self.doc.add_text(
@@ -452,7 +452,7 @@ class PatentUsptoIce(PatentUspto):
                    self.doc.add_text(
                        label=DocItemLabel.PARAGRAPH,
                        text=text,
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                    )
                self.text = ""

@@ -460,7 +460,7 @@ class PatentUsptoIce(PatentUspto):
                self.parents[self.level + 1] = self.doc.add_heading(
                    text=text,
                    level=self.level,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                )
                self.level += 1
                self.text = ""
@@ -470,7 +470,7 @@ class PatentUsptoIce(PatentUspto):
                empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
                self.doc.add_table(
                    data=empty_table,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                )

        def _apply_style(self, text: str, style_tag: str) -> str:
@@ -721,7 +721,7 @@ class PatentUsptoGrantV2(PatentUspto):
                if self.Element.TITLE.value in self.property and text.strip():
                    title = text.strip()
                    self.parents[self.level + 1] = self.doc.add_title(
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                        text=title,
                    )
                    self.level += 1
@@ -749,7 +749,7 @@ class PatentUsptoGrantV2(PatentUspto):
                    self.parents[self.level + 1] = self.doc.add_heading(
                        text=text.strip(),
                        level=self.level,
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                    )
                    self.level += 1

@@ -769,7 +769,7 @@ class PatentUsptoGrantV2(PatentUspto):
                claims_item = self.doc.add_heading(
                    heading_text,
                    level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                )
                for text in self.claims:
                    self.doc.add_text(
@@ -787,7 +787,7 @@ class PatentUsptoGrantV2(PatentUspto):
                abstract_item = self.doc.add_heading(
                    heading_text,
                    level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                )
                self.doc.add_text(
                    label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
@@ -799,7 +799,7 @@ class PatentUsptoGrantV2(PatentUspto):
                    self.doc.add_text(
                        label=DocItemLabel.PARAGRAPH,
                        text=paragraph,
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                    )
                elif self.Element.CLAIM.value in self.property:
                    # we may need a space after a paragraph in claim text
@@ -811,7 +811,7 @@ class PatentUsptoGrantV2(PatentUspto):
                empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
                self.doc.add_table(
                    data=empty_table,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                )

        def _apply_style(self, text: str, style_tag: str) -> str:
@@ -938,7 +938,7 @@ class PatentUsptoGrantAps(PatentUspto):
        self.parents[self.level + 1] = self.doc.add_heading(
            heading.value,
            level=self.level,
-            parent=self.parents[self.level],  # type: ignore[arg-type]
+            parent=self.parents[self.level],
        )
        self.level += 1

@@ -959,7 +959,7 @@ class PatentUsptoGrantAps(PatentUspto):

        if field == self.Field.TITLE.value:
            self.parents[self.level + 1] = self.doc.add_title(
-                parent=self.parents[self.level], text=value  # type: ignore[arg-type]
+                parent=self.parents[self.level], text=value
            )
            self.level += 1

@@ -971,14 +971,14 @@ class PatentUsptoGrantAps(PatentUspto):
                self.doc.add_text(
                    label=DocItemLabel.PARAGRAPH,
                    text=value,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                )

        elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
            self.doc.add_text(
                label=DocItemLabel.PARAGRAPH,
                text="",
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
            )

        elif (
@@ -996,7 +996,7 @@ class PatentUsptoGrantAps(PatentUspto):
                last_claim = self.doc.add_text(
                    label=DocItemLabel.PARAGRAPH,
                    text="",
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                )

            last_claim.text += f" {value}" if last_claim.text else value
@@ -1012,7 +1012,7 @@ class PatentUsptoGrantAps(PatentUspto):
            self.parents[self.level + 1] = self.doc.add_heading(
                value,
                level=self.level,
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
            )
            self.level += 1

@@ -1029,7 +1029,7 @@ class PatentUsptoGrantAps(PatentUspto):
            self.doc.add_text(
                label=DocItemLabel.PARAGRAPH,
                text=value,
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
            )

    def parse(self, patent_content: str) -> Optional[DoclingDocument]:
@@ -1283,7 +1283,7 @@ class PatentUsptoAppV1(PatentUspto):
                title = text.strip()
                if title:
                    self.parents[self.level + 1] = self.doc.add_text(
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                        label=DocItemLabel.TITLE,
                        text=title,
                    )
@@ -1301,7 +1301,7 @@ class PatentUsptoAppV1(PatentUspto):
                    abstract_item = self.doc.add_heading(
                        heading_text,
                        level=heading_level,
-                        parent=self.parents[heading_level],  # type: ignore[arg-type]
+                        parent=self.parents[heading_level],
                    )
                    self.doc.add_text(
                        label=DocItemLabel.PARAGRAPH,
@@ -1331,7 +1331,7 @@ class PatentUsptoAppV1(PatentUspto):
                claims_item = self.doc.add_heading(
                    heading_text,
                    level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                )
                for text in self.claims:
                    self.doc.add_text(
@@ -1350,14 +1350,14 @@ class PatentUsptoAppV1(PatentUspto):
                        self.parents[self.level + 1] = self.doc.add_heading(
                            text=text,
                            level=self.level,
-                            parent=self.parents[self.level],  # type: ignore[arg-type]
+                            parent=self.parents[self.level],
                        )
                        self.level += 1
                    else:
                        self.doc.add_text(
                            label=DocItemLabel.PARAGRAPH,
                            text=text,
-                            parent=self.parents[self.level],  # type: ignore[arg-type]
+                            parent=self.parents[self.level],
                        )
                self.text = ""

@@ -1366,7 +1366,7 @@ class PatentUsptoAppV1(PatentUspto):
                empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
                self.doc.add_table(
                    data=empty_table,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                )

        def _apply_style(self, text: str, style_tag: str) -> str:
@@ -24,6 +24,20 @@ docling https://arxiv.org/pdf/2206.01062

 To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md).

+### Supported formats
+
+The document conversion in Docling supports several popular formats, including:
+
+- **PDF** (Portable Document Format): the format developed by Adobe to present documents consistently across application software, hardware, and operating systems.
+- **.docx**, **.xlsx**, **.pptx** (Word, Excel, and PowerPoint): the Open XML formats supported by Microsoft Office.
+- **Markdown**: a lightweight markup language to add formatting elements to plain text documents.
+- **AsciiDoc**: a plain text markup language for writing technical content.
+- **HTML** (Hypertext Markup Language): the standard markup language for creating web pages.
+- **XHTML** (Extensible Hypertext Markup Language): the XML-based version of HTML.
+- **XML** (Extensible Markup Language): a markup format for storing and transmitting data. Due to its flexibility, Docling requires custom implementations to identify the
+semantics of the data. Currently, Docling supports the parsing of [USPTO](https://www.uspto.gov/patents) patents and [PubMed Central® (PMC)](https://pmc.ncbi.nlm.nih.gov/) articles.
+
+
 ### Advanced options

 #### Adjust pipeline features
@@ -126,6 +140,32 @@ result = converter.convert(source)
 You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.


+#### Use specific backend converters
+
+By default, Docling will try to identify the document format to apply the appropriate conversion backend (see the list of [supported formats](#supported-formats)).
+You can restrict the `DocumentConverter` to a set of allowed document formats, as shown in the [Multi-format conversion](./examples/run_with_formats.py) example.
+Alternatively, you can also use the specific backend that matches your document content. For instance, you can use `HTMLDocumentBackend` for HTML pages:
+
+```python
+import urllib.request
+from io import BytesIO
+from docling.backend.html_backend import HTMLDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+url = "https://en.wikipedia.org/wiki/Duck"
+text = urllib.request.urlopen(url).read()
+in_doc = InputDocument(
+    path_or_stream=BytesIO(text),
+    format=InputFormat.HTML,
+    backend=HTMLDocumentBackend,
+    filename="duck.html",
+)
+backend = HTMLDocumentBackend(in_doc=in_doc, path_or_stream=BytesIO(text))
+result = backend.convert()
+print(result.export_to_markdown())
+```
+
 ## Chunking

 You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a
@@ -77,7 +77,8 @@ nav:
      - "Force full page OCR": examples/full_page_ocr.py
      - "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
      - "Accelerator options": examples/run_with_accelerator.py
-      - "Simple translation": examples/translate.py
+      - "Simple translation": examples/translate.py
+      - examples/backend_xml_rag.ipynb
    - ✂️ Chunking:
      - examples/hybrid_chunking.ipynb
    - 🤖 RAG with AI dev frameworks:
@@ -2,13 +2,8 @@ import json
 import os
 from pathlib import Path

-from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import (
-    ConversionResult,
-    InputDocument,
-    SectionHeaderItem,
-)
+from docling.datamodel.document import ConversionResult
 from docling.document_converter import DocumentConverter

 GENERATE = False
@@ -3,23 +3,16 @@
 import json
 import logging
 import os
-import unittest
 from pathlib import Path
 from tempfile import NamedTemporaryFile

 import pytest
-import yaml
 from docling_core.types import DoclingDocument
 from docling_core.types.doc import DocItemLabel, TableData, TextItem

 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import (
-    ConversionResult,
-    InputDocument,
-    SectionHeaderItem,
-)
-from docling.document_converter import DocumentConverter
+from docling.datamodel.document import InputDocument

 GENERATE: bool = True
 DATA_PATH: Path = Path("./tests/data/uspto/")
@@ -1,5 +1,4 @@
 import json
-import logging
 import os
 from io import BytesIO
 from pathlib import Path