feat: Support tableformer model choice (#90)

* Support tableformer model choice
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update datamodel structure
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update docs
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Cleanup
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add test unit for table options
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Ensure import backwards-compatibility for PipelineOptions
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update README
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Adjust parameters on custom_convert
  Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
* Update Dockerfile
  Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
---------
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
2024-09-26 21:37:08 +02:00 · 2024-09-26 21:37:08 +02:00 · d6df76f90b
commit d6df76f90b
parent 39977b5631
16 changed files with 711 additions and 592 deletions
--- a/2
+++ b/2
@ -3,7 +3,7 @@ FROM python:3.11-slim-bookworm
 ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"

 RUN apt-get update \
-    && apt-get install -y libgl1 libglib2.0-0 curl wget git \
+    && apt-get install -y libgl1 libglib2.0-0 curl wget git procps \
    && apt-get clean

 # This will install torch with *only* cpu support
--- a/README.md
+++ b/README.md
@ -159,6 +159,8 @@ This can improve output quality if you find that multiple columns in extracted t


 ```python
+from docling.datamodel.pipeline_options import PipelineOptions
+
 pipeline_options = PipelineOptions(do_table_structure=True)
 pipeline_options.table_structure_options.do_cell_matching = False  # uses text cells predicted from table structure model

@ -168,6 +170,20 @@ doc_converter = DocumentConverter(
 )
 ```

+Since docling 1.16.0, you can choose which TableFormer mode to use: `TableFormerMode.FAST` (default) or `TableFormerMode.ACCURATE` (slower, but gives better results on difficult table structures).
+
+```python
+from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode
+
+pipeline_options = PipelineOptions(do_table_structure=True)
+pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE  # use more accurate TableFormer model
+
+doc_converter = DocumentConverter(
+    artifacts_path=artifacts_path,
+    pipeline_options=pipeline_options,
+)
+```
+
 ### Impose limits on the document size

 You can limit the file size and number of pages which should be allowed to process per document:
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -12,8 +12,9 @@ from docling_core.utils.file import resolve_file_source

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.document_converter import DocumentConverter

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -9,6 +9,10 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator
 from typing_extensions import Self

 from docling.backend.abstract_backend import PdfPageBackend
+from docling.datamodel.pipeline_options import (  # Must be imported here for backward compatibility.
+    PipelineOptions,
+    TableStructureOptions,
+)


 class ConversionStatus(str, Enum):
@ -298,22 +302,6 @@ class DocumentStream(BaseModel):
    stream: BytesIO


-class TableStructureOptions(BaseModel):
-    do_cell_matching: bool = (
-        True
-        # True:  Matches predictions back to PDF cells. Can break table output if PDF cells
-        #        are merged across table columns.
-        # False: Let table structure model define the text cells, ignore PDF cells.
-    )
-
-
-class PipelineOptions(BaseModel):
-    do_table_structure: bool = True  # True: perform table structure extraction
-    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
-
-    table_structure_options: TableStructureOptions = TableStructureOptions()
-
-
 class AssembleOptions(BaseModel):
    keep_page_images: Annotated[
        bool,
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -4,13 +4,13 @@ from pathlib import Path, PurePath
 from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union

 from docling_core.types import BaseCell, BaseText
-from docling_core.types import BoundingBox as DsBoundingBox
 from docling_core.types import Document as DsDocument
 from docling_core.types import DocumentDescription as DsDocumentDescription
 from docling_core.types import FileInfoObject as DsFileInfoObject
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
 from docling_core.types import TableCell
+from docling_core.types.doc.base import BoundingBox as DsBoundingBox
 from docling_core.types.doc.base import Figure
 from pydantic import BaseModel
 from typing_extensions import deprecated
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -0,0 +1,25 @@
+from enum import Enum, auto
+
+from pydantic import BaseModel
+
+
+class TableFormerMode(str, Enum):  # TableFormer variant selectable via TableStructureOptions.mode
+    FAST = "fast"  # explicit value: auto() on a str-mixin enum would yield the opaque string "1"
+    ACCURATE = "accurate"  # explicit value instead of "2"; stays stable if members are reordered
+
+
+class TableStructureOptions(BaseModel):
+    # Settings for the table structure model.
+    # do_cell_matching:
+    #   True:  Matches predictions back to PDF cells. Can break table output if PDF cells
+    #          are merged across table columns.
+    #   False: Let table structure model define the text cells, ignore PDF cells.
+    do_cell_matching: bool = True
+    mode: TableFormerMode = TableFormerMode.FAST  # TableFormer variant to use; FAST by default
+
+
+class PipelineOptions(BaseModel):
+    # Switches for the conversion pipeline; both stages are enabled by default.
+    do_table_structure: bool = True  # when True, table structure extraction is performed
+    do_ocr: bool = True  # when True, OCR is performed and replaces programmatic PDF text
+    table_structure_options: TableStructureOptions = TableStructureOptions()  # nested table settings
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@ -18,13 +18,13 @@ from docling.datamodel.base_models import (
    DoclingComponentType,
    ErrorItem,
    Page,
-    PipelineOptions,
 )
 from docling.datamodel.document import (
    ConversionResult,
    DocumentConversionInput,
    InputDocument,
 )
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import settings
 from docling.models.ds_glm_model import GlmModel
 from docling.models.page_assemble_model import PageAssembleModel
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@ -1,4 +1,5 @@
 import copy
+from pathlib import Path
 from typing import Iterable, List

 import numpy
@ -12,16 +13,22 @@ from docling.datamodel.base_models import (
    TableElement,
    TableStructurePrediction,
 )
+from docling.datamodel.pipeline_options import TableFormerMode


 class TableStructureModel:
    def __init__(self, config):
        self.config = config
        self.do_cell_matching = config["do_cell_matching"]
+        self.mode = config["mode"]

        self.enabled = config["enabled"]
        if self.enabled:
-            artifacts_path = config["artifacts_path"]
+            artifacts_path: Path = config["artifacts_path"]
+
+            if self.mode == TableFormerMode.ACCURATE:
+                artifacts_path = artifacts_path / "fat"
+
            # Third Party
            import docling_ibm_models.tableformer.common as c

--- a/docling/pipeline/base_model_pipeline.py
+++ b/docling/pipeline/base_model_pipeline.py
@ -1,7 +1,8 @@
 from pathlib import Path
 from typing import Callable, Iterable, List

-from docling.datamodel.base_models import Page, PipelineOptions
+from docling.datamodel.base_models import Page
+from docling.datamodel.pipeline_options import PipelineOptions


 class BaseModelPipeline:
--- a/docling/pipeline/standard_model_pipeline.py
+++ b/docling/pipeline/standard_model_pipeline.py
@ -1,6 +1,6 @@
 from pathlib import Path

-from docling.datamodel.base_models import PipelineOptions
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
 from docling.models.table_structure_model import TableStructureModel
@ -32,6 +32,7 @@ class StandardModelPipeline(BaseModelPipeline):
                    "artifacts_path": artifacts_path
                    / StandardModelPipeline._table_model_path,
                    "enabled": pipeline_options.do_table_structure,
+                    "mode": pipeline_options.table_structure_options.mode,
                    "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
                }
            ),
--- a/examples/batch_convert.py
+++ b/examples/batch_convert.py
@ -4,7 +4,7 @@ import time
 from pathlib import Path
 from typing import Iterable

-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
 from docling.document_converter import DocumentConverter

--- a/examples/custom_convert.py
+++ b/examples/custom_convert.py
@ -82,7 +82,7 @@ def main():
    # PyPdfium with OCR
    # -----------------
    # pipeline_options = PipelineOptions()
-    # pipeline_options.do_ocr=False
+    # pipeline_options.do_ocr=True
    # pipeline_options.do_table_structure=True
    # pipeline_options.table_structure_options.do_cell_matching = True

--- a/poetry.lock
+++ b/poetry.lock
--- a/tests/test_e2e_conversion.py
+++ b/tests/test_e2e_conversion.py
@ -1,9 +1,8 @@
 from pathlib import Path

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import PipelineOptions
 from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.document_converter import DocumentConverter

 from .verify_utils import verify_conversion_result
--- a/tests/test_interfaces.py
+++ b/tests/test_interfaces.py
@ -5,8 +5,9 @@ import pytest

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import DocumentStream, PipelineOptions
+from docling.datamodel.base_models import DocumentStream
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.document_converter import DocumentConverter

 from .verify_utils import verify_conversion_result
--- a/tests/test_options.py
+++ b/tests/test_options.py
@ -0,0 +1,42 @@
+from pathlib import Path
+
+import pytest
+
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.datamodel.base_models import ConversionStatus
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode
+from docling.document_converter import DocumentConverter
+
+from .verify_utils import verify_conversion_result
+
+
+@pytest.fixture
+def test_doc_path():  # sample PDF handed to the tests in this module
+    return Path("./tests/data") / "2206.01062.pdf"
+
+
+def get_converters_with_table_options():
+    # Yield one DocumentConverter per (cell matching, TableFormer mode) combination.
+    for match_cells in (True, False):
+        for tf_mode in (TableFormerMode.FAST, TableFormerMode.ACCURATE):
+            # OCR is switched off and table structure extraction is switched on.
+            options = PipelineOptions()
+            options.do_ocr, options.do_table_structure = False, True
+
+            table_options = options.table_structure_options
+            table_options.do_cell_matching = match_cells
+            table_options.mode = tf_mode
+
+            yield DocumentConverter(
+                pipeline_options=options, pdf_backend=DoclingParseDocumentBackend
+            )
+
+
+def test_e2e_conversions(test_doc_path):
+    # Every table-option combination must convert the sample document successfully.
+    for doc_converter in get_converters_with_table_options():
+        print(f"converting {test_doc_path}")
+
+        result: ConversionResult = doc_converter.convert_single(test_doc_path)
+        assert result.status == ConversionStatus.SUCCESS