feat: Support tableformer model choice (#90)

* Support tableformer model choice

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update datamodel structure

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update docs

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Cleanup

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add test unit for table options

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Ensure import backwards-compatibility for PipelineOptions

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update README

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Adjust parameters on custom_convert

Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>

* Update Dockerfile

Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
This commit is contained in:
Christoph Auer 2024-09-26 21:37:08 +02:00 committed by GitHub
parent 39977b5631
commit d6df76f90b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 711 additions and 592 deletions

View File

@ -3,7 +3,7 @@ FROM python:3.11-slim-bookworm
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"
RUN apt-get update \
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
&& apt-get install -y libgl1 libglib2.0-0 curl wget git procps \
&& apt-get clean
# This will install torch with *only* cpu support

View File

@ -159,6 +159,8 @@ This can improve output quality if you find that multiple columns in extracted t
```python
from docling.datamodel.pipeline_options import PipelineOptions
pipeline_options = PipelineOptions(do_table_structure=True)
pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
@ -168,6 +170,20 @@ doc_converter = DocumentConverter(
)
```
Since docling 1.16.0: You can control which TableFormer mode you want to use. Choose between `TableFormerMode.FAST` (default) and `TableFormerMode.ACCURATE` (better, but slower) to receive better quality with difficult table structures.
```python
from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode
pipeline_options = PipelineOptions(do_table_structure=True)
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model
doc_converter = DocumentConverter(
artifacts_path=artifacts_path,
pipeline_options=pipeline_options,
)
```
### Impose limits on the document size
You can limit the file size and number of pages which should be allowed to process per document:

View File

@ -12,8 +12,9 @@ from docling_core.utils.file import resolve_file_source
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import PipelineOptions
from docling.document_converter import DocumentConverter
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")

View File

@ -9,6 +9,10 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self
from docling.backend.abstract_backend import PdfPageBackend
from docling.datamodel.pipeline_options import ( # Must be imported here for backward compatibility.
PipelineOptions,
TableStructureOptions,
)
class ConversionStatus(str, Enum):
@ -298,22 +302,6 @@ class DocumentStream(BaseModel):
stream: BytesIO
class TableStructureOptions(BaseModel):
do_cell_matching: bool = (
True
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
# are merged across table columns.
# False: Let table structure model define the text cells, ignore PDF cells.
)
class PipelineOptions(BaseModel):
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
table_structure_options: TableStructureOptions = TableStructureOptions()
class AssembleOptions(BaseModel):
keep_page_images: Annotated[
bool,

View File

@ -4,13 +4,13 @@ from pathlib import Path, PurePath
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
from docling_core.types import BaseCell, BaseText
from docling_core.types import BoundingBox as DsBoundingBox
from docling_core.types import Document as DsDocument
from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types import TableCell
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
from docling_core.types.doc.base import Figure
from pydantic import BaseModel
from typing_extensions import deprecated

View File

@ -0,0 +1,25 @@
from enum import Enum, auto
from pydantic import BaseModel
class TableFormerMode(str, Enum):
FAST = auto()
ACCURATE = auto()
class TableStructureOptions(BaseModel):
do_cell_matching: bool = (
True
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
# are merged across table columns.
# False: Let table structure model define the text cells, ignore PDF cells.
)
mode: TableFormerMode = TableFormerMode.FAST
class PipelineOptions(BaseModel):
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
table_structure_options: TableStructureOptions = TableStructureOptions()

View File

@ -18,13 +18,13 @@ from docling.datamodel.base_models import (
DoclingComponentType,
ErrorItem,
Page,
PipelineOptions,
)
from docling.datamodel.document import (
ConversionResult,
DocumentConversionInput,
InputDocument,
)
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings
from docling.models.ds_glm_model import GlmModel
from docling.models.page_assemble_model import PageAssembleModel

View File

@ -1,4 +1,5 @@
import copy
from pathlib import Path
from typing import Iterable, List
import numpy
@ -12,16 +13,22 @@ from docling.datamodel.base_models import (
TableElement,
TableStructurePrediction,
)
from docling.datamodel.pipeline_options import TableFormerMode
class TableStructureModel:
def __init__(self, config):
self.config = config
self.do_cell_matching = config["do_cell_matching"]
self.mode = config["mode"]
self.enabled = config["enabled"]
if self.enabled:
artifacts_path = config["artifacts_path"]
artifacts_path: Path = config["artifacts_path"]
if self.mode == TableFormerMode.ACCURATE:
artifacts_path = artifacts_path / "fat"
# Third Party
import docling_ibm_models.tableformer.common as c

View File

@ -1,7 +1,8 @@
from pathlib import Path
from typing import Callable, Iterable, List
from docling.datamodel.base_models import Page, PipelineOptions
from docling.datamodel.base_models import Page
from docling.datamodel.pipeline_options import PipelineOptions
class BaseModelPipeline:

View File

@ -1,6 +1,6 @@
from pathlib import Path
from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.pipeline_options import PipelineOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel
@ -32,6 +32,7 @@ class StandardModelPipeline(BaseModelPipeline):
"artifacts_path": artifacts_path
/ StandardModelPipeline._table_model_path,
"enabled": pipeline_options.do_table_structure,
"mode": pipeline_options.table_structure_options.mode,
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
}
),

View File

@ -4,7 +4,7 @@ import time
from pathlib import Path
from typing import Iterable
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter

View File

@ -82,7 +82,7 @@ def main():
# PyPdfium with OCR
# -----------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=False
# pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = True

1166
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,9 +1,8 @@
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PipelineOptions
from docling.document_converter import DocumentConverter
from .verify_utils import verify_conversion_result

View File

@ -5,8 +5,9 @@ import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, PipelineOptions
from docling.datamodel.base_models import DocumentStream
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import PipelineOptions
from docling.document_converter import DocumentConverter
from .verify_utils import verify_conversion_result

42
tests/test_options.py Normal file
View File

@ -0,0 +1,42 @@
from pathlib import Path
import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode
from docling.document_converter import DocumentConverter
from .verify_utils import verify_conversion_result
@pytest.fixture
def test_doc_path():
return Path("./tests/data/2206.01062.pdf")
def get_converters_with_table_options():
for cell_matching in [True, False]:
for mode in [TableFormerMode.FAST, TableFormerMode.ACCURATE]:
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = cell_matching
pipeline_options.table_structure_options.mode = mode
converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)
yield converter
def test_e2e_conversions(test_doc_path):
for converter in get_converters_with_table_options():
print(f"converting {test_doc_path}")
doc_result: ConversionResult = converter.convert_single(test_doc_path)
assert doc_result.status == ConversionStatus.SUCCESS