feat: Support tableformer model choice (#90)
* Support tableformer model choice Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update datamodel structure Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update docs Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Cleanup Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add test unit for table options Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Ensure import backwards-compatibility for PipelineOptions Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update README Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Adjust parameters on custom_convert Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> * Update Dockerfile Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
This commit is contained in:
parent
39977b5631
commit
d6df76f90b
@ -3,7 +3,7 @@ FROM python:3.11-slim-bookworm
|
||||
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
|
||||
&& apt-get install -y libgl1 libglib2.0-0 curl wget git procps \
|
||||
&& apt-get clean
|
||||
|
||||
# This will install torch with *only* cpu support
|
||||
|
16
README.md
16
README.md
@ -159,6 +159,8 @@ This can improve output quality if you find that multiple columns in extracted t
|
||||
|
||||
|
||||
```python
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
|
||||
pipeline_options = PipelineOptions(do_table_structure=True)
|
||||
pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
|
||||
|
||||
@ -168,6 +170,20 @@ doc_converter = DocumentConverter(
|
||||
)
|
||||
```
|
||||
|
||||
Since docling 1.16.0, you can choose which TableFormer mode to use: `TableFormerMode.FAST` (the default) or `TableFormerMode.ACCURATE` (slower, but gives better quality on difficult table structures).
|
||||
|
||||
```python
|
||||
from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode
|
||||
|
||||
pipeline_options = PipelineOptions(do_table_structure=True)
|
||||
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
artifacts_path=artifacts_path,
|
||||
pipeline_options=pipeline_options,
|
||||
)
|
||||
```
|
||||
|
||||
### Impose limits on the document size
|
||||
|
||||
You can limit the file size and the number of pages that are allowed to be processed per document:
|
||||
|
@ -12,8 +12,9 @@ from docling_core.utils.file import resolve_file_source
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||
|
@ -9,6 +9,10 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator
|
||||
from typing_extensions import Self
|
||||
|
||||
from docling.backend.abstract_backend import PdfPageBackend
|
||||
from docling.datamodel.pipeline_options import ( # Must be imported here for backward compatibility.
|
||||
PipelineOptions,
|
||||
TableStructureOptions,
|
||||
)
|
||||
|
||||
|
||||
class ConversionStatus(str, Enum):
|
||||
@ -298,22 +302,6 @@ class DocumentStream(BaseModel):
|
||||
stream: BytesIO
|
||||
|
||||
|
||||
class TableStructureOptions(BaseModel):
|
||||
do_cell_matching: bool = (
|
||||
True
|
||||
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
||||
# are merged across table columns.
|
||||
# False: Let table structure model define the text cells, ignore PDF cells.
|
||||
)
|
||||
|
||||
|
||||
class PipelineOptions(BaseModel):
|
||||
do_table_structure: bool = True # True: perform table structure extraction
|
||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||
|
||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||
|
||||
|
||||
class AssembleOptions(BaseModel):
|
||||
keep_page_images: Annotated[
|
||||
bool,
|
||||
|
@ -4,13 +4,13 @@ from pathlib import Path, PurePath
|
||||
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
|
||||
|
||||
from docling_core.types import BaseCell, BaseText
|
||||
from docling_core.types import BoundingBox as DsBoundingBox
|
||||
from docling_core.types import Document as DsDocument
|
||||
from docling_core.types import DocumentDescription as DsDocumentDescription
|
||||
from docling_core.types import FileInfoObject as DsFileInfoObject
|
||||
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
||||
from docling_core.types import Table as DsSchemaTable
|
||||
from docling_core.types import TableCell
|
||||
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
|
||||
from docling_core.types.doc.base import Figure
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
25
docling/datamodel/pipeline_options.py
Normal file
25
docling/datamodel/pipeline_options.py
Normal file
@ -0,0 +1,25 @@
|
||||
from enum import Enum, auto
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class TableFormerMode(str, Enum):
    """Selects which TableFormer model variant is used for table structure.

    ``FAST`` is the default choice; ``ACCURATE`` is the slower variant meant
    for difficult table structures.

    Members carry explicit, human-readable string values. The earlier
    ``auto()`` values, combined with the ``str`` mixin, evaluated to the
    opaque strings ``"1"`` and ``"2"``, which depended on declaration order.
    """

    FAST = "fast"
    ACCURATE = "accurate"

    @classmethod
    def _missing_(cls, value):
        """Resolve the legacy ``auto()``-generated values ``"1"`` and ``"2"``.

        Returns ``None`` for any other value so that ``Enum`` raises its
        usual ``ValueError``.
        """
        if value == "1":
            return cls.FAST
        if value == "2":
            return cls.ACCURATE
        return None
|
||||
|
||||
|
||||
# Options for the table structure model.
class TableStructureOptions(BaseModel):
    # do_cell_matching:
    #   True:  Matches predictions back to PDF cells. Can break table output
    #          if PDF cells are merged across table columns.
    #   False: Let table structure model define the text cells, ignore PDF cells.
    do_cell_matching: bool = True

    # TableFormer variant to use; FAST unless the caller overrides it.
    mode: TableFormerMode = TableFormerMode.FAST
|
||||
|
||||
|
||||
# Top-level switches for the conversion pipeline.
class PipelineOptions(BaseModel):
    # True: perform table structure extraction
    do_table_structure: bool = True

    # True: perform OCR, replace programmatic PDF text
    do_ocr: bool = True

    # Nested settings for the table structure model.
    table_structure_options: TableStructureOptions = TableStructureOptions()
|
@ -18,13 +18,13 @@ from docling.datamodel.base_models import (
|
||||
DoclingComponentType,
|
||||
ErrorItem,
|
||||
Page,
|
||||
PipelineOptions,
|
||||
)
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
DocumentConversionInput,
|
||||
InputDocument,
|
||||
)
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.ds_glm_model import GlmModel
|
||||
from docling.models.page_assemble_model import PageAssembleModel
|
||||
|
@ -1,4 +1,5 @@
|
||||
import copy
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List
|
||||
|
||||
import numpy
|
||||
@ -12,16 +13,22 @@ from docling.datamodel.base_models import (
|
||||
TableElement,
|
||||
TableStructurePrediction,
|
||||
)
|
||||
from docling.datamodel.pipeline_options import TableFormerMode
|
||||
|
||||
|
||||
class TableStructureModel:
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
self.do_cell_matching = config["do_cell_matching"]
|
||||
self.mode = config["mode"]
|
||||
|
||||
self.enabled = config["enabled"]
|
||||
if self.enabled:
|
||||
artifacts_path = config["artifacts_path"]
|
||||
artifacts_path: Path = config["artifacts_path"]
|
||||
|
||||
if self.mode == TableFormerMode.ACCURATE:
|
||||
artifacts_path = artifacts_path / "fat"
|
||||
|
||||
# Third Party
|
||||
import docling_ibm_models.tableformer.common as c
|
||||
|
||||
|
@ -1,7 +1,8 @@
|
||||
from pathlib import Path
|
||||
from typing import Callable, Iterable, List
|
||||
|
||||
from docling.datamodel.base_models import Page, PipelineOptions
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
|
||||
|
||||
class BaseModelPipeline:
|
||||
|
@ -1,6 +1,6 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.datamodel.base_models import PipelineOptions
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.models.table_structure_model import TableStructureModel
|
||||
@ -32,6 +32,7 @@ class StandardModelPipeline(BaseModelPipeline):
|
||||
"artifacts_path": artifacts_path
|
||||
/ StandardModelPipeline._table_model_path,
|
||||
"enabled": pipeline_options.do_table_structure,
|
||||
"mode": pipeline_options.table_structure_options.mode,
|
||||
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
|
||||
}
|
||||
),
|
||||
|
@ -4,7 +4,7 @@ import time
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
|
@ -82,7 +82,7 @@ def main():
|
||||
# PyPdfium with OCR
|
||||
# -----------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr=False
|
||||
# pipeline_options.do_ocr=True
|
||||
# pipeline_options.do_table_structure=True
|
||||
# pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
|
1166
poetry.lock
generated
1166
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -1,9 +1,8 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import PipelineOptions
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .verify_utils import verify_conversion_result
|
||||
|
@ -5,8 +5,9 @@ import pytest
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import DocumentStream, PipelineOptions
|
||||
from docling.datamodel.base_models import DocumentStream
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .verify_utils import verify_conversion_result
|
||||
|
42
tests/test_options.py
Normal file
42
tests/test_options.py
Normal file
@ -0,0 +1,42 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .verify_utils import verify_conversion_result
|
||||
|
||||
|
||||
@pytest.fixture
def test_doc_path():
    """Provide the path of the sample PDF used by the conversion test."""
    sample_pdf = Path("./tests/data/2206.01062.pdf")
    return sample_pdf
|
||||
|
||||
|
||||
def get_converters_with_table_options():
    """Yield one DocumentConverter per table-structure option combination.

    Every pairing of cell matching (True, False) with TableFormer mode
    (FAST, ACCURATE) is covered, in that nesting order. OCR is switched off
    and table structure extraction is switched on for all of them.
    """
    combinations = [
        (use_cell_matching, tableformer_mode)
        for use_cell_matching in (True, False)
        for tableformer_mode in (TableFormerMode.FAST, TableFormerMode.ACCURATE)
    ]

    for use_cell_matching, tableformer_mode in combinations:
        options = PipelineOptions()
        options.do_ocr = False
        options.do_table_structure = True

        table_options = options.table_structure_options
        table_options.do_cell_matching = use_cell_matching
        table_options.mode = tableformer_mode

        yield DocumentConverter(
            pipeline_options=options,
            pdf_backend=DoclingParseDocumentBackend,
        )
|
||||
|
||||
|
||||
def test_e2e_conversions(test_doc_path):
    """Convert the sample PDF with each converter variant and expect success."""
    converters = get_converters_with_table_options()
    for doc_converter in converters:
        print(f"converting {test_doc_path}")

        result: ConversionResult = doc_converter.convert_single(test_doc_path)
        assert result.status == ConversionStatus.SUCCESS
|
Loading…
Reference in New Issue
Block a user