fix: enrichment models batch size and expose picture classifier (#878)

* expose picture classifier in CLI

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* use different batch size in each model

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* remove batch size from CLI

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* cleanup imports

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-02-05 11:46:01 +01:00 committed by GitHub
parent 17448163e7
commit 5ad6de0560
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 13 additions and 1 deletion

View File

@@ -219,6 +219,13 @@ def convert(
bool, bool,
typer.Option(..., help="Enable the formula enrichment model in the pipeline."), typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
] = False, ] = False,
enrich_picture_classes: Annotated[
bool,
typer.Option(
...,
help="Enable the picture classification enrichment model in the pipeline.",
),
] = False,
artifacts_path: Annotated[ artifacts_path: Annotated[
Optional[Path], Optional[Path],
typer.Option(..., help="If provided, the location of the model artifacts."), typer.Option(..., help="If provided, the location of the model artifacts."),
@@ -375,6 +382,7 @@ def convert(
do_table_structure=True, do_table_structure=True,
do_code_enrichment=enrich_code, do_code_enrichment=enrich_code,
do_formula_enrichment=enrich_formula, do_formula_enrichment=enrich_formula,
do_picture_classification=enrich_picture_classes,
document_timeout=document_timeout, document_timeout=document_timeout,
) )
pipeline_options.table_structure_options.do_cell_matching = ( pipeline_options.table_structure_options.do_cell_matching = (

View File

@@ -6,6 +6,7 @@ from typing_extensions import TypeVar
from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
class BasePageModel(ABC): class BasePageModel(ABC):
@@ -21,6 +22,8 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
class GenericEnrichmentModel(ABC, Generic[EnrichElementT]): class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
elements_batch_size: int = settings.perf.elements_batch_size
@abstractmethod @abstractmethod
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool: def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
pass pass

View File

@@ -61,6 +61,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
Processes the given batch of elements and enriches them with predictions. Processes the given batch of elements and enriches them with predictions.
""" """
elements_batch_size = 5
images_scale = 1.66 # = 120 dpi, aligned with training data resolution images_scale = 1.66 # = 120 dpi, aligned with training data resolution
expansion_factor = 0.03 expansion_factor = 0.03

View File

@@ -79,7 +79,7 @@ class BasePipeline(ABC):
for model in self.enrichment_pipe: for model in self.enrichment_pipe:
for element_batch in chunkify( for element_batch in chunkify(
_prepare_elements(conv_res, model), _prepare_elements(conv_res, model),
settings.perf.elements_batch_size, model.elements_batch_size,
): ):
for element in model( for element in model(
doc=conv_res.document, element_batch=element_batch doc=conv_res.document, element_batch=element_batch