feat: New document picture classifier (#805)

* figure classifier

Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>

* ground truth (gt) for e2e tests

Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>

* tests

Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>

---------

Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>
This commit is contained in:
Matteo
2025-01-24 18:05:51 +01:00
committed by GitHub
parent 88a0e66adc
commit 16a218d871
15 changed files with 369 additions and 21 deletions

View File

@@ -19,6 +19,10 @@ from docling.datamodel.pipeline_options import (
)
from docling.models.base_ocr_model import BaseOcrModel
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
from docling.models.document_picture_classifier import (
DocumentPictureClassifier,
DocumentPictureClassifierOptions,
)
from docling.models.ds_glm_model import GlmModel, GlmOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
@@ -104,6 +108,13 @@ class StandardPdfPipeline(PaginatedPipeline):
),
accelerator_options=pipeline_options.accelerator_options,
),
# Document Picture Classifier
DocumentPictureClassifier(
enabled=pipeline_options.do_picture_classification,
artifacts_path=pipeline_options.artifacts_path,
options=DocumentPictureClassifierOptions(),
accelerator_options=pipeline_options.accelerator_options,
),
]
if (