feat: New document picture classifier (#805)

* figure classifier

Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>

* gt for e2e tests

Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>

* tests

Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>

---------

Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>
This commit is contained in:
Matteo
2025-01-24 18:05:51 +01:00
committed by GitHub
parent 88a0e66adc
commit 16a218d871
15 changed files with 369 additions and 21 deletions

View File

@@ -221,6 +221,7 @@ class PdfPipelineOptions(PipelineOptions):
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
do_code_enrichment: bool = False # True: perform code OCR
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
do_picture_classification: bool = False # True: classify pictures in documents
table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[