feat: New document picture classifier (#805)

* figure classifier Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com> * gt for e2e tests Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com> * tests Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com> --------- Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>
2025-01-24 18:05:51 +01:00
parent 88a0e66adc
commit 16a218d871
15 changed files with 369 additions and 21 deletions
@@ -221,6 +221,7 @@ class PdfPipelineOptions(PipelineOptions):
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
    do_code_enrichment: bool = False  # True: perform code OCR
    do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
+    do_picture_classification: bool = False  # True: classify pictures in documents

    table_structure_options: TableStructureOptions = TableStructureOptions()
    ocr_options: Union[