fix: fix OCR setting for pypdfium, minor refactor (#102)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
2024-09-24 14:36:00 +02:00
parent f8f2303348
commit d96b96c848
1 changed file with 18 additions and 52 deletions
@@ -9,7 +9,6 @@ from typing import Annotated, Iterable, List, Optional
 import typer
 from docling_core.utils.file import resolve_file_source
 from pydantic import AnyUrl
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
@@ -181,58 +180,25 @@ def convert(
        else:
            input_doc_paths.append(source)
-    ###########################################################################
+    match backend:
        case Backend.PYPDFIUM2:
            do_cell_matching = ocr  # only do cell matching when OCR enabled
            pdf_backend = PyPdfiumDocumentBackend
        case Backend.DOCLING:
            do_cell_matching = True
            pdf_backend = DoclingParseDocumentBackend
        case _:
            raise RuntimeError(f"Unexpected backend type {backend}")
-    # The following sections contain a combination of PipelineOptions
+    pipeline_options = PipelineOptions(
-    # and PDF Backends for various configurations.
+        do_ocr=ocr,
-    # Uncomment one section at the time to see the differences in the output.
+        do_table_structure=True,
-
+    )
-    doc_converter = None
+    pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
-    if backend == Backend.PYPDFIUM2 and not ocr:  # PyPdfium without OCR
+    doc_converter = DocumentConverter(
-        pipeline_options = PipelineOptions()
+        pipeline_options=pipeline_options,
-        pipeline_options.do_ocr = False
+        pdf_backend=pdf_backend,
-        pipeline_options.do_table_structure = True
+    )
        pipeline_options.table_structure_options.do_cell_matching = False
        doc_converter = DocumentConverter(
            pipeline_options=pipeline_options,
            pdf_backend=PyPdfiumDocumentBackend,
        )
    elif backend == Backend.PYPDFIUM2.value and ocr:  # PyPdfium with OCR
        pipeline_options = PipelineOptions()
        pipeline_options.do_ocr = False
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True
        doc_converter = DocumentConverter(
            pipeline_options=pipeline_options,
            pdf_backend=PyPdfiumDocumentBackend,
        )
    elif backend == Backend.DOCLING.value and not ocr:  # Docling Parse without OCR
        pipeline_options = PipelineOptions()
        pipeline_options.do_ocr = False
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True
        doc_converter = DocumentConverter(
            pipeline_options=pipeline_options,
            pdf_backend=DoclingParseDocumentBackend,
        )
    elif backend == Backend.DOCLING.value and ocr:  # Docling Parse with OCR
        pipeline_options = PipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True
        doc_converter = DocumentConverter(
            pipeline_options=pipeline_options,
            pdf_backend=DoclingParseDocumentBackend,
        )
    ###########################################################################
    # Define input files
    input = DocumentConversionInput.from_paths(input_doc_paths)