From d96b96c8481a8ae68545a34aaf9b8d5a6637a6be Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Tue, 24 Sep 2024 14:36:00 +0200
Subject: [PATCH] fix: fix OCR setting for pypdfium, minor refactor (#102)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
 docling/cli/main.py | 70 ++++++++++++---------------------------------
 1 file changed, 18 insertions(+), 52 deletions(-)

diff --git a/docling/cli/main.py b/docling/cli/main.py
index 8ced2b4..894e9ab 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -9,7 +9,6 @@ from typing import Annotated, Iterable, List, Optional
 
 import typer
 from docling_core.utils.file import resolve_file_source
-from pydantic import AnyUrl
 
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
@@ -181,58 +180,25 @@ def convert(
         else:
             input_doc_paths.append(source)
 
-    ###########################################################################
+    match backend:
+        case Backend.PYPDFIUM2:
+            do_cell_matching = ocr  # only do cell matching when OCR enabled
+            pdf_backend = PyPdfiumDocumentBackend
+        case Backend.DOCLING:
+            do_cell_matching = True
+            pdf_backend = DoclingParseDocumentBackend
+        case _:
+            raise RuntimeError(f"Unexpected backend type {backend}")
 
-    # The following sections contain a combination of PipelineOptions
-    # and PDF Backends for various configurations.
-    # Uncomment one section at the time to see the differences in the output.
-
-    doc_converter = None
-    if backend == Backend.PYPDFIUM2 and not ocr:  # PyPdfium without OCR
-        pipeline_options = PipelineOptions()
-        pipeline_options.do_ocr = False
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = False
-
-        doc_converter = DocumentConverter(
-            pipeline_options=pipeline_options,
-            pdf_backend=PyPdfiumDocumentBackend,
-        )
-
-    elif backend == Backend.PYPDFIUM2.value and ocr:  # PyPdfium with OCR
-        pipeline_options = PipelineOptions()
-        pipeline_options.do_ocr = False
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = True
-
-        doc_converter = DocumentConverter(
-            pipeline_options=pipeline_options,
-            pdf_backend=PyPdfiumDocumentBackend,
-        )
-
-    elif backend == Backend.DOCLING.value and not ocr:  # Docling Parse without OCR
-        pipeline_options = PipelineOptions()
-        pipeline_options.do_ocr = False
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = True
-
-        doc_converter = DocumentConverter(
-            pipeline_options=pipeline_options,
-            pdf_backend=DoclingParseDocumentBackend,
-        )
-
-    elif backend == Backend.DOCLING.value and ocr:  # Docling Parse with OCR
-        pipeline_options = PipelineOptions()
-        pipeline_options.do_ocr = True
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = True
-
-        doc_converter = DocumentConverter(
-            pipeline_options=pipeline_options,
-            pdf_backend=DoclingParseDocumentBackend,
-        )
-
-    ###########################################################################
+    pipeline_options = PipelineOptions(
+        do_ocr=ocr,
+        do_table_structure=True,
+    )
+    pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
+    doc_converter = DocumentConverter(
+        pipeline_options=pipeline_options,
+        pdf_backend=pdf_backend,
+    )
 
     # Define input files
     input = DocumentConversionInput.from_paths(input_doc_paths)