fix: fix OCR setting for pypdfium, minor refactor (#102)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
f8f2303348
commit
d96b96c848
@ -9,7 +9,6 @@ from typing import Annotated, Iterable, List, Optional
|
||||
|
||||
import typer
|
||||
from docling_core.utils.file import resolve_file_source
|
||||
from pydantic import AnyUrl
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
@ -181,58 +180,25 @@ def convert(
|
||||
else:
|
||||
input_doc_paths.append(source)
|
||||
|
||||
###########################################################################
|
||||
match backend:
|
||||
case Backend.PYPDFIUM2:
|
||||
do_cell_matching = ocr # only do cell matching when OCR enabled
|
||||
pdf_backend = PyPdfiumDocumentBackend
|
||||
case Backend.DOCLING:
|
||||
do_cell_matching = True
|
||||
pdf_backend = DoclingParseDocumentBackend
|
||||
case _:
|
||||
raise RuntimeError(f"Unexpected backend type {backend}")
|
||||
|
||||
# The following sections contain a combination of PipelineOptions
|
||||
# and PDF Backends for various configurations.
|
||||
# Uncomment one section at a time to see the differences in the output.
|
||||
|
||||
doc_converter = None
|
||||
if backend == Backend.PYPDFIUM2 and not ocr: # PyPdfium without OCR
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = False
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=PyPdfiumDocumentBackend,
|
||||
)
|
||||
|
||||
elif backend == Backend.PYPDFIUM2.value and ocr: # PyPdfium with OCR
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=PyPdfiumDocumentBackend,
|
||||
)
|
||||
|
||||
elif backend == Backend.DOCLING.value and not ocr: # Docling Parse without OCR
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
)
|
||||
|
||||
elif backend == Backend.DOCLING.value and ocr: # Docling Parse with OCR
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options.do_ocr = True
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
)
|
||||
|
||||
###########################################################################
|
||||
pipeline_options = PipelineOptions(
|
||||
do_ocr=ocr,
|
||||
do_table_structure=True,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
||||
doc_converter = DocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=pdf_backend,
|
||||
)
|
||||
|
||||
# Define input files
|
||||
input = DocumentConversionInput.from_paths(input_doc_paths)
|
||||
|
Loading…
Reference in New Issue
Block a user