fix: fix OCR setting for pypdfium, minor refactor (#102)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
f8f2303348
commit
d96b96c848
@ -9,7 +9,6 @@ from typing import Annotated, Iterable, List, Optional
|
|||||||
|
|
||||||
import typer
|
import typer
|
||||||
from docling_core.utils.file import resolve_file_source
|
from docling_core.utils.file import resolve_file_source
|
||||||
from pydantic import AnyUrl
|
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
@ -181,58 +180,25 @@ def convert(
|
|||||||
else:
|
else:
|
||||||
input_doc_paths.append(source)
|
input_doc_paths.append(source)
|
||||||
|
|
||||||
###########################################################################
|
match backend:
|
||||||
|
case Backend.PYPDFIUM2:
|
||||||
|
do_cell_matching = ocr # only do cell matching when OCR enabled
|
||||||
|
pdf_backend = PyPdfiumDocumentBackend
|
||||||
|
case Backend.DOCLING:
|
||||||
|
do_cell_matching = True
|
||||||
|
pdf_backend = DoclingParseDocumentBackend
|
||||||
|
case _:
|
||||||
|
raise RuntimeError(f"Unexpected backend type {backend}")
|
||||||
|
|
||||||
# The following sections contain a combination of PipelineOptions
|
pipeline_options = PipelineOptions(
|
||||||
# and PDF Backends for various configurations.
|
do_ocr=ocr,
|
||||||
# Uncomment one section at a time to see the differences in the output.
|
do_table_structure=True,
|
||||||
|
)
|
||||||
doc_converter = None
|
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
||||||
if backend == Backend.PYPDFIUM2 and not ocr: # PyPdfium without OCR
|
doc_converter = DocumentConverter(
|
||||||
pipeline_options = PipelineOptions()
|
pipeline_options=pipeline_options,
|
||||||
pipeline_options.do_ocr = False
|
pdf_backend=pdf_backend,
|
||||||
pipeline_options.do_table_structure = True
|
)
|
||||||
pipeline_options.table_structure_options.do_cell_matching = False
|
|
||||||
|
|
||||||
doc_converter = DocumentConverter(
|
|
||||||
pipeline_options=pipeline_options,
|
|
||||||
pdf_backend=PyPdfiumDocumentBackend,
|
|
||||||
)
|
|
||||||
|
|
||||||
elif backend == Backend.PYPDFIUM2.value and ocr: # PyPdfium with OCR
|
|
||||||
pipeline_options = PipelineOptions()
|
|
||||||
pipeline_options.do_ocr = False
|
|
||||||
pipeline_options.do_table_structure = True
|
|
||||||
pipeline_options.table_structure_options.do_cell_matching = True
|
|
||||||
|
|
||||||
doc_converter = DocumentConverter(
|
|
||||||
pipeline_options=pipeline_options,
|
|
||||||
pdf_backend=PyPdfiumDocumentBackend,
|
|
||||||
)
|
|
||||||
|
|
||||||
elif backend == Backend.DOCLING.value and not ocr: # Docling Parse without OCR
|
|
||||||
pipeline_options = PipelineOptions()
|
|
||||||
pipeline_options.do_ocr = False
|
|
||||||
pipeline_options.do_table_structure = True
|
|
||||||
pipeline_options.table_structure_options.do_cell_matching = True
|
|
||||||
|
|
||||||
doc_converter = DocumentConverter(
|
|
||||||
pipeline_options=pipeline_options,
|
|
||||||
pdf_backend=DoclingParseDocumentBackend,
|
|
||||||
)
|
|
||||||
|
|
||||||
elif backend == Backend.DOCLING.value and ocr: # Docling Parse with OCR
|
|
||||||
pipeline_options = PipelineOptions()
|
|
||||||
pipeline_options.do_ocr = True
|
|
||||||
pipeline_options.do_table_structure = True
|
|
||||||
pipeline_options.table_structure_options.do_cell_matching = True
|
|
||||||
|
|
||||||
doc_converter = DocumentConverter(
|
|
||||||
pipeline_options=pipeline_options,
|
|
||||||
pdf_backend=DoclingParseDocumentBackend,
|
|
||||||
)
|
|
||||||
|
|
||||||
###########################################################################
|
|
||||||
|
|
||||||
# Define input files
|
# Define input files
|
||||||
input = DocumentConversionInput.from_paths(input_doc_paths)
|
input = DocumentConversionInput.from_paths(input_doc_paths)
|
||||||
|
Loading…
Reference in New Issue
Block a user