"""End-to-end PDF conversion test across table-structure option combinations."""

from pathlib import Path

import pytest

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
from docling.document_converter import DocumentConverter, PdfFormatOption


@pytest.fixture
def test_doc_path():
    """Provide the relative path of the sample PDF converted by this module's test."""
    return Path("./tests/data/2206.01062.pdf")


def _make_table_converter(cell_matching, mode):
    """Build a PDF converter with OCR off and table structure on.

    Args:
        cell_matching: Value assigned to
            ``table_structure_options.do_cell_matching``.
        mode: ``TableFormerMode`` assigned to ``table_structure_options.mode``.

    Returns:
        A ``DocumentConverter`` whose PDF format option uses the configured
        pipeline options and ``DoclingParseDocumentBackend``.
    """
    options = PdfPipelineOptions()
    options.do_ocr = False
    options.do_table_structure = True
    options.table_structure_options.do_cell_matching = cell_matching
    options.table_structure_options.mode = mode

    pdf_option = PdfFormatOption(
        pipeline_options=options,
        backend=DoclingParseDocumentBackend,
    )
    return DocumentConverter(format_options={InputFormat.PDF: pdf_option})


def get_converters_with_table_options():
    """Yield one converter per (cell matching, TableFormer mode) combination.

    Cell matching takes ``True`` then ``False``; for each value the mode takes
    ``FAST`` then ``ACCURATE``, giving four converters in total. Each
    converter is constructed only when the generator is advanced to it.
    """
    for matching in (True, False):
        for table_mode in (TableFormerMode.FAST, TableFormerMode.ACCURATE):
            yield _make_table_converter(matching, table_mode)


def test_e2e_conversions(test_doc_path):
    """Convert the sample PDF with every converter and expect SUCCESS each time."""
    for pdf_converter in get_converters_with_table_options():
        print(f"converting {test_doc_path}")

        conversion: ConversionResult = pdf_converter.convert(test_doc_path)

        assert conversion.status == ConversionStatus.SUCCESS