From dbab30e92cc1d130ce7f9335ab9c46aa7a30930d Mon Sep 17 00:00:00 2001 From: Mahafuzur Rahman Date: Tue, 17 Jun 2025 17:58:45 +0600 Subject: [PATCH] fix: formula conversion with page_range param set (#1791) When the page_range param is used for formula conversion, the system raises a "list index out of range" error, because the page index was computed from the absolute page number instead of relative to the first page in the converted range. Added a test to validate that the fix works. Signed-off-by: Masum --- docling/models/base_model.py | 2 +- tests/test_code_formula.py | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/docling/models/base_model.py b/docling/models/base_model.py index 04df812..b0a43f4 100644 --- a/docling/models/base_model.py +++ b/docling/models/base_model.py @@ -86,7 +86,7 @@ class BaseItemAndImageEnrichmentModel( coord_origin=bbox.coord_origin, ) - page_ix = element_prov.page_no - 1 + page_ix = element_prov.page_no - conv_res.pages[0].page_no - 1 cropped_image = conv_res.pages[page_ix].get_image( scale=self.images_scale, cropbox=expanded_bbox ) diff --git a/tests/test_code_formula.py b/tests/test_code_formula.py index e5d52da..af031c9 100644 --- a/tests/test_code_formula.py +++ b/tests/test_code_formula.py @@ -60,3 +60,25 @@ def test_code_and_formula_conversion(): gt = "a ^ { 2 } + 8 = 1 2" predicted = formula_blocks[0].text assert predicted == gt, f"mismatch in text {predicted=}, {gt=}" + + +def test_formula_conversion_with_page_range(): + pdf_path = Path("tests/data/pdf/code_and_formula.pdf") + converter = get_converter() + + print(f"converting {pdf_path} with page range") + + doc_result: ConversionResult = converter.convert(pdf_path, page_range=(2, 2)) + + results = doc_result.document.texts + + formula_blocks = [ + el + for el in results + if isinstance(el, TextItem) and el.label == DocItemLabel.FORMULA + ] + assert len(formula_blocks) == 1 + + gt = "a ^ { 2 } + 8 = 1 2" + predicted = formula_blocks[0].text + assert predicted == gt, f"mismatch in text {predicted=}, {gt=}"