fix: formula conversion with page_range param set (#1791)

When the `page_range` parameter is used for formula conversion,
the system throws a "list index out of range" error.

Included tests to validate that the fix works.

Signed-off-by: Masum <masumsofts@yahoo.com>
This commit is contained in:
Mahafuzur Rahman 2025-06-17 17:58:45 +06:00 committed by GitHub
parent c2ef69718a
commit dbab30e92c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 23 additions and 1 deletions

View File

@ -86,7 +86,7 @@ class BaseItemAndImageEnrichmentModel(
coord_origin=bbox.coord_origin,
)
page_ix = element_prov.page_no - 1
page_ix = element_prov.page_no - conv_res.pages[0].page_no - 1
cropped_image = conv_res.pages[page_ix].get_image(
scale=self.images_scale, cropbox=expanded_bbox
)

View File

@ -60,3 +60,25 @@ def test_code_and_formula_conversion():
gt = "a ^ { 2 } + 8 = 1 2"
predicted = formula_blocks[0].text
assert predicted == gt, f"mismatch in text {predicted=}, {gt=}"
def test_formula_conversion_with_page_range():
    """Convert only page 2 of the sample PDF and check its single formula.

    The conversion is restricted with ``page_range=(2, 2)``; exactly one
    formula item is expected, and its text must match the ground truth.
    """
    pdf_path = Path("tests/data/pdf/code_and_formula.pdf")
    doc_converter = get_converter()

    print(f"converting {pdf_path} with page range")

    conversion: ConversionResult = doc_converter.convert(
        pdf_path, page_range=(2, 2)
    )

    # Collect every text item that is labelled as a formula.
    formulas = []
    for item in conversion.document.texts:
        if not isinstance(item, TextItem):
            continue
        if item.label == DocItemLabel.FORMULA:
            formulas.append(item)
    assert len(formulas) == 1

    # `gt` and `predicted` keep their names: the `{name=}` f-string
    # specifiers below embed the variable names in the failure message.
    gt = "a ^ { 2 } + 8 = 1 2"
    predicted = formulas[0].text
    assert predicted == gt, f"mismatch in text {predicted=}, {gt=}"