From 39429231257501ed6951acd163973adf478503ea Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Wed, 28 May 2025 17:55:31 +0200 Subject: [PATCH] chore: fix or ignore runtime and deprecation warnings (#1660) * chore: fix or catch deprecation warnings Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * chore: update poetry lock with latest docling-core Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/models/layout_model.py | 22 +++- docling/models/page_preprocessing_model.py | 17 ++- docling/pipeline/standard_pdf_pipeline.py | 122 +++++++++++---------- poetry.lock | 9 +- tests/test_backend_csv.py | 11 +- tests/test_legacy_format_transform.py | 14 +-- tests/verify_utils.py | 8 +- 7 files changed, 116 insertions(+), 87 deletions(-) diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index e2abb37..03a047f 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -185,13 +185,23 @@ class LayoutModel(BasePageModel): ).postprocess() # processed_clusters, processed_cells = clusters, page.cells - conv_res.confidence.pages[page.page_no].layout_score = float( - np.mean([c.confidence for c in processed_clusters]) - ) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Mean of empty slice|invalid value encountered in scalar divide", + RuntimeWarning, + "numpy", + ) - conv_res.confidence.pages[page.page_no].ocr_score = float( - np.mean([c.confidence for c in processed_cells if c.from_ocr]) - ) + conv_res.confidence.pages[page.page_no].layout_score = float( + np.mean([c.confidence for c in processed_clusters]) + ) + + conv_res.confidence.pages[page.page_no].ocr_score = float( + np.mean( + [c.confidence for c in processed_cells if c.from_ocr] + ) + ) page.cells = processed_cells page.predictions.layout = LayoutPrediction( diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py index 6a1dcf1..3cfa635 100644 --- a/docling/models/page_preprocessing_model.py +++ b/docling/models/page_preprocessing_model.py @@ -1,4 +1,5 @@ import re +import warnings from collections.abc import Iterable from pathlib import Path from typing import Optional @@ -7,7 +8,7 @@ import numpy as np from PIL import ImageDraw from pydantic import BaseModel -from docling.datamodel.base_models import Page, ScoreValue +from docling.datamodel.base_models import Page from docling.datamodel.document import ConversionResult from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel @@ -76,11 +77,15 @@ class PagePreprocessingModel(BasePageModel): score = self.rate_text_quality(c.text) text_scores.append(score) - conv_res.confidence.pages[page.page_no].parse_score = float( - np.nanquantile( - text_scores, q=0.10 - ) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells. - ) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Mean of empty slice", RuntimeWarning, "numpy" + ) + conv_res.confidence.pages[page.page_no].parse_score = float( + np.nanquantile( + text_scores, q=0.10 + ) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells. + ) # DEBUG code: def draw_text_boxes(image, cells, show: bool = False): diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 4269900..88317fd 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -8,7 +8,7 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend -from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores +from docling.datamodel.base_models import AssembledUnit, Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.settings import settings @@ -55,11 +55,13 @@ class StandardPdfPipeline(PaginatedPipeline): "When defined, it must point to a folder containing all models required by the pipeline." ) - self.keep_images = ( - self.pipeline_options.generate_page_images - or self.pipeline_options.generate_picture_images - or self.pipeline_options.generate_table_images - ) + with warnings.catch_warnings(): # deprecated generate_table_images + warnings.filterwarnings("ignore", category=DeprecationWarning) + self.keep_images = ( + self.pipeline_options.generate_page_images + or self.pipeline_options.generate_picture_images + or self.pipeline_options.generate_table_images + ) self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions()) @@ -210,64 +212,74 @@ class StandardPdfPipeline(PaginatedPipeline): ) # Generate images of the requested element types - if ( - self.pipeline_options.generate_picture_images - or self.pipeline_options.generate_table_images - ): - scale = self.pipeline_options.images_scale - for element, _level in conv_res.document.iterate_items(): - if not isinstance(element, DocItem) or len(element.prov) == 0: - continue - if ( - isinstance(element, PictureItem) - and self.pipeline_options.generate_picture_images - ) or ( - isinstance(element, TableItem) - and self.pipeline_options.generate_table_images - ): - page_ix = element.prov[0].page_no - 1 - page = next( - (p for p in conv_res.pages if p.page_no == page_ix), - cast("Page", None), - ) - assert page is not None - assert page.size is not None - assert page.image is not None + with warnings.catch_warnings(): # deprecated generate_table_images + warnings.filterwarnings("ignore", category=DeprecationWarning) + if ( + self.pipeline_options.generate_picture_images + or self.pipeline_options.generate_table_images + ): + scale = self.pipeline_options.images_scale + for element, _level in conv_res.document.iterate_items(): + if not isinstance(element, DocItem) or len(element.prov) == 0: + continue + if ( + isinstance(element, PictureItem) + and self.pipeline_options.generate_picture_images + ) or ( + isinstance(element, TableItem) + and self.pipeline_options.generate_table_images + ): + page_ix = element.prov[0].page_no - 1 + page = next( + (p for p in conv_res.pages if p.page_no == page_ix), + cast("Page", None), + ) + assert page is not None + assert page.size is not None + assert page.image is not None - crop_bbox = ( - element.prov[0] - .bbox.scaled(scale=scale) - .to_top_left_origin(page_height=page.size.height * scale) - ) + crop_bbox = ( + element.prov[0] + .bbox.scaled(scale=scale) + .to_top_left_origin( + page_height=page.size.height * scale + ) + ) - cropped_im = page.image.crop(crop_bbox.as_tuple()) - element.image = ImageRef.from_pil( - cropped_im, dpi=int(72 * scale) - ) + cropped_im = page.image.crop(crop_bbox.as_tuple()) + element.image = ImageRef.from_pil( + cropped_im, dpi=int(72 * scale) + ) # Aggregate confidence values for document: if len(conv_res.pages) > 0: - conv_res.confidence.layout_score = float( - np.nanmean( - [c.layout_score for c in conv_res.confidence.pages.values()] + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + category=RuntimeWarning, + message="Mean of empty slice|All-NaN slice encountered", ) - ) - conv_res.confidence.parse_score = float( - np.nanquantile( - [c.parse_score for c in conv_res.confidence.pages.values()], - q=0.1, # parse score should relate to worst 10% of pages. + conv_res.confidence.layout_score = float( + np.nanmean( + [c.layout_score for c in conv_res.confidence.pages.values()] + ) ) - ) - conv_res.confidence.table_score = float( - np.nanmean( - [c.table_score for c in conv_res.confidence.pages.values()] + conv_res.confidence.parse_score = float( + np.nanquantile( + [c.parse_score for c in conv_res.confidence.pages.values()], + q=0.1, # parse score should relate to worst 10% of pages. + ) ) - ) - conv_res.confidence.ocr_score = float( - np.nanmean( - [c.ocr_score for c in conv_res.confidence.pages.values()] + conv_res.confidence.table_score = float( + np.nanmean( + [c.table_score for c in conv_res.confidence.pages.values()] + ) + ) + conv_res.confidence.ocr_score = float( + np.nanmean( + [c.ocr_score for c in conv_res.confidence.pages.values()] + ) ) - ) return conv_res diff --git a/poetry.lock b/poetry.lock index 12c1b15..0d4e1f0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1018,15 +1018,15 @@ files = [ [[package]] name = "docling-core" -version = "2.31.2" +version = "2.32.0" description = "A python library to define and validate data types in Docling." optional = false python-versions = "<4.0,>=3.9" groups = ["main"] markers = "platform_system == \"Linux\" and sys_platform == \"darwin\" and (platform_machine == \"aarch64\" or platform_machine == \"x86_64\") or platform_machine == \"aarch64\" and platform_system == \"Linux\" or platform_machine == \"x86_64\" and sys_platform == \"darwin\"" files = [ - {file = "docling_core-2.31.2-py3-none-any.whl", hash = "sha256:a6db62ac49febcc9e3e24a9acf58e88342ad7f76ab03217b6a3365509eb12eda"}, - {file = "docling_core-2.31.2.tar.gz", hash = "sha256:6d61863ce492affc45aa522c291631db0be7c50dc146cb93c42af7ff00bd22a2"}, + {file = "docling_core-2.32.0-py3-none-any.whl", hash = "sha256:6c643b45a18c5ed8cecf12d1eeeb7ff677dcfdb24fa4aa88122e3c9cc2aeb58d"}, + {file = "docling_core-2.32.0.tar.gz", hash = "sha256:3ec21461f309540bd8bf4880f6c2f0144f6895988102a4204ca5549c76a945c8"}, ] [package.dependencies] @@ -2640,11 +2640,8 @@ files = [ {file = "lxml-5.4.0-cp36-cp36m-win_amd64.whl", hash = "sha256:7ce1a171ec325192c6a636b64c94418e71a1964f56d002cc28122fceff0b6121"}, {file = "lxml-5.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:795f61bcaf8770e1b37eec24edf9771b307df3af74d1d6f27d812e15a9ff3872"}, {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:29f451a4b614a7b5b6c2e043d7b64a15bd8304d7e767055e8ab68387a8cacf4e"}, - {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:891f7f991a68d20c75cb13c5c9142b2a3f9eb161f1f12a9489c82172d1f133c0"}, {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4aa412a82e460571fad592d0f93ce9935a20090029ba08eca05c614f99b0cc92"}, - {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:ac7ba71f9561cd7d7b55e1ea5511543c0282e2b6450f122672a2694621d63b7e"}, {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:c5d32f5284012deaccd37da1e2cd42f081feaa76981f0eaa474351b68df813c5"}, - {file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:ce31158630a6ac85bddd6b830cffd46085ff90498b397bd0a259f59d27a12188"}, {file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:31e63621e073e04697c1b2d23fcb89991790eef370ec37ce4d5d469f40924ed6"}, {file = "lxml-5.4.0-cp37-cp37m-win32.whl", hash = "sha256:be2ba4c3c5b7900246a8f866580700ef0d538f2ca32535e991027bdaba944063"}, {file = "lxml-5.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:09846782b1ef650b321484ad429217f5154da4d6e786636c38e434fa32e94e49"}, diff --git a/tests/test_backend_csv.py b/tests/test_backend_csv.py index d929ae1..f7b5d30 100644 --- a/tests/test_backend_csv.py +++ b/tests/test_backend_csv.py @@ -39,8 +39,15 @@ def test_e2e_valid_csv_conversions(): print(f"converting {csv_path}") gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name - - conv_result: ConversionResult = converter.convert(csv_path) + if csv_path.stem in ( + "csv-too-few-columns", + "csv-too-many-columns", + "csv-inconsistent-header", + ): + with warns(UserWarning, match="Inconsistent column lengths"): + conv_result: ConversionResult = converter.convert(csv_path) + else: + conv_result: ConversionResult = converter.convert(csv_path) doc: DoclingDocument = conv_result.document diff --git a/tests/test_legacy_format_transform.py b/tests/test_legacy_format_transform.py index caef8ff..73c73c5 100644 --- a/tests/test_legacy_format_transform.py +++ b/tests/test_legacy_format_transform.py @@ -38,17 +38,15 @@ def get_converter(): def test_compare_legacy_output(test_doc_paths): converter = get_converter() - res = converter.convert_all(test_doc_paths, raises_on_error=True) - for conv_res in res: print(f"Results for {conv_res.input.file}") - print( - json.dumps( - conv_res.legacy_document.model_dump( - mode="json", by_alias=True, exclude_none=True + with pytest.warns(DeprecationWarning, match="Use document instead"): + print( + json.dumps( + conv_res.legacy_document.model_dump( + mode="json", by_alias=True, exclude_none=True + ) ) ) - ) - # assert res.legacy_output == res.legacy_output_transformed diff --git a/tests/verify_utils.py b/tests/verify_utils.py index 46a46ac..0db5350 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -4,6 +4,7 @@ import warnings from pathlib import Path from typing import List, Optional +import pytest from docling_core.types.doc import ( DocItem, DoclingDocument, @@ -302,9 +303,8 @@ def verify_conversion_result_v1( ) doc_pred_pages: List[Page] = doc_result.pages - doc_pred: DsDocument = doc_result.legacy_document - with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) + with pytest.warns(DeprecationWarning, match="Use document instead"): + doc_pred: DsDocument = doc_result.legacy_document doc_pred_md = doc_result.legacy_document.export_to_markdown() doc_pred_dt = doc_result.legacy_document.export_to_document_tokens() @@ -391,7 +391,7 @@ def verify_conversion_result_v2( doc_pred_pages: List[Page] = doc_result.pages doc_pred: DoclingDocument = doc_result.document doc_pred_md = doc_result.document.export_to_markdown() - doc_pred_dt = doc_result.document.export_to_document_tokens() + doc_pred_dt = doc_result.document.export_to_doctags() engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"