chore: fix or ignore runtime and deprecation warnings (#1660)

* chore: fix or catch deprecation warnings

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* chore: update poetry lock with latest docling-core

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-05-28 17:55:31 +02:00 committed by GitHub
parent b3e0042813
commit 3942923125
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 116 additions and 87 deletions

View File

@ -185,13 +185,23 @@ class LayoutModel(BasePageModel):
).postprocess() ).postprocess()
# processed_clusters, processed_cells = clusters, page.cells # processed_clusters, processed_cells = clusters, page.cells
conv_res.confidence.pages[page.page_no].layout_score = float( with warnings.catch_warnings():
np.mean([c.confidence for c in processed_clusters]) warnings.filterwarnings(
) "ignore",
"Mean of empty slice|invalid value encountered in scalar divide",
RuntimeWarning,
"numpy",
)
conv_res.confidence.pages[page.page_no].ocr_score = float( conv_res.confidence.pages[page.page_no].layout_score = float(
np.mean([c.confidence for c in processed_cells if c.from_ocr]) np.mean([c.confidence for c in processed_clusters])
) )
conv_res.confidence.pages[page.page_no].ocr_score = float(
np.mean(
[c.confidence for c in processed_cells if c.from_ocr]
)
)
page.cells = processed_cells page.cells = processed_cells
page.predictions.layout = LayoutPrediction( page.predictions.layout = LayoutPrediction(

View File

@ -1,4 +1,5 @@
import re import re
import warnings
from collections.abc import Iterable from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
@ -7,7 +8,7 @@ import numpy as np
from PIL import ImageDraw from PIL import ImageDraw
from pydantic import BaseModel from pydantic import BaseModel
from docling.datamodel.base_models import Page, ScoreValue from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel from docling.models.base_model import BasePageModel
@ -76,11 +77,15 @@ class PagePreprocessingModel(BasePageModel):
score = self.rate_text_quality(c.text) score = self.rate_text_quality(c.text)
text_scores.append(score) text_scores.append(score)
conv_res.confidence.pages[page.page_no].parse_score = float( with warnings.catch_warnings():
np.nanquantile( warnings.filterwarnings(
text_scores, q=0.10 "ignore", "Mean of empty slice", RuntimeWarning, "numpy"
) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells. )
) conv_res.confidence.pages[page.page_no].parse_score = float(
np.nanquantile(
text_scores, q=0.10
) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
)
# DEBUG code: # DEBUG code:
def draw_text_boxes(image, cells, show: bool = False): def draw_text_boxes(image, cells, show: bool = False):

View File

@ -8,7 +8,7 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores from docling.datamodel.base_models import AssembledUnit, Page
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
@ -55,11 +55,13 @@ class StandardPdfPipeline(PaginatedPipeline):
"When defined, it must point to a folder containing all models required by the pipeline." "When defined, it must point to a folder containing all models required by the pipeline."
) )
self.keep_images = ( with warnings.catch_warnings(): # deprecated generate_table_images
self.pipeline_options.generate_page_images warnings.filterwarnings("ignore", category=DeprecationWarning)
or self.pipeline_options.generate_picture_images self.keep_images = (
or self.pipeline_options.generate_table_images self.pipeline_options.generate_page_images
) or self.pipeline_options.generate_picture_images
or self.pipeline_options.generate_table_images
)
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions()) self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
@ -210,64 +212,74 @@ class StandardPdfPipeline(PaginatedPipeline):
) )
# Generate images of the requested element types # Generate images of the requested element types
if ( with warnings.catch_warnings(): # deprecated generate_table_images
self.pipeline_options.generate_picture_images warnings.filterwarnings("ignore", category=DeprecationWarning)
or self.pipeline_options.generate_table_images if (
): self.pipeline_options.generate_picture_images
scale = self.pipeline_options.images_scale or self.pipeline_options.generate_table_images
for element, _level in conv_res.document.iterate_items(): ):
if not isinstance(element, DocItem) or len(element.prov) == 0: scale = self.pipeline_options.images_scale
continue for element, _level in conv_res.document.iterate_items():
if ( if not isinstance(element, DocItem) or len(element.prov) == 0:
isinstance(element, PictureItem) continue
and self.pipeline_options.generate_picture_images if (
) or ( isinstance(element, PictureItem)
isinstance(element, TableItem) and self.pipeline_options.generate_picture_images
and self.pipeline_options.generate_table_images ) or (
): isinstance(element, TableItem)
page_ix = element.prov[0].page_no - 1 and self.pipeline_options.generate_table_images
page = next( ):
(p for p in conv_res.pages if p.page_no == page_ix), page_ix = element.prov[0].page_no - 1
cast("Page", None), page = next(
) (p for p in conv_res.pages if p.page_no == page_ix),
assert page is not None cast("Page", None),
assert page.size is not None )
assert page.image is not None assert page is not None
assert page.size is not None
assert page.image is not None
crop_bbox = ( crop_bbox = (
element.prov[0] element.prov[0]
.bbox.scaled(scale=scale) .bbox.scaled(scale=scale)
.to_top_left_origin(page_height=page.size.height * scale) .to_top_left_origin(
) page_height=page.size.height * scale
)
)
cropped_im = page.image.crop(crop_bbox.as_tuple()) cropped_im = page.image.crop(crop_bbox.as_tuple())
element.image = ImageRef.from_pil( element.image = ImageRef.from_pil(
cropped_im, dpi=int(72 * scale) cropped_im, dpi=int(72 * scale)
) )
# Aggregate confidence values for document: # Aggregate confidence values for document:
if len(conv_res.pages) > 0: if len(conv_res.pages) > 0:
conv_res.confidence.layout_score = float( with warnings.catch_warnings():
np.nanmean( warnings.filterwarnings(
[c.layout_score for c in conv_res.confidence.pages.values()] "ignore",
category=RuntimeWarning,
message="Mean of empty slice|All-NaN slice encountered",
) )
) conv_res.confidence.layout_score = float(
conv_res.confidence.parse_score = float( np.nanmean(
np.nanquantile( [c.layout_score for c in conv_res.confidence.pages.values()]
[c.parse_score for c in conv_res.confidence.pages.values()], )
q=0.1, # parse score should relate to worst 10% of pages.
) )
) conv_res.confidence.parse_score = float(
conv_res.confidence.table_score = float( np.nanquantile(
np.nanmean( [c.parse_score for c in conv_res.confidence.pages.values()],
[c.table_score for c in conv_res.confidence.pages.values()] q=0.1, # parse score should relate to worst 10% of pages.
)
) )
) conv_res.confidence.table_score = float(
conv_res.confidence.ocr_score = float( np.nanmean(
np.nanmean( [c.table_score for c in conv_res.confidence.pages.values()]
[c.ocr_score for c in conv_res.confidence.pages.values()] )
)
conv_res.confidence.ocr_score = float(
np.nanmean(
[c.ocr_score for c in conv_res.confidence.pages.values()]
)
) )
)
return conv_res return conv_res

9
poetry.lock generated
View File

@ -1018,15 +1018,15 @@ files = [
[[package]] [[package]]
name = "docling-core" name = "docling-core"
version = "2.31.2" version = "2.32.0"
description = "A python library to define and validate data types in Docling." description = "A python library to define and validate data types in Docling."
optional = false optional = false
python-versions = "<4.0,>=3.9" python-versions = "<4.0,>=3.9"
groups = ["main"] groups = ["main"]
markers = "platform_system == \"Linux\" and sys_platform == \"darwin\" and (platform_machine == \"aarch64\" or platform_machine == \"x86_64\") or platform_machine == \"aarch64\" and platform_system == \"Linux\" or platform_machine == \"x86_64\" and sys_platform == \"darwin\"" markers = "platform_system == \"Linux\" and sys_platform == \"darwin\" and (platform_machine == \"aarch64\" or platform_machine == \"x86_64\") or platform_machine == \"aarch64\" and platform_system == \"Linux\" or platform_machine == \"x86_64\" and sys_platform == \"darwin\""
files = [ files = [
{file = "docling_core-2.31.2-py3-none-any.whl", hash = "sha256:a6db62ac49febcc9e3e24a9acf58e88342ad7f76ab03217b6a3365509eb12eda"}, {file = "docling_core-2.32.0-py3-none-any.whl", hash = "sha256:6c643b45a18c5ed8cecf12d1eeeb7ff677dcfdb24fa4aa88122e3c9cc2aeb58d"},
{file = "docling_core-2.31.2.tar.gz", hash = "sha256:6d61863ce492affc45aa522c291631db0be7c50dc146cb93c42af7ff00bd22a2"}, {file = "docling_core-2.32.0.tar.gz", hash = "sha256:3ec21461f309540bd8bf4880f6c2f0144f6895988102a4204ca5549c76a945c8"},
] ]
[package.dependencies] [package.dependencies]
@ -2640,11 +2640,8 @@ files = [
{file = "lxml-5.4.0-cp36-cp36m-win_amd64.whl", hash = "sha256:7ce1a171ec325192c6a636b64c94418e71a1964f56d002cc28122fceff0b6121"}, {file = "lxml-5.4.0-cp36-cp36m-win_amd64.whl", hash = "sha256:7ce1a171ec325192c6a636b64c94418e71a1964f56d002cc28122fceff0b6121"},
{file = "lxml-5.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:795f61bcaf8770e1b37eec24edf9771b307df3af74d1d6f27d812e15a9ff3872"}, {file = "lxml-5.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:795f61bcaf8770e1b37eec24edf9771b307df3af74d1d6f27d812e15a9ff3872"},
{file = "lxml-5.4.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:29f451a4b614a7b5b6c2e043d7b64a15bd8304d7e767055e8ab68387a8cacf4e"}, {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:29f451a4b614a7b5b6c2e043d7b64a15bd8304d7e767055e8ab68387a8cacf4e"},
{file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:891f7f991a68d20c75cb13c5c9142b2a3f9eb161f1f12a9489c82172d1f133c0"},
{file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4aa412a82e460571fad592d0f93ce9935a20090029ba08eca05c614f99b0cc92"}, {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4aa412a82e460571fad592d0f93ce9935a20090029ba08eca05c614f99b0cc92"},
{file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:ac7ba71f9561cd7d7b55e1ea5511543c0282e2b6450f122672a2694621d63b7e"},
{file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:c5d32f5284012deaccd37da1e2cd42f081feaa76981f0eaa474351b68df813c5"}, {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:c5d32f5284012deaccd37da1e2cd42f081feaa76981f0eaa474351b68df813c5"},
{file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:ce31158630a6ac85bddd6b830cffd46085ff90498b397bd0a259f59d27a12188"},
{file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:31e63621e073e04697c1b2d23fcb89991790eef370ec37ce4d5d469f40924ed6"}, {file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:31e63621e073e04697c1b2d23fcb89991790eef370ec37ce4d5d469f40924ed6"},
{file = "lxml-5.4.0-cp37-cp37m-win32.whl", hash = "sha256:be2ba4c3c5b7900246a8f866580700ef0d538f2ca32535e991027bdaba944063"}, {file = "lxml-5.4.0-cp37-cp37m-win32.whl", hash = "sha256:be2ba4c3c5b7900246a8f866580700ef0d538f2ca32535e991027bdaba944063"},
{file = "lxml-5.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:09846782b1ef650b321484ad429217f5154da4d6e786636c38e434fa32e94e49"}, {file = "lxml-5.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:09846782b1ef650b321484ad429217f5154da4d6e786636c38e434fa32e94e49"},

View File

@ -39,8 +39,15 @@ def test_e2e_valid_csv_conversions():
print(f"converting {csv_path}") print(f"converting {csv_path}")
gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name
if csv_path.stem in (
conv_result: ConversionResult = converter.convert(csv_path) "csv-too-few-columns",
"csv-too-many-columns",
"csv-inconsistent-header",
):
with warns(UserWarning, match="Inconsistent column lengths"):
conv_result: ConversionResult = converter.convert(csv_path)
else:
conv_result: ConversionResult = converter.convert(csv_path)
doc: DoclingDocument = conv_result.document doc: DoclingDocument = conv_result.document

View File

@ -38,17 +38,15 @@ def get_converter():
def test_compare_legacy_output(test_doc_paths): def test_compare_legacy_output(test_doc_paths):
converter = get_converter() converter = get_converter()
res = converter.convert_all(test_doc_paths, raises_on_error=True) res = converter.convert_all(test_doc_paths, raises_on_error=True)
for conv_res in res: for conv_res in res:
print(f"Results for {conv_res.input.file}") print(f"Results for {conv_res.input.file}")
print( with pytest.warns(DeprecationWarning, match="Use document instead"):
json.dumps( print(
conv_res.legacy_document.model_dump( json.dumps(
mode="json", by_alias=True, exclude_none=True conv_res.legacy_document.model_dump(
mode="json", by_alias=True, exclude_none=True
)
) )
) )
)
# assert res.legacy_output == res.legacy_output_transformed # assert res.legacy_output == res.legacy_output_transformed

View File

@ -4,6 +4,7 @@ import warnings
from pathlib import Path from pathlib import Path
from typing import List, Optional from typing import List, Optional
import pytest
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItem, DocItem,
DoclingDocument, DoclingDocument,
@ -302,9 +303,8 @@ def verify_conversion_result_v1(
) )
doc_pred_pages: List[Page] = doc_result.pages doc_pred_pages: List[Page] = doc_result.pages
doc_pred: DsDocument = doc_result.legacy_document with pytest.warns(DeprecationWarning, match="Use document instead"):
with warnings.catch_warnings(): doc_pred: DsDocument = doc_result.legacy_document
warnings.simplefilter("ignore", DeprecationWarning)
doc_pred_md = doc_result.legacy_document.export_to_markdown() doc_pred_md = doc_result.legacy_document.export_to_markdown()
doc_pred_dt = doc_result.legacy_document.export_to_document_tokens() doc_pred_dt = doc_result.legacy_document.export_to_document_tokens()
@ -391,7 +391,7 @@ def verify_conversion_result_v2(
doc_pred_pages: List[Page] = doc_result.pages doc_pred_pages: List[Page] = doc_result.pages
doc_pred: DoclingDocument = doc_result.document doc_pred: DoclingDocument = doc_result.document
doc_pred_md = doc_result.document.export_to_markdown() doc_pred_md = doc_result.document.export_to_markdown()
doc_pred_dt = doc_result.document.export_to_document_tokens() doc_pred_dt = doc_result.document.export_to_doctags()
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}" engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"