# Docling/docling/models/page_preprocessing_model.py

import re
import warnings
from collections.abc import Iterable
from pathlib import Path
from typing import Optional

import numpy as np
from PIL import ImageDraw
from pydantic import BaseModel

from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.profiling import TimeRecorder

class PagePreprocessingOptions(BaseModel):
    images_scale: Optional[float]
    create_parsed_page: bool

class PagePreprocessingModel(BasePageModel):
    def __init__(self, options: PagePreprocessingOptions):
        self.options = options

        # Pre-compiled regex patterns for efficiency
        self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
        self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
        self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
        self.SLASH_NUMBER_GARBAGE_RE = re.compile(
            r"(?:/\w+\s*){2,}"
        )  # Two or more "/token " sequences
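        # Illustrative examples (added annotation, not in the original source):
        # strings that each pattern above would flag, per the regexes as written.
        #   GLYPH_RE                -> "GLYPH<0041>"        (unmapped glyph IDs)
        #   SLASH_G_RE              -> "/G12/G34"           (two or more /G<digits> codes)
        #   FRAG_RE                 -> "a/bc.de/fg.hi"      (fragmented-word debris)
        #   SLASH_NUMBER_GARBAGE_RE -> "/uni0041 /uni0042"  (two or more /token runs)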

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "page_parse"):
                    page = self._populate_page_images(page)
                    page = self._parse_page_cells(conv_res, page)
                yield page

    # Generate the page image and store it in the page object
    def _populate_page_images(self, page: Page) -> Page:
        # default scale
        page.get_image(
            scale=1.0
        )  # puts the page image on the image cache at default scale

        images_scale = self.options.images_scale
        # user requested scales
        if images_scale is not None:
            page._default_image_scale = images_scale
            page.get_image(
                scale=images_scale
            )  # this will trigger storing the image in the internal cache

        return page

    # Extract and populate the page cells and store them in the page object
    def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
        assert page._backend is not None
        page.cells = list(page._backend.get_text_cells())
        if self.options.create_parsed_page:
            page.parsed_page = page._backend.get_segmented_page()

        # Rate the text quality from the PDF parser, and aggregate on page
        text_scores = []
        for c in page.cells:
            score = self.rate_text_quality(c.text)
            text_scores.append(score)

        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore", "Mean of empty slice", RuntimeWarning, "numpy"
            )
            conv_res.confidence.pages[page.page_no].parse_score = float(
                np.nanquantile(
                    text_scores, q=0.10
                )  # To emphasise problems in the parse_score, we take the 10th percentile score of all text cells.
            )
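        # Worked example (added annotation, not in the original source): for
        # per-cell scores [0.0, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
        # the mean would be 0.87, but np.nanquantile(..., q=0.10) gives 0.63,
        # so a handful of badly parsed cells pulls the page score down hard.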

        # DEBUG code:
        def draw_text_boxes(image, cells, show: bool = False):
            draw = ImageDraw.Draw(image)
            for c in cells:
                x0, y0, x1, y1 = (
                    c.to_bounding_box().l,
                    c.to_bounding_box().t,
                    c.to_bounding_box().r,
                    c.to_bounding_box().b,
                )
                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
            if show:
                image.show()
            else:
                out_path: Path = (
                    Path(settings.debug.debug_output_path)
                    / f"debug_{conv_res.input.file.stem}"
                )
                out_path.mkdir(parents=True, exist_ok=True)

                out_file = out_path / f"cells_page_{page.page_no:05}.png"
                image.save(str(out_file), format="png")

        if settings.debug.visualize_cells:
            draw_text_boxes(page.get_image(scale=1.0), page.cells)

        return page

    def rate_text_quality(self, text: str) -> float:
        # Hard errors: if any of these patterns are found, return 0.0 immediately.
        blacklist_chars = ["\ufffd"]  # U+FFFD Unicode replacement character
        if (
            any(text.find(c) >= 0 for c in blacklist_chars)
            or self.GLYPH_RE.search(text)
            or self.SLASH_G_RE.search(text)
            or self.SLASH_NUMBER_GARBAGE_RE.match(
                text
            )  # Check if text starts with a slash-number garbage pattern
        ):
            return 0.0

        penalty = 0.0

        # Apply a penalty only if the fragmented-words pattern occurs at least three times.
        frag_matches = self.FRAG_RE.findall(text)
        if len(frag_matches) >= 3:
            penalty += 0.1 * len(frag_matches)

        # Additional heuristic: if the average token length is below 2, add a penalty.
        # tokens = text.split()
        # if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
        #     penalty += 0.2

        return max(1.0 - penalty, 0.0)
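

# Illustrative usage sketch (added; not part of the original module). It
# exercises rate_text_quality on sample strings; the option values below are
# placeholders chosen only so the model can be constructed.
if __name__ == "__main__":
    model = PagePreprocessingModel(
        PagePreprocessingOptions(images_scale=None, create_parsed_page=False)
    )
    print(model.rate_text_quality("The quick brown fox"))  # 1.0: clean text
    print(model.rate_text_quality("GLYPH<0041>GLYPH<0042>"))  # 0.0: glyph IDs
    print(model.rate_text_quality("/G12/G34/G56"))  # 0.0: /G-code garbage
    # Three fragmented-word matches -> penalty 3 * 0.1 = 0.3 -> score 0.7
    print(model.rate_text_quality("a/bc.de/fg.hi b/cd.ef/gh.ij c/de.fg/hi.jk"))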