Docling/docling/models/page_preprocessing_model.py
Christoph Auer 90875247e5
feat: Establish confidence estimation for document and pages (#1313)
* Establish confidence field, propagate layout confidence through

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add OCR confidence and parse confidence (stub)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add parse quality rules, use 5% percentile for overall and parse scores

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Heuristic updates

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix garbage regex

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Move grade to page

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Introduce mean_score and low_score, consistent aggregate computations

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add confidence test

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-05-21 12:32:49 +02:00

140 lines
4.9 KiB
Python
Raw Blame History

import re
from collections.abc import Iterable
from pathlib import Path
from typing import Optional
import numpy as np
from PIL import ImageDraw
from pydantic import BaseModel
from docling.datamodel.base_models import Page, ScoreValue
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.profiling import TimeRecorder
class PagePreprocessingOptions(BaseModel):
images_scale: Optional[float]
create_parsed_page: bool
class PagePreprocessingModel(BasePageModel):
def __init__(self, options: PagePreprocessingOptions):
self.options = options
# Pre-compiled regex patterns for efficiency
self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
self.SLASH_NUMBER_GARBAGE_RE = re.compile(
r"(?:/\w+\s*){2,}"
) # Two or more "/token " sequences
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
for page in page_batch:
assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
with TimeRecorder(conv_res, "page_parse"):
page = self._populate_page_images(page)
page = self._parse_page_cells(conv_res, page)
yield page
# Generate the page image and store it in the page object
def _populate_page_images(self, page: Page) -> Page:
# default scale
page.get_image(
scale=1.0
) # puts the page image on the image cache at default scale
images_scale = self.options.images_scale
# user requested scales
if images_scale is not None:
page._default_image_scale = images_scale
page.get_image(
scale=images_scale
) # this will trigger storing the image in the internal cache
return page
# Extract and populate the page cells and store it in the page object
def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
assert page._backend is not None
page.cells = list(page._backend.get_text_cells())
if self.options.create_parsed_page:
page.parsed_page = page._backend.get_segmented_page()
# Rate the text quality from the PDF parser, and aggregate on page
text_scores = []
for c in page.cells:
score = self.rate_text_quality(c.text)
text_scores.append(score)
conv_res.confidence.pages[page.page_no].parse_score = float(
np.nanquantile(
text_scores, q=0.10
) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
)
# DEBUG code:
def draw_text_boxes(image, cells, show: bool = False):
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = (
c.to_bounding_box().l,
c.to_bounding_box().t,
c.to_bounding_box().r,
c.to_bounding_box().b,
)
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
if show:
image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)
out_file = out_path / f"cells_page_{page.page_no:05}.png"
image.save(str(out_file), format="png")
if settings.debug.visualize_cells:
draw_text_boxes(page.get_image(scale=1.0), page.cells)
return page
def rate_text_quality(self, text: str) -> float:
# Hard errors: if any of these patterns are found, return 0.0 immediately.
blacklist_chars = ["<EFBFBD>"]
if (
any(text.find(c) >= 0 for c in blacklist_chars)
or self.GLYPH_RE.search(text)
or self.SLASH_G_RE.search(text)
or self.SLASH_NUMBER_GARBAGE_RE.match(
text
) # Check if text is mostly slash-number pattern
):
return 0.0
penalty = 0.0
# Apply a penalty only if the fragmented words pattern occurs at least three times.
frag_matches = self.FRAG_RE.findall(text)
if len(frag_matches) >= 3:
penalty += 0.1 * len(frag_matches)
# Additional heuristic: if the average token length is below 2, add a penalty.
# tokens = text.split()
# if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
# penalty += 0.2
return max(1.0 - penalty, 0.0)