feat: Establish confidence estimation for document and pages (#1313)

* Establish confidence field, propagate layout confidence through the pipeline

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
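From the diff below, conv_res.confidence holds per-page score containers keyed by page number, and scores are plain floats (ScoreValue). A minimal sketch of that shape; the class names and the extra score fields are assumptions, not facts from this commit:

import numpy as np
from typing import Dict
from pydantic import BaseModel

ScoreValue = float  # assumed alias; the diff imports ScoreValue from docling.datamodel.base_models

class PageConfidenceScores(BaseModel):  # hypothetical class name
    layout_score: ScoreValue = np.nan
    ocr_score: ScoreValue = np.nan
    parse_score: ScoreValue = np.nan  # set per page in the diff below

class ConfidenceReport(PageConfidenceScores):  # hypothetical: document-level plus per-page scores
    pages: Dict[int, PageConfidenceScores] = {}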

* Add OCR confidence and parse confidence (stub)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add parse quality rules, use 5th percentile for overall and parse scores

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
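A toy illustration (values invented) of why a low percentile flags problems that the mean would smooth over:

import numpy as np

scores = [0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]  # two garbage cells among ten
print(np.nanmean(scores))            # 0.8 -- the mean hides the damage
print(np.nanquantile(scores, 0.05))  # 0.0 -- the 5th percentile surfaces it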

* Heuristic updates

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix garbage regex

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Move grade to page

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Introduce mean_score and low_score, consistent aggregate computations

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
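The diff below only shows the per-page parse_score; a consistent mean_score/low_score aggregation could look like the following sketch (the helper name and the exact quantile are assumptions):

import numpy as np

def aggregate_scores(scores: list[float]) -> tuple[float, float]:
    # mean_score summarizes typical quality; low_score is the pessimistic 5th-percentile view
    mean_score = float(np.nanmean(scores))
    low_score = float(np.nanquantile(scores, q=0.05))
    return mean_score, low_score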

* Add confidence test

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
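A sketch of what the added confidence test might assert, assuming docling's public DocumentConverter API; the test file itself is not shown in this section and the input path is hypothetical:

import numpy as np
from docling.document_converter import DocumentConverter

def test_confidence_populated():
    result = DocumentConverter().convert("tests/data/pdf/sample.pdf")  # hypothetical fixture path
    for page_scores in result.confidence.pages.values():
        assert not np.isnan(page_scores.parse_score)
        assert 0.0 <= page_scores.parse_score <= 1.0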

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Christoph Auer
2025-05-21 12:32:49 +02:00
committed by GitHub
parent 14d4f5b109
commit 90875247e5
7 changed files with 199 additions and 8 deletions


@@ -1,11 +1,13 @@
import re
from collections.abc import Iterable
from pathlib import Path
from typing import Optional
import numpy as np
from PIL import ImageDraw
from pydantic import BaseModel
from docling.datamodel.base_models import Page
from docling.datamodel.base_models import Page, ScoreValue
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
@@ -21,6 +23,14 @@ class PagePreprocessingModel(BasePageModel):
def __init__(self, options: PagePreprocessingOptions):
self.options = options
# Pre-compiled regex patterns for efficiency
self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
self.SLASH_NUMBER_GARBAGE_RE = re.compile(
r"(?:/\w+\s*){2,}"
) # Two or more "/token " sequences
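# Illustrative examples (not from the commit) of strings each pattern flags:
#   GLYPH_RE                -> "GLYPH<0041>"       (unmapped glyph tags)
#   SLASH_G_RE              -> "/G12/G34"          (raw glyph-id runs)
#   FRAG_RE                 -> "a/bc.de/fg.hi"     (fragmented-word artifacts)
#   SLASH_NUMBER_GARBAGE_RE -> "/one /two /three"  (leading slash-token garbage)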
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
@@ -60,6 +70,18 @@ class PagePreprocessingModel(BasePageModel):
if self.options.create_parsed_page:
page.parsed_page = page._backend.get_segmented_page()
# Rate the text quality from the PDF parser, and aggregate on page
text_scores = []
for c in page.cells:
score = self.rate_text_quality(c.text)
text_scores.append(score)
conv_res.confidence.pages[page.page_no].parse_score = float(
np.nanquantile(
text_scores, q=0.10
) # To emphasise problems in the parse_score, we take the 10th-percentile score over all text cells.
)
# DEBUG code:
def draw_text_boxes(image, cells, show: bool = False):
draw = ImageDraw.Draw(image)
@@ -88,3 +110,30 @@ class PagePreprocessingModel(BasePageModel):
draw_text_boxes(page.get_image(scale=1.0), page.cells)
return page
def rate_text_quality(self, text: str) -> float:
# Hard errors: if any of these patterns are found, return 0.0 immediately.
blacklist_chars = ["<EFBFBD>"]
if (
any(text.find(c) >= 0 for c in blacklist_chars)
or self.GLYPH_RE.search(text)
or self.SLASH_G_RE.search(text)
or self.SLASH_NUMBER_GARBAGE_RE.match(
text
) # Anchored at the start: text begins with a run of "/token " sequences
):
return 0.0
penalty = 0.0
# Apply a penalty only if the fragmented words pattern occurs at least three times.
frag_matches = self.FRAG_RE.findall(text)
if len(frag_matches) >= 3:
penalty += 0.1 * len(frag_matches)
# Additional heuristic: if the average token length is below 2, add a penalty.
# tokens = text.split()
# if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
# penalty += 0.2
return max(1.0 - penalty, 0.0)
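For illustration, here is how rate_text_quality behaves on a few constructed inputs, following the rules above (the construction assumes PagePreprocessingOptions has usable defaults):

model = PagePreprocessingModel(options=PagePreprocessingOptions())
model.rate_text_quality("GLYPH<0041> GLYPH<0042>")  # 0.0 -- hard error: glyph tags
model.rate_text_quality("/G12/G34/G56")             # 0.0 -- hard error: glyph-id run
model.rate_text_quality("Normal sentence text.")    # 1.0 -- no penalties
# Three fragmented-word matches trigger the 0.1-per-match penalty:
model.rate_text_quality("a/bc.de/fg.hi b/cd.ef/gh.ij c/de.fg/hi.jk")  # 0.7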