feat: Establish confidence estimation for document and pages (#1313)

* Establish confidence field, propagate layout confidence through

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add OCR confidence and parse confidence (stub)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add parse quality rules, use 5th percentile for overall and parse scores

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Heuristic updates

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix garbage regex

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Move grade to page

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Introduce mean_score and low_score, consistent aggregate computations

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add confidence test

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-05-21 12:32:49 +02:00 committed by GitHub
parent 14d4f5b109
commit 90875247e5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 199 additions and 8 deletions

View File

@ -1,6 +1,9 @@
import math
from collections import defaultdict
from enum import Enum from enum import Enum
from typing import TYPE_CHECKING, Dict, List, Optional, Union from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
import numpy as np
from docling_core.types.doc import ( from docling_core.types.doc import (
BoundingBox, BoundingBox,
DocItemLabel, DocItemLabel,
@ -16,7 +19,7 @@ from docling_core.types.io import (
DocumentStream, DocumentStream,
) )
from PIL.Image import Image from PIL.Image import Image
from pydantic import BaseModel, ConfigDict from pydantic import BaseModel, ConfigDict, Field, computed_field
if TYPE_CHECKING: if TYPE_CHECKING:
from docling.backend.pdf_backend import PdfPageBackend from docling.backend.pdf_backend import PdfPageBackend
@ -298,3 +301,97 @@ class OpenAiApiResponse(BaseModel):
choices: List[OpenAiResponseChoice] choices: List[OpenAiResponseChoice]
created: int created: int
usage: OpenAiResponseUsage usage: OpenAiResponseUsage
# Create a type alias for score values
ScoreValue = float


class QualityGrade(str, Enum):
    """Discrete quality label derived from a numeric confidence score."""

    POOR = "poor"
    FAIR = "fair"
    GOOD = "good"
    EXCELLENT = "excellent"
    # Used when no score is available (e.g. the score is NaN).
    UNSPECIFIED = "unspecified"
class PageConfidenceScores(BaseModel):
    """Per-page confidence scores for the individual conversion stages.

    Scores are in [0, 1]; a field stays NaN until the corresponding
    pipeline stage fills it in.
    """

    parse_score: ScoreValue = np.nan  # text quality from the PDF parser
    layout_score: ScoreValue = np.nan  # layout-model cluster confidence
    table_score: ScoreValue = np.nan  # table-structure confidence
    ocr_score: ScoreValue = np.nan  # OCR cell confidence

    def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
        """Map a score to a grade; NaN maps to UNSPECIFIED.

        NaN fails every comparison, so it is checked explicitly instead of
        relying on fall-through past the threshold chain.
        """
        if math.isnan(score):
            return QualityGrade.UNSPECIFIED
        if score < 0.5:
            return QualityGrade.POOR
        if score < 0.8:
            return QualityGrade.FAIR
        if score < 0.9:
            return QualityGrade.GOOD
        return QualityGrade.EXCELLENT

    @computed_field  # type: ignore
    @property
    def mean_grade(self) -> QualityGrade:
        """Grade corresponding to the mean of the available scores."""
        return self._score_to_grade(self.mean_score)

    @computed_field  # type: ignore
    @property
    def low_grade(self) -> QualityGrade:
        """Grade corresponding to the pessimistic (low) score."""
        return self._score_to_grade(self.low_score)

    @computed_field  # type: ignore
    @property
    def mean_score(self) -> ScoreValue:
        """NaN-aware mean of the four stage scores (NaN if all are NaN)."""
        return ScoreValue(
            np.nanmean(
                [
                    self.ocr_score,
                    self.table_score,
                    self.layout_score,
                    self.parse_score,
                ]
            )
        )

    @computed_field  # type: ignore
    @property
    def low_score(self) -> ScoreValue:
        """5th-percentile of the stage scores, emphasising the worst stage."""
        return ScoreValue(
            np.nanquantile(
                [
                    self.ocr_score,
                    self.table_score,
                    self.layout_score,
                    self.parse_score,
                ],
                q=0.05,
            )
        )
class ConfidenceReport(PageConfidenceScores):
    """Document-level confidence: per-page scores plus aggregate fields.

    Inherits the four stage-score fields, which the pipeline fills with
    document-level aggregates; the computed aggregates below are derived
    from the per-page entries instead.
    """

    # Maps page number -> scores; defaultdict so stages can assign into
    # a page entry without creating it first.
    pages: Dict[int, PageConfidenceScores] = Field(
        default_factory=lambda: defaultdict(PageConfidenceScores)
    )

    @computed_field  # type: ignore
    @property
    def mean_score(self) -> ScoreValue:
        """NaN-aware mean of the per-page mean scores."""
        return ScoreValue(
            np.nanmean(
                [c.mean_score for c in self.pages.values()],
            )
        )

    @computed_field  # type: ignore
    @property
    def low_score(self) -> ScoreValue:
        """NaN-aware mean of the per-page low scores.

        NOTE(review): this aggregates with a mean, unlike the page-level
        low_score which uses the 5% quantile — confirm this is intended.
        """
        return ScoreValue(
            np.nanmean(
                [c.low_score for c in self.pages.values()],
            )
        )

View File

@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import (
) )
from docling_core.utils.file import resolve_source_to_stream from docling_core.utils.file import resolve_source_to_stream
from docling_core.utils.legacy import docling_document_to_legacy from docling_core.utils.legacy import docling_document_to_legacy
from pydantic import BaseModel from pydantic import BaseModel, Field
from typing_extensions import deprecated from typing_extensions import deprecated
from docling.backend.abstract_backend import ( from docling.backend.abstract_backend import (
@ -56,6 +56,7 @@ from docling.backend.abstract_backend import (
) )
from docling.datamodel.base_models import ( from docling.datamodel.base_models import (
AssembledUnit, AssembledUnit,
ConfidenceReport,
ConversionStatus, ConversionStatus,
DocumentStream, DocumentStream,
ErrorItem, ErrorItem,
@ -201,6 +202,7 @@ class ConversionResult(BaseModel):
pages: List[Page] = [] pages: List[Page] = []
assembled: AssembledUnit = AssembledUnit() assembled: AssembledUnit = AssembledUnit()
timings: Dict[str, ProfilingItem] = {} timings: Dict[str, ProfilingItem] = {}
confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
document: DoclingDocument = _EMPTY_DOCLING_DOC document: DoclingDocument = _EMPTY_DOCLING_DOC

View File

@ -5,6 +5,7 @@ from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
import numpy as np
from docling_core.types.doc import DocItemLabel from docling_core.types.doc import DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import Image from PIL import Image
@ -184,6 +185,14 @@ class LayoutModel(BasePageModel):
).postprocess() ).postprocess()
# processed_clusters, processed_cells = clusters, page.cells # processed_clusters, processed_cells = clusters, page.cells
conv_res.confidence.pages[page.page_no].layout_score = float(
np.mean([c.confidence for c in processed_clusters])
)
conv_res.confidence.pages[page.page_no].ocr_score = float(
np.mean([c.confidence for c in processed_cells if c.from_ocr])
)
page.cells = processed_cells page.cells = processed_cells
page.predictions.layout = LayoutPrediction( page.predictions.layout = LayoutPrediction(
clusters=processed_clusters clusters=processed_clusters

View File

@ -3,6 +3,7 @@ import re
from collections.abc import Iterable from collections.abc import Iterable
from typing import List from typing import List
import numpy as np
from pydantic import BaseModel from pydantic import BaseModel
from docling.datamodel.base_models import ( from docling.datamodel.base_models import (

View File

@ -1,11 +1,13 @@
import re
from collections.abc import Iterable from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
import numpy as np
from PIL import ImageDraw from PIL import ImageDraw
from pydantic import BaseModel from pydantic import BaseModel
from docling.datamodel.base_models import Page from docling.datamodel.base_models import Page, ScoreValue
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel from docling.models.base_model import BasePageModel
@ -21,6 +23,14 @@ class PagePreprocessingModel(BasePageModel):
def __init__(self, options: PagePreprocessingOptions) -> None:
    """Store the options and pre-compile the garbage-text regexes.

    The patterns are compiled once here because rate_text_quality is
    called per text cell on every page.
    """
    self.options = options

    # Pre-compiled regex patterns for efficiency
    # Parser glyph placeholders, e.g. "GLYPH<0041>".
    self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
    # Runs of "/G<digits>" tokens (unmapped glyph names).
    self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
    # Fragmented-word artifacts like "W/or.d/li.ke" sequences.
    self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
    self.SLASH_NUMBER_GARBAGE_RE = re.compile(
        r"(?:/\w+\s*){2,}"
    )  # Two or more "/token " sequences
def __call__( def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page] self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]: ) -> Iterable[Page]:
@ -60,6 +70,18 @@ class PagePreprocessingModel(BasePageModel):
if self.options.create_parsed_page: if self.options.create_parsed_page:
page.parsed_page = page._backend.get_segmented_page() page.parsed_page = page._backend.get_segmented_page()
# Rate the text quality from the PDF parser, and aggregate on page
text_scores = []
for c in page.cells:
score = self.rate_text_quality(c.text)
text_scores.append(score)
conv_res.confidence.pages[page.page_no].parse_score = float(
np.nanquantile(
text_scores, q=0.10
) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
)
# DEBUG code: # DEBUG code:
def draw_text_boxes(image, cells, show: bool = False): def draw_text_boxes(image, cells, show: bool = False):
draw = ImageDraw.Draw(image) draw = ImageDraw.Draw(image)
@ -88,3 +110,30 @@ class PagePreprocessingModel(BasePageModel):
draw_text_boxes(page.get_image(scale=1.0), page.cells) draw_text_boxes(page.get_image(scale=1.0), page.cells)
return page return page
def rate_text_quality(self, text: str) -> float:
    """Heuristically rate parsed text quality on a 0.0-1.0 scale.

    Returns 0.0 immediately on hard extraction errors (replacement
    characters, glyph placeholders, slash-token garbage), otherwise
    1.0 minus a penalty for fragmented-word artifacts.
    """
    # Hard errors: if any of these patterns are found, return 0.0 immediately.
    # "\ufffd" is the Unicode replacement character emitted for
    # undecodable bytes (renders as <EFBFBD> in UTF-8 hex).
    blacklist_chars = ["\ufffd"]
    if (
        any(c in text for c in blacklist_chars)
        or self.GLYPH_RE.search(text)
        or self.SLASH_G_RE.search(text)
        # match() anchors at the start: flags text that *begins* with
        # a run of "/token " sequences.
        or self.SLASH_NUMBER_GARBAGE_RE.match(text)
    ):
        return 0.0

    penalty = 0.0

    # Apply a penalty only if the fragmented words pattern occurs at
    # least three times; each occurrence costs 0.1.
    frag_matches = self.FRAG_RE.findall(text)
    if len(frag_matches) >= 3:
        penalty += 0.1 * len(frag_matches)

    # Clamp so heavy fragmentation cannot drive the score below zero.
    return max(1.0 - penalty, 0.0)

View File

@ -3,11 +3,12 @@ import warnings
from pathlib import Path from pathlib import Path
from typing import Optional, cast from typing import Optional, cast
import numpy as np
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, Page from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
@ -60,7 +61,7 @@ class StandardPdfPipeline(PaginatedPipeline):
or self.pipeline_options.generate_table_images or self.pipeline_options.generate_table_images
) )
self.glm_model = ReadingOrderModel(options=ReadingOrderOptions()) self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
ocr_model = self.get_ocr_model(artifacts_path=artifacts_path) ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
@ -197,7 +198,7 @@ class StandardPdfPipeline(PaginatedPipeline):
elements=all_elements, headers=all_headers, body=all_body elements=all_elements, headers=all_headers, body=all_body
) )
conv_res.document = self.glm_model(conv_res) conv_res.document = self.reading_order_model(conv_res)
# Generate page images in the output # Generate page images in the output
if self.pipeline_options.generate_page_images: if self.pipeline_options.generate_page_images:
@ -244,6 +245,30 @@ class StandardPdfPipeline(PaginatedPipeline):
cropped_im, dpi=int(72 * scale) cropped_im, dpi=int(72 * scale)
) )
# Aggregate confidence values for document:
if len(conv_res.pages) > 0:
conv_res.confidence.layout_score = float(
np.nanmean(
[c.layout_score for c in conv_res.confidence.pages.values()]
)
)
conv_res.confidence.parse_score = float(
np.nanquantile(
[c.parse_score for c in conv_res.confidence.pages.values()],
q=0.1, # parse score should relate to worst 10% of pages.
)
)
conv_res.confidence.table_score = float(
np.nanmean(
[c.table_score for c in conv_res.confidence.pages.values()]
)
)
conv_res.confidence.ocr_score = float(
np.nanmean(
[c.ocr_score for c in conv_res.confidence.pages.values()]
)
)
return conv_res return conv_res
@classmethod @classmethod

View File

@ -7,7 +7,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.base_models import ConversionStatus, InputFormat, QualityGrade
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
AcceleratorDevice, AcceleratorDevice,
@ -163,3 +163,11 @@ def test_parser_backends(test_doc_path):
doc_result: ConversionResult = converter.convert(test_doc_path) doc_result: ConversionResult = converter.convert(test_doc_path)
assert doc_result.status == ConversionStatus.SUCCESS assert doc_result.status == ConversionStatus.SUCCESS
def test_confidence(test_doc_path):
    """A known-good document converts with excellent confidence grades."""
    converter = DocumentConverter()
    result: ConversionResult = converter.convert(test_doc_path, page_range=(6, 9))

    confidence = result.confidence
    assert confidence.mean_grade == QualityGrade.EXCELLENT
    assert confidence.low_grade == QualityGrade.EXCELLENT