# Type alias for confidence score values; NaN marks a score that was never set.
ScoreValue = float


class QualityGrade(str, Enum):
    """Coarse quality bucket that a numeric confidence score is mapped to."""

    POOR = "poor"
    FAIR = "fair"
    GOOD = "good"
    EXCELLENT = "excellent"
    UNSPECIFIED = "unspecified"


def _nan_aware(reducer, scores: List[ScoreValue], **kwargs) -> ScoreValue:
    """Apply a NaN-ignoring numpy reducer, yielding NaN when no score is set.

    Args:
        reducer: A numpy NaN-ignoring reduction such as ``np.nanmean`` or
            ``np.nanquantile``.
        scores: Score values, possibly empty or containing NaN.
        **kwargs: Extra keyword arguments forwarded to ``reducer``.

    Returns:
        The reduced value as a ``ScoreValue``; NaN if ``scores`` is empty or
        holds only NaN.
    """
    values = np.asarray(scores, dtype=float)
    # numpy raises a RuntimeWarning for all-NaN input, which is the default
    # state of every score here; short-circuit so that state stays silent.
    if values.size == 0 or np.isnan(values).all():
        return ScoreValue(np.nan)
    return ScoreValue(reducer(values, **kwargs))


class PageConfidenceScores(BaseModel):
    """Confidence scores for a single page, with derived summary values."""

    # Every score defaults to NaN until something assigns it.
    parse_score: ScoreValue = np.nan
    layout_score: ScoreValue = np.nan
    table_score: ScoreValue = np.nan
    ocr_score: ScoreValue = np.nan

    def _component_scores(self) -> List[ScoreValue]:
        """Return the four individual scores that feed the summaries."""
        return [
            self.ocr_score,
            self.table_score,
            self.layout_score,
            self.parse_score,
        ]

    def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
        """Map a numeric score to a grade.

        Thresholds: below 0.5 is POOR, below 0.8 is FAIR, below 0.9 is GOOD,
        anything else is EXCELLENT. NaN maps to UNSPECIFIED.
        """
        if math.isnan(score):
            return QualityGrade.UNSPECIFIED
        if score < 0.5:
            return QualityGrade.POOR
        if score < 0.8:
            return QualityGrade.FAIR
        if score < 0.9:
            return QualityGrade.GOOD
        return QualityGrade.EXCELLENT

    @computed_field  # type: ignore
    @property
    def mean_grade(self) -> QualityGrade:
        """Grade corresponding to ``mean_score``."""
        return self._score_to_grade(self.mean_score)

    @computed_field  # type: ignore
    @property
    def low_grade(self) -> QualityGrade:
        """Grade corresponding to ``low_score``."""
        return self._score_to_grade(self.low_score)

    @computed_field  # type: ignore
    @property
    def mean_score(self) -> ScoreValue:
        """Mean of the four scores, ignoring NaN entries."""
        return _nan_aware(np.nanmean, self._component_scores())

    @computed_field  # type: ignore
    @property
    def low_score(self) -> ScoreValue:
        """5% quantile of the four scores, ignoring NaN entries."""
        return _nan_aware(np.nanquantile, self._component_scores(), q=0.05)


class ConfidenceReport(PageConfidenceScores):
    """Confidence scores for a whole document plus a per-page breakdown.

    ``mean_score`` and ``low_score`` are derived from ``pages`` only; the
    inherited ``*_score`` fields on the report itself do not feed into them.
    """

    # The default is a defaultdict, so indexing a missing key creates an
    # all-NaN PageConfidenceScores entry for it.
    pages: Dict[int, PageConfidenceScores] = Field(
        default_factory=lambda: defaultdict(PageConfidenceScores)
    )

    @computed_field  # type: ignore
    @property
    def mean_score(self) -> ScoreValue:
        """Mean of the per-page ``mean_score`` values, ignoring NaN pages."""
        return _nan_aware(
            np.nanmean, [page.mean_score for page in self.pages.values()]
        )

    @computed_field  # type: ignore
    @property
    def low_score(self) -> ScoreValue:
        """Mean of the per-page ``low_score`` values, ignoring NaN pages."""
        # NOTE(review): this averages the per-page low scores rather than
        # taking a low quantile across pages - confirm that is intended.
        return _nan_aware(
            np.nanmean, [page.low_score for page in self.pages.values()]
        )
b/docling/models/layout_model.py @@ -5,6 +5,7 @@ from collections.abc import Iterable from pathlib import Path from typing import Optional +import numpy as np from docling_core.types.doc import DocItemLabel from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor from PIL import Image @@ -184,6 +185,14 @@ class LayoutModel(BasePageModel): ).postprocess() # processed_clusters, processed_cells = clusters, page.cells + conv_res.confidence.pages[page.page_no].layout_score = float( + np.mean([c.confidence for c in processed_clusters]) + ) + + conv_res.confidence.pages[page.page_no].ocr_score = float( + np.mean([c.confidence for c in processed_cells if c.from_ocr]) + ) + page.cells = processed_cells page.predictions.layout = LayoutPrediction( clusters=processed_clusters diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index 7153181..bc5589e 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -3,6 +3,7 @@ import re from collections.abc import Iterable from typing import List +import numpy as np from pydantic import BaseModel from docling.datamodel.base_models import ( diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py index b45b189..6a1dcf1 100644 --- a/docling/models/page_preprocessing_model.py +++ b/docling/models/page_preprocessing_model.py @@ -1,11 +1,13 @@ +import re from collections.abc import Iterable from pathlib import Path from typing import Optional +import numpy as np from PIL import ImageDraw from pydantic import BaseModel -from docling.datamodel.base_models import Page +from docling.datamodel.base_models import Page, ScoreValue from docling.datamodel.document import ConversionResult from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel @@ -21,6 +23,14 @@ class PagePreprocessingModel(BasePageModel): def __init__(self, options: PagePreprocessingOptions): 
def rate_text_quality(self, text: str) -> float:
    """Score how clean a parsed text cell looks, between 0.0 and 1.0.

    Args:
        text: Text content of a single parsed cell.

    Returns:
        0.0 if the text contains a blacklisted character, a ``GLYPH<hex>``
        placeholder, a run of two or more ``/G<digits>`` tokens, or starts
        with two or more ``/token`` sequences. Otherwise 1.0 minus 0.1 per
        fragmented-word match (applied only from three matches upwards),
        never below 0.0.
    """
    # Hard errors: any single hit makes the cell score 0.0 immediately.
    blacklist_chars = ("�",)
    if (
        any(ch in text for ch in blacklist_chars)
        or self.GLYPH_RE.search(text)
        or self.SLASH_G_RE.search(text)
        # ``match`` anchors at the start only: this fires when the text
        # *begins* with slash-token garbage, not when it is "mostly" garbage.
        or self.SLASH_NUMBER_GARBAGE_RE.match(text)
    ):
        return 0.0

    # Soft error: up to two fragmented-word matches are tolerated; from the
    # third on, every match (including the first two) costs 0.1.
    frag_count = len(self.FRAG_RE.findall(text))
    penalty = 0.1 * frag_count if frag_count >= 3 else 0.0

    return max(1.0 - penalty, 0.0)
def test_confidence(test_doc_path):
    """Convert the test document with page_range=(6, 9) and check its grades.

    Both the mean and the low confidence grade of the resulting report are
    expected to be EXCELLENT.
    """
    result: ConversionResult = DocumentConverter().convert(
        test_doc_path, page_range=(6, 9)
    )
    report = result.confidence

    assert report.mean_grade == QualityGrade.EXCELLENT
    assert report.low_grade == QualityGrade.EXCELLENT