feat: Establish confidence estimation for document and pages (#1313)
* Establish confidence field, propagate layout confidence through
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add OCR confidence and parse confidence (stub)
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add parse quality rules, use 5% percentile for overall and parse scores
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Heuristic updates
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fix garbage regex
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Move grade to page
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Introduce mean_score and low_score, consistent aggregate computations
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add confidence test
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---------
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
14d4f5b109
commit
90875247e5
@ -1,6 +1,9 @@
|
|||||||
|
import math
|
||||||
|
from collections import defaultdict
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
BoundingBox,
|
BoundingBox,
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
@ -16,7 +19,7 @@ from docling_core.types.io import (
|
|||||||
DocumentStream,
|
DocumentStream,
|
||||||
)
|
)
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
from pydantic import BaseModel, ConfigDict
|
from pydantic import BaseModel, ConfigDict, Field, computed_field
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from docling.backend.pdf_backend import PdfPageBackend
|
from docling.backend.pdf_backend import PdfPageBackend
|
||||||
@ -298,3 +301,97 @@ class OpenAiApiResponse(BaseModel):
|
|||||||
choices: List[OpenAiResponseChoice]
|
choices: List[OpenAiResponseChoice]
|
||||||
created: int
|
created: int
|
||||||
usage: OpenAiResponseUsage
|
usage: OpenAiResponseUsage
|
||||||
|
|
||||||
|
|
||||||
|
# Type alias for confidence score values. Scores are plain floats; NaN is used
# by the models below as the "not computed" marker.
ScoreValue = float
|
||||||
|
|
||||||
|
|
||||||
|
class QualityGrade(str, Enum):
    """Coarse, human-readable quality bucket derived from a confidence score.

    The members are strings, so they compare equal to (and serialize as)
    their lowercase value.
    """

    POOR = "poor"
    FAIR = "fair"
    GOOD = "good"
    EXCELLENT = "excellent"
    # Used when no grade can be derived from the score.
    UNSPECIFIED = "unspecified"
|
||||||
|
|
||||||
|
|
||||||
|
class PageConfidenceScores(BaseModel):
    """Confidence scores of a single page, one per processing aspect.

    Every score defaults to NaN, which marks it as "not computed". NaN
    entries are left out of the aggregated ``mean_score`` / ``low_score``;
    if no score is available at all, the aggregates are NaN and the
    corresponding grade is ``QualityGrade.UNSPECIFIED``.
    """

    parse_score: ScoreValue = np.nan
    layout_score: ScoreValue = np.nan
    table_score: ScoreValue = np.nan
    ocr_score: ScoreValue = np.nan

    def _known_scores(self) -> List[ScoreValue]:
        """Return the component scores that have been set (i.e. are not NaN)."""
        return [
            score
            for score in (
                self.ocr_score,
                self.table_score,
                self.layout_score,
                self.parse_score,
            )
            if not np.isnan(score)
        ]

    def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
        """Map a numeric score onto a quality grade.

        Thresholds: below 0.5 is POOR, below 0.8 is FAIR, below 0.9 is GOOD,
        0.9 and above is EXCELLENT. A NaN score yields UNSPECIFIED.
        """
        # NaN compares False against every threshold, so handle it explicitly
        # rather than relying on falling through all branches.
        if np.isnan(score):
            return QualityGrade.UNSPECIFIED
        if score < 0.5:
            return QualityGrade.POOR
        if score < 0.8:
            return QualityGrade.FAIR
        if score < 0.9:
            return QualityGrade.GOOD
        return QualityGrade.EXCELLENT

    @computed_field  # type: ignore
    @property
    def mean_grade(self) -> QualityGrade:
        """Grade corresponding to ``mean_score``."""
        return self._score_to_grade(self.mean_score)

    @computed_field  # type: ignore
    @property
    def low_grade(self) -> QualityGrade:
        """Grade corresponding to ``low_score``."""
        return self._score_to_grade(self.low_score)

    @computed_field  # type: ignore
    @property
    def mean_score(self) -> ScoreValue:
        """Mean of the available component scores, or NaN if none is set."""
        known = self._known_scores()
        if not known:
            # np.nanmean on an all-NaN input would return NaN as well, but
            # only after emitting a RuntimeWarning ("Mean of empty slice").
            return ScoreValue(np.nan)
        return ScoreValue(np.nanmean(known))

    @computed_field  # type: ignore
    @property
    def low_score(self) -> ScoreValue:
        """5% quantile of the available component scores, or NaN if none is set."""
        known = self._known_scores()
        if not known:
            # Avoids the "All-NaN slice encountered" RuntimeWarning that
            # np.nanquantile raises for an all-NaN input.
            return ScoreValue(np.nan)
        return ScoreValue(np.nanquantile(known, q=0.05))
|
||||||
|
|
||||||
|
|
||||||
|
class ConfidenceReport(PageConfidenceScores):
    """Document-level confidence report, holding the per-page scores as well.

    The inherited score fields carry the document-level values, while
    ``mean_score`` and ``low_score`` are overridden to aggregate over
    ``pages`` instead of over the document-level fields.
    """

    # Page number -> scores of that page. The default is a defaultdict, so
    # accessing an unseen page number creates an empty score entry for it.
    pages: Dict[int, PageConfidenceScores] = Field(
        default_factory=lambda: defaultdict(PageConfidenceScores)
    )

    @computed_field  # type: ignore
    @property
    def mean_score(self) -> ScoreValue:
        """Mean of the per-page mean scores, ignoring pages without a score.

        Returns NaN when there are no pages or no page has a score.
        """
        known = [
            score
            for score in (c.mean_score for c in self.pages.values())
            if not np.isnan(score)
        ]
        if not known:
            # np.nanmean would also give NaN here, but with a RuntimeWarning.
            return ScoreValue(np.nan)
        return ScoreValue(np.nanmean(known))

    @computed_field  # type: ignore
    @property
    def low_score(self) -> ScoreValue:
        """Mean of the per-page low scores, ignoring pages without a score.

        Returns NaN when there are no pages or no page has a score.
        """
        known = [
            score
            for score in (c.low_score for c in self.pages.values())
            if not np.isnan(score)
        ]
        if not known:
            # np.nanmean would also give NaN here, but with a RuntimeWarning.
            return ScoreValue(np.nan)
        return ScoreValue(np.nanmean(known))
|
||||||
|
@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import (
|
|||||||
)
|
)
|
||||||
from docling_core.utils.file import resolve_source_to_stream
|
from docling_core.utils.file import resolve_source_to_stream
|
||||||
from docling_core.utils.legacy import docling_document_to_legacy
|
from docling_core.utils.legacy import docling_document_to_legacy
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel, Field
|
||||||
from typing_extensions import deprecated
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
from docling.backend.abstract_backend import (
|
from docling.backend.abstract_backend import (
|
||||||
@ -56,6 +56,7 @@ from docling.backend.abstract_backend import (
|
|||||||
)
|
)
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
AssembledUnit,
|
AssembledUnit,
|
||||||
|
ConfidenceReport,
|
||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
DocumentStream,
|
DocumentStream,
|
||||||
ErrorItem,
|
ErrorItem,
|
||||||
@ -201,6 +202,7 @@ class ConversionResult(BaseModel):
|
|||||||
pages: List[Page] = []
|
pages: List[Page] = []
|
||||||
assembled: AssembledUnit = AssembledUnit()
|
assembled: AssembledUnit = AssembledUnit()
|
||||||
timings: Dict[str, ProfilingItem] = {}
|
timings: Dict[str, ProfilingItem] = {}
|
||||||
|
confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
|
||||||
|
|
||||||
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
||||||
|
|
||||||
|
@ -5,6 +5,7 @@ from collections.abc import Iterable
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from docling_core.types.doc import DocItemLabel
|
from docling_core.types.doc import DocItemLabel
|
||||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
@ -184,6 +185,14 @@ class LayoutModel(BasePageModel):
|
|||||||
).postprocess()
|
).postprocess()
|
||||||
# processed_clusters, processed_cells = clusters, page.cells
|
# processed_clusters, processed_cells = clusters, page.cells
|
||||||
|
|
||||||
|
conv_res.confidence.pages[page.page_no].layout_score = float(
|
||||||
|
np.mean([c.confidence for c in processed_clusters])
|
||||||
|
)
|
||||||
|
|
||||||
|
conv_res.confidence.pages[page.page_no].ocr_score = float(
|
||||||
|
np.mean([c.confidence for c in processed_cells if c.from_ocr])
|
||||||
|
)
|
||||||
|
|
||||||
page.cells = processed_cells
|
page.cells = processed_cells
|
||||||
page.predictions.layout = LayoutPrediction(
|
page.predictions.layout = LayoutPrediction(
|
||||||
clusters=processed_clusters
|
clusters=processed_clusters
|
||||||
|
@ -3,6 +3,7 @@ import re
|
|||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
|
@ -1,11 +1,13 @@
|
|||||||
|
import re
|
||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page, ScoreValue
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel
|
||||||
@ -21,6 +23,14 @@ class PagePreprocessingModel(BasePageModel):
|
|||||||
def __init__(self, options: PagePreprocessingOptions):
|
def __init__(self, options: PagePreprocessingOptions):
|
||||||
self.options = options
|
self.options = options
|
||||||
|
|
||||||
|
# Pre-compiled regex patterns for efficiency
|
||||||
|
self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
|
||||||
|
self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
|
||||||
|
self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
|
||||||
|
self.SLASH_NUMBER_GARBAGE_RE = re.compile(
|
||||||
|
r"(?:/\w+\s*){2,}"
|
||||||
|
) # Two or more "/token " sequences
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||||
) -> Iterable[Page]:
|
) -> Iterable[Page]:
|
||||||
@ -60,6 +70,18 @@ class PagePreprocessingModel(BasePageModel):
|
|||||||
if self.options.create_parsed_page:
|
if self.options.create_parsed_page:
|
||||||
page.parsed_page = page._backend.get_segmented_page()
|
page.parsed_page = page._backend.get_segmented_page()
|
||||||
|
|
||||||
|
# Rate the text quality from the PDF parser, and aggregate on page
|
||||||
|
text_scores = []
|
||||||
|
for c in page.cells:
|
||||||
|
score = self.rate_text_quality(c.text)
|
||||||
|
text_scores.append(score)
|
||||||
|
|
||||||
|
conv_res.confidence.pages[page.page_no].parse_score = float(
|
||||||
|
np.nanquantile(
|
||||||
|
text_scores, q=0.10
|
||||||
|
) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
|
||||||
|
)
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
def draw_text_boxes(image, cells, show: bool = False):
|
def draw_text_boxes(image, cells, show: bool = False):
|
||||||
draw = ImageDraw.Draw(image)
|
draw = ImageDraw.Draw(image)
|
||||||
@ -88,3 +110,30 @@ class PagePreprocessingModel(BasePageModel):
|
|||||||
draw_text_boxes(page.get_image(scale=1.0), page.cells)
|
draw_text_boxes(page.get_image(scale=1.0), page.cells)
|
||||||
|
|
||||||
return page
|
return page
|
||||||
|
|
||||||
|
def rate_text_quality(self, text: str) -> float:
    """Rate the quality of a parsed text snippet on a scale from 0.0 to 1.0.

    Args:
        text: The text content to rate.

    Returns:
        0.0 if the text contains a hard error (a blacklisted character, a
        ``GLYPH<..>`` placeholder, a run of ``/G<number>`` tokens, or it
        starts with a run of ``/token`` sequences). Otherwise 1.0 minus a
        penalty of 0.1 per fragmented-word match (applied only when there
        are at least three matches), clamped to a minimum of 0.0.
    """
    # Hard errors: if any of these patterns are found, return 0.0 immediately.
    # U+FFFD is the Unicode replacement character (UTF-8 bytes EF BF BD),
    # which marks text that could not be decoded.
    blacklist_chars = ["\ufffd"]
    if (
        any(c in text for c in blacklist_chars)
        or self.GLYPH_RE.search(text)
        or self.SLASH_G_RE.search(text)
        # match() is anchored at the start of the text, so this only fires
        # when the text begins with the slash-token pattern.
        or self.SLASH_NUMBER_GARBAGE_RE.match(text)
    ):
        return 0.0

    penalty = 0.0

    # Apply a penalty only if the fragmented words pattern occurs at least
    # three times.
    frag_matches = self.FRAG_RE.findall(text)
    if len(frag_matches) >= 3:
        penalty += 0.1 * len(frag_matches)

    # NOTE: an additional heuristic (penalty of 0.2 when the average token
    # length is below 2) is intentionally disabled:
    # tokens = text.split()
    # if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
    #     penalty += 0.2

    return max(1.0 - penalty, 0.0)
|
||||||
|
@ -3,11 +3,12 @@ import warnings
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, cast
|
from typing import Optional, cast
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import AssembledUnit, Page
|
from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
@ -60,7 +61,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
or self.pipeline_options.generate_table_images
|
or self.pipeline_options.generate_table_images
|
||||||
)
|
)
|
||||||
|
|
||||||
self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
|
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
|
||||||
|
|
||||||
ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
|
ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
|
||||||
|
|
||||||
@ -197,7 +198,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
elements=all_elements, headers=all_headers, body=all_body
|
elements=all_elements, headers=all_headers, body=all_body
|
||||||
)
|
)
|
||||||
|
|
||||||
conv_res.document = self.glm_model(conv_res)
|
conv_res.document = self.reading_order_model(conv_res)
|
||||||
|
|
||||||
# Generate page images in the output
|
# Generate page images in the output
|
||||||
if self.pipeline_options.generate_page_images:
|
if self.pipeline_options.generate_page_images:
|
||||||
@ -244,6 +245,30 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
cropped_im, dpi=int(72 * scale)
|
cropped_im, dpi=int(72 * scale)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Aggregate confidence values for document:
|
||||||
|
if len(conv_res.pages) > 0:
|
||||||
|
conv_res.confidence.layout_score = float(
|
||||||
|
np.nanmean(
|
||||||
|
[c.layout_score for c in conv_res.confidence.pages.values()]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
conv_res.confidence.parse_score = float(
|
||||||
|
np.nanquantile(
|
||||||
|
[c.parse_score for c in conv_res.confidence.pages.values()],
|
||||||
|
q=0.1, # parse score should relate to worst 10% of pages.
|
||||||
|
)
|
||||||
|
)
|
||||||
|
conv_res.confidence.table_score = float(
|
||||||
|
np.nanmean(
|
||||||
|
[c.table_score for c in conv_res.confidence.pages.values()]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
conv_res.confidence.ocr_score = float(
|
||||||
|
np.nanmean(
|
||||||
|
[c.ocr_score for c in conv_res.confidence.pages.values()]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -7,7 +7,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|||||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
from docling.datamodel.base_models import ConversionStatus, InputFormat, QualityGrade
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
AcceleratorDevice,
|
||||||
@ -163,3 +163,11 @@ def test_parser_backends(test_doc_path):
|
|||||||
doc_result: ConversionResult = converter.convert(test_doc_path)
|
doc_result: ConversionResult = converter.convert(test_doc_path)
|
||||||
|
|
||||||
assert doc_result.status == ConversionStatus.SUCCESS
|
assert doc_result.status == ConversionStatus.SUCCESS
|
||||||
|
|
||||||
|
|
||||||
|
def test_confidence(test_doc_path):
    """Check that the converted test document gets excellent confidence grades.

    Converts ``test_doc_path`` with ``page_range=(6, 9)`` using a default
    ``DocumentConverter`` and expects both the mean and the low grade of the
    resulting confidence report to be EXCELLENT.
    """
    converter = DocumentConverter()
    doc_result: ConversionResult = converter.convert(test_doc_path, page_range=(6, 9))

    assert doc_result.confidence.mean_grade == QualityGrade.EXCELLENT
    assert doc_result.confidence.low_grade == QualityGrade.EXCELLENT
|
||||||
|
Loading…
Reference in New Issue
Block a user