feat: Establish confidence estimation for document and pages (#1313)
* Establish confidence field, propagate layout confidence through Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add OCR confidence and parse confidence (stub) Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add parse quality rules, use 5% percentile for overall and parse scores Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Heuristic updates Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix garbage regex Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Move grade to page Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Introduce mean_score and low_score, consistent aggregate computations Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add confidence test Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -3,11 +3,12 @@ import warnings
|
||||
from pathlib import Path
|
||||
from typing import Optional, cast
|
||||
|
||||
import numpy as np
|
||||
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import AssembledUnit, Page
|
||||
from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -60,7 +61,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
or self.pipeline_options.generate_table_images
|
||||
)
|
||||
|
||||
self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
|
||||
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
|
||||
|
||||
ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
|
||||
|
||||
@@ -197,7 +198,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
elements=all_elements, headers=all_headers, body=all_body
|
||||
)
|
||||
|
||||
conv_res.document = self.glm_model(conv_res)
|
||||
conv_res.document = self.reading_order_model(conv_res)
|
||||
|
||||
# Generate page images in the output
|
||||
if self.pipeline_options.generate_page_images:
|
||||
@@ -244,6 +245,30 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
cropped_im, dpi=int(72 * scale)
|
||||
)
|
||||
|
||||
# Aggregate confidence values for document:
|
||||
if len(conv_res.pages) > 0:
|
||||
conv_res.confidence.layout_score = float(
|
||||
np.nanmean(
|
||||
[c.layout_score for c in conv_res.confidence.pages.values()]
|
||||
)
|
||||
)
|
||||
conv_res.confidence.parse_score = float(
|
||||
np.nanquantile(
|
||||
[c.parse_score for c in conv_res.confidence.pages.values()],
|
||||
q=0.1, # parse score should relate to worst 10% of pages.
|
||||
)
|
||||
)
|
||||
conv_res.confidence.table_score = float(
|
||||
np.nanmean(
|
||||
[c.table_score for c in conv_res.confidence.pages.values()]
|
||||
)
|
||||
)
|
||||
conv_res.confidence.ocr_score = float(
|
||||
np.nanmean(
|
||||
[c.ocr_score for c in conv_res.confidence.pages.values()]
|
||||
)
|
||||
)
|
||||
|
||||
return conv_res
|
||||
|
||||
@classmethod
|
||||
|
||||
Reference in New Issue
Block a user