293 lines
12 KiB
Python
293 lines
12 KiB
Python
import logging
|
|
import warnings
|
|
from pathlib import Path
|
|
from typing import Optional, cast
|
|
|
|
import numpy as np
|
|
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
|
|
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
|
from docling.datamodel.base_models import AssembledUnit, Page
|
|
from docling.datamodel.document import ConversionResult
|
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
|
from docling.datamodel.settings import settings
|
|
from docling.models.base_ocr_model import BaseOcrModel
|
|
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
|
|
from docling.models.document_picture_classifier import (
|
|
DocumentPictureClassifier,
|
|
DocumentPictureClassifierOptions,
|
|
)
|
|
from docling.models.factories import get_ocr_factory, get_picture_description_factory
|
|
from docling.models.layout_model import LayoutModel
|
|
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
|
from docling.models.page_preprocessing_model import (
|
|
PagePreprocessingModel,
|
|
PagePreprocessingOptions,
|
|
)
|
|
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
|
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
|
|
from docling.models.table_structure_model import TableStructureModel
|
|
from docling.pipeline.base_pipeline import PaginatedPipeline
|
|
from docling.utils.model_downloader import download_models
|
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
|
|
|
# Module-level logger, named after this module via ``__name__``.
_log = logging.getLogger(__name__)
|
|
|
|
|
|
class StandardPdfPipeline(PaginatedPipeline):
    """Paginated PDF pipeline assembled from page-level and enrichment models.

    The constructor builds two ordered model lists:

    * ``build_pipe``: page pre-processing, OCR, layout, table structure and
      page assembly.
    * ``enrichment_pipe``: code/formula enrichment, picture classification
      and picture description.
    """

    _layout_model_path = LayoutModel._model_path
    _table_model_path = TableStructureModel._model_path

    def __init__(self, pipeline_options: PdfPipelineOptions):
        """Resolve the model artifacts folder and instantiate all models.

        Args:
            pipeline_options: Options selecting which models are enabled and
                how they are configured.

        Raises:
            RuntimeError: If an artifacts path is configured (in the options
                or in the global settings) but is not an existing directory,
                or if the picture description factory returns ``None`` for
                the configured picture description options.
        """
        super().__init__(pipeline_options)
        # Bare annotation only: declares the narrower options type for type
        # checkers without assigning anything.
        self.pipeline_options: PdfPipelineOptions

        # The per-pipeline option takes precedence over the global setting.
        artifacts_path: Optional[Path] = None
        if pipeline_options.artifacts_path is not None:
            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
        elif settings.artifacts_path is not None:
            artifacts_path = Path(settings.artifacts_path).expanduser()

        if artifacts_path is not None and not artifacts_path.is_dir():
            raise RuntimeError(
                f"The value of {artifacts_path=} is not valid. "
                "When defined, it must point to a folder containing all models required by the pipeline."
            )

        with warnings.catch_warnings():  # deprecated generate_table_images
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            self.keep_images = (
                self.pipeline_options.generate_page_images
                or self.pipeline_options.generate_picture_images
                or self.pipeline_options.generate_table_images
            )

        self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())

        ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)

        self.build_pipe = [
            # Pre-processing
            PagePreprocessingModel(
                options=PagePreprocessingOptions(
                    images_scale=pipeline_options.images_scale,
                )
            ),
            # OCR
            ocr_model,
            # Layout model
            LayoutModel(
                artifacts_path=artifacts_path,
                accelerator_options=pipeline_options.accelerator_options,
                options=pipeline_options.layout_options,
            ),
            # Table structure model
            TableStructureModel(
                enabled=pipeline_options.do_table_structure,
                artifacts_path=artifacts_path,
                options=pipeline_options.table_structure_options,
                accelerator_options=pipeline_options.accelerator_options,
            ),
            # Page assemble
            PageAssembleModel(options=PageAssembleOptions()),
        ]

        # Picture description model
        if (
            picture_description_model := self.get_picture_description_model(
                artifacts_path=artifacts_path
            )
        ) is None:
            raise RuntimeError(
                f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
            )

        self.enrichment_pipe = [
            # Code Formula Enrichment Model
            CodeFormulaModel(
                enabled=pipeline_options.do_code_enrichment
                or pipeline_options.do_formula_enrichment,
                artifacts_path=artifacts_path,
                options=CodeFormulaModelOptions(
                    do_code_enrichment=pipeline_options.do_code_enrichment,
                    do_formula_enrichment=pipeline_options.do_formula_enrichment,
                ),
                accelerator_options=pipeline_options.accelerator_options,
            ),
            # Document Picture Classifier
            DocumentPictureClassifier(
                enabled=pipeline_options.do_picture_classification,
                artifacts_path=artifacts_path,
                options=DocumentPictureClassifierOptions(),
                accelerator_options=pipeline_options.accelerator_options,
            ),
            # Document Picture description
            picture_description_model,
        ]

        # NOTE(review): presumably the base pipeline reads ``keep_backend`` to
        # keep page backends alive for the enrichment models -- confirm.
        if (
            self.pipeline_options.do_formula_enrichment
            or self.pipeline_options.do_code_enrichment
            or self.pipeline_options.do_picture_description
        ):
            self.keep_backend = True

    @staticmethod
    def download_models_hf(
        local_dir: Optional[Path] = None, force: bool = False
    ) -> Path:
        """Download the pipeline models (deprecated).

        Emits a ``DeprecationWarning`` and delegates to ``download_models``
        with progress reporting disabled.

        Args:
            local_dir: Passed to ``download_models`` as ``output_dir``.
            force: Passed through to ``download_models``.

        Returns:
            The path returned by ``download_models``.
        """
        # NOTE(review): stacklevel=3 attributes the warning two frames above
        # this function; for a direct call, stacklevel=2 would point at the
        # caller -- confirm which is intended.
        warnings.warn(
            "The usage of StandardPdfPipeline.download_models_hf() is deprecated "
            "use instead the utility `docling-tools models download`, or "
            "the upstream method docling.utils.model_downloader.download_models()",
            DeprecationWarning,
            stacklevel=3,
        )

        output_dir = download_models(output_dir=local_dir, force=force, progress=False)
        return output_dir

    def get_ocr_model(self, artifacts_path: Optional[Path] = None) -> BaseOcrModel:
        """Create the OCR model through the OCR factory.

        Args:
            artifacts_path: Optional folder passed to the factory as the
                model artifacts location.

        Returns:
            The instance created by the factory from the pipeline's OCR
            options, ``do_ocr`` flag and accelerator options.
        """
        factory = get_ocr_factory(
            allow_external_plugins=self.pipeline_options.allow_external_plugins
        )
        return factory.create_instance(
            options=self.pipeline_options.ocr_options,
            enabled=self.pipeline_options.do_ocr,
            artifacts_path=artifacts_path,
            accelerator_options=self.pipeline_options.accelerator_options,
        )

    def get_picture_description_model(
        self, artifacts_path: Optional[Path] = None
    ) -> Optional[PictureDescriptionBaseModel]:
        """Create the picture description model through its factory.

        Args:
            artifacts_path: Optional folder passed to the factory as the
                model artifacts location.

        Returns:
            The instance created by the factory; the signature allows
            ``None``, which ``__init__`` treats as an unsupported kind.
        """
        factory = get_picture_description_factory(
            allow_external_plugins=self.pipeline_options.allow_external_plugins
        )
        return factory.create_instance(
            options=self.pipeline_options.picture_description_options,
            enabled=self.pipeline_options.do_picture_description,
            enable_remote_services=self.pipeline_options.enable_remote_services,
            artifacts_path=artifacts_path,
            accelerator_options=self.pipeline_options.accelerator_options,
        )

    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
        """Load the backend for ``page`` and record its size when valid.

        Args:
            conv_res: Conversion result whose input backend provides pages.
            page: Page to initialize; it is mutated in place.

        Returns:
            The same ``page`` object.
        """
        with TimeRecorder(conv_res, "page_init"):
            page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
            if page._backend is not None and page._backend.is_valid():
                page.size = page._backend.get_size()

        return page

    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
        """Merge per-page results into the document-level conversion result.

        Steps, all timed under the ``"doc_assemble"`` recorder:

        1. Concatenate the assembled body, headers and elements of every
           page into ``conv_res.assembled``.
        2. Build ``conv_res.document`` with the reading order model.
        3. Optionally attach page images and cropped picture/table images.
        4. Aggregate the per-page confidence scores.

        Args:
            conv_res: Conversion result to complete; it is mutated in place.

        Returns:
            The same ``conv_res`` object.
        """
        all_elements = []
        all_headers = []
        all_body = []

        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
            for p in conv_res.pages:
                if p.assembled is not None:
                    all_body.extend(p.assembled.body)
                    all_headers.extend(p.assembled.headers)
                    all_elements.extend(p.assembled.elements)

            conv_res.assembled = AssembledUnit(
                elements=all_elements, headers=all_headers, body=all_body
            )

            conv_res.document = self.reading_order_model(conv_res)

            # Generate page images in the output
            if self.pipeline_options.generate_page_images:
                page_dpi = int(72 * self.pipeline_options.images_scale)
                for page in conv_res.pages:
                    assert page.image is not None, (
                        f"page {page.page_no} has no image but "
                        "generate_page_images is enabled"
                    )
                    # Document pages are keyed by ``Page.page_no + 1``.
                    page_no = page.page_no + 1
                    conv_res.document.pages[page_no].image = ImageRef.from_pil(
                        page.image, dpi=page_dpi
                    )

            # Generate images of the requested element types
            with warnings.catch_warnings():  # deprecated generate_table_images
                warnings.filterwarnings("ignore", category=DeprecationWarning)
                if (
                    self.pipeline_options.generate_picture_images
                    or self.pipeline_options.generate_table_images
                ):
                    scale = self.pipeline_options.images_scale
                    element_dpi = int(72 * scale)

                    # Index pages once instead of scanning the page list for
                    # every element; ``setdefault`` keeps the first page seen
                    # for a given ``page_no``, like the former linear search.
                    pages_by_no = {}
                    for p in conv_res.pages:
                        pages_by_no.setdefault(p.page_no, p)

                    for element, _level in conv_res.document.iterate_items():
                        if not isinstance(element, DocItem) or len(element.prov) == 0:
                            continue
                        if (
                            isinstance(element, PictureItem)
                            and self.pipeline_options.generate_picture_images
                        ) or (
                            isinstance(element, TableItem)
                            and self.pipeline_options.generate_table_images
                        ):
                            # Provenance page numbers are one higher than
                            # ``Page.page_no``.
                            page_ix = element.prov[0].page_no - 1
                            src_page = pages_by_no.get(page_ix)
                            assert src_page is not None, (
                                f"no page with page_no={page_ix} for element image"
                            )
                            assert src_page.size is not None, (
                                f"page {page_ix} has no size"
                            )
                            assert src_page.image is not None, (
                                f"page {page_ix} has no image"
                            )

                            # Scale the bounding box to image resolution and
                            # convert it to a top-left origin before cropping.
                            crop_bbox = (
                                element.prov[0]
                                .bbox.scaled(scale=scale)
                                .to_top_left_origin(
                                    page_height=src_page.size.height * scale
                                )
                            )

                            cropped_im = src_page.image.crop(crop_bbox.as_tuple())
                            element.image = ImageRef.from_pil(
                                cropped_im, dpi=element_dpi
                            )

            # Aggregate confidence values for document:
            if len(conv_res.pages) > 0:
                with warnings.catch_warnings():
                    # Empty or all-NaN score lists should yield NaN silently.
                    warnings.filterwarnings(
                        "ignore",
                        category=RuntimeWarning,
                        message="Mean of empty slice|All-NaN slice encountered",
                    )
                    page_scores = list(conv_res.confidence.pages.values())
                    conv_res.confidence.layout_score = float(
                        np.nanmean([c.layout_score for c in page_scores])
                    )
                    conv_res.confidence.parse_score = float(
                        np.nanquantile(
                            [c.parse_score for c in page_scores],
                            q=0.1,  # parse score should relate to worst 10% of pages.
                        )
                    )
                    conv_res.confidence.table_score = float(
                        np.nanmean([c.table_score for c in page_scores])
                    )
                    conv_res.confidence.ocr_score = float(
                        np.nanmean([c.ocr_score for c in page_scores])
                    )

        return conv_res

    @classmethod
    def get_default_options(cls) -> PdfPipelineOptions:
        """Return a default-constructed ``PdfPipelineOptions``."""
        return PdfPipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
        """Return whether ``backend`` is a ``PdfDocumentBackend`` instance."""
        return isinstance(backend, PdfDocumentBackend)
|