Docling/docling/pipeline/base_pipeline.py
Abhishek Kumar 3da166eafa
feat: Add timeout limit to document parsing job. DS4SD#270 (#552)
Signed-off-by: Abhishek Kumar <abhishekrocketeer@gmail.com>

Testing:
(.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --document-timeout=10 --verbose
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf
WARNING:docling.pipeline.base_pipeline:Document processing time (24.555 seconds) exceeded the specified timeout of 10.000 seconds
INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 36.29 sec.
WARNING:docling.cli.main:Document /var/folders/d7/dsfkllxs0xs8x2t4fcjknj4c0000gn/T/tmpl6p08u5i/2206.01062v1.pdf failed to convert.
INFO:docling.cli.main:Processed 1 docs, of which 1 failed
INFO:docling.cli.main:All documents were converted in 36.29 seconds.

(.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --document-timeout=100 --verbose
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf
INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 58.36 sec.
INFO:docling.cli.main:writing Markdown output to 2206.01062v1.md
INFO:docling.cli.main:Processed 1 docs, of which 0 failed
INFO:docling.cli.main:All documents were converted in 58.56 seconds.

(.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --verbose
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf
INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 59.82 sec.
INFO:docling.cli.main:writing Markdown output to 2206.01062v1.md
INFO:docling.cli.main:Processed 1 docs, of which 0 failed
INFO:docling.cli.main:All documents were converted in 59.88 seconds.

(.venv) mario@Abhisheks-MacBook-Air docling % docling

 Usage: docling [OPTIONS] source

╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ *    input_sources      source  PDF files to convert. Can be local file / directory paths or URL. [default: None] [required]        │
╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ --from                                                       [docx|pptx|html|image|pdf|asciido  Specify input formats to convert    │
│                                                              c|md|xlsx]                         from. Defaults to all formats.      │
│                                                                                                 [default: None]                     │
│ --to                                                         [md|json|html|text|doctags]        Specify output formats. Defaults to │
│                                                                                                 Markdown.                           │
│                                                                                                 [default: None]                     │
│ --image-export-mode                                          [placeholder|embedded|referenced]  Image export mode for the document  │
│                                                                                                 (only in case of JSON, Markdown or  │
│                                                                                                 HTML). With `placeholder`, only the │
│                                                                                                 position of the image is marked in  │
│                                                                                                 the output. In `embedded` mode, the │
│                                                                                                 image is embedded as base64 encoded │
│                                                                                                 string. In `referenced` mode, the   │
│                                                                                                 image is exported in PNG format and │
│                                                                                                 referenced from the main exported   │
│                                                                                                 document.                           │
│                                                                                                 [default: embedded]                 │
│ --ocr                         --no-ocr                                                          If enabled, the bitmap content will │
│                                                                                                 be processed using OCR.             │
│                                                                                                 [default: ocr]                      │
│ --force-ocr                   --no-force-ocr                                                    Replace any existing text with OCR  │
│                                                                                                 generated text over the full        │
│                                                                                                 content.                            │
│                                                                                                 [default: no-force-ocr]             │
│ --ocr-engine                                                 [easyocr|tesseract_cli|tesseract|  The OCR engine to use.              │
│                                                              ocrmac|rapidocr]                   [default: easyocr]                  │
│ --ocr-lang                                                   TEXT                               Provide a comma-separated list of   │
│                                                                                                 languages used by the OCR engine.   │
│                                                                                                 Note that each OCR engine has       │
│                                                                                                 different values for the language   │
│                                                                                                 names.                              │
│                                                                                                 [default: None]                     │
│ --pdf-backend                                                [pypdfium2|dlparse_v1|dlparse_v2]  The PDF backend to use.             │
│                                                                                                 [default: dlparse_v2]               │
│ --table-mode                                                 [fast|accurate]                    The mode to use in the table        │
│                                                                                                 structure model.                    │
│                                                                                                 [default: fast]                     │
│ --artifacts-path                                             PATH                               If provided, the location of the    │
│                                                                                                 model artifacts.                    │
│                                                                                                 [default: None]                     │
│ --abort-on-error              --no-abort-on-error                                               If enabled, the bitmap content will │
│                                                                                                 be processed using OCR.             │
│                                                                                                 [default: no-abort-on-error]        │
│ --output                                                     PATH                               Output directory where results are  │
│                                                                                                 saved.                              │
│                                                                                                 [default: .]                        │
│ --verbose                 -v                                 INTEGER                            Set the verbosity level. -v for     │
│                                                                                                 info logging, -vv for debug         │
│                                                                                                 logging.                            │
│                                                                                                 [default: 0]                        │
│ --debug-visualize-cells       --no-debug-visualize-cells                                        Enable debug output which           │
│                                                                                                 visualizes the PDF cells            │
│                                                                                                 [default: no-debug-visualize-cells] │
│ --debug-visualize-ocr         --no-debug-visualize-ocr                                          Enable debug output which           │
│                                                                                                 visualizes the OCR cells            │
│                                                                                                 [default: no-debug-visualize-ocr]   │
│ --debug-visualize-layout      --no-debug-visualize-layout                                       Enable debug output which           │
│                                                                                                 visualizes the layour clusters      │
│                                                                                                 [default:                           │
│                                                                                                 no-debug-visualize-layout]          │
│ --debug-visualize-tables      --no-debug-visualize-tables                                       Enable debug output which           │
│                                                                                                 visualizes the table cells          │
│                                                                                                 [default:                           │
│                                                                                                 no-debug-visualize-tables]          │
│ --version                                                                                       Show version information.           │
│ --document-timeout                                           FLOAT                              The timeout for processing each     │
│                                                                                                 document, in seconds.               │
│                                                                                                 [default: None]                     │
│ --help                                                                                          Show this message and exit.         │
╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
2024-12-11 15:06:10 +01:00

204 lines
7.7 KiB
Python

import functools
import logging
import time
import traceback
from abc import ABC, abstractmethod
from typing import Callable, Iterable, List
from docling_core.types.doc import DoclingDocument, NodeItem
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
ErrorItem,
Page,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BaseEnrichmentModel
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import chunkify
_log = logging.getLogger(__name__)
class BasePipeline(ABC):
def __init__(self, pipeline_options: PipelineOptions):
self.pipeline_options = pipeline_options
self.build_pipe: List[Callable] = []
self.enrichment_pipe: List[BaseEnrichmentModel] = []
def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
conv_res = ConversionResult(input=in_doc)
_log.info(f"Processing document {in_doc.file.name}")
try:
with TimeRecorder(
conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
):
# These steps are building and assembling the structure of the
# output DoclingDocument
conv_res = self._build_document(conv_res)
conv_res = self._assemble_document(conv_res)
# From this stage, all operations should rely only on conv_res.output
conv_res = self._enrich_document(conv_res)
conv_res.status = self._determine_status(conv_res)
except Exception as e:
conv_res.status = ConversionStatus.FAILURE
if raises_on_error:
raise e
return conv_res
@abstractmethod
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
pass
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
return conv_res
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
def _filter_elements(
doc: DoclingDocument, model: BaseEnrichmentModel
) -> Iterable[NodeItem]:
for element, _level in doc.iterate_items():
if model.is_processable(doc=doc, element=element):
yield element
with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
for model in self.enrichment_pipe:
for element_batch in chunkify(
_filter_elements(conv_res.document, model),
settings.perf.elements_batch_size,
):
# TODO: currently we assume the element itself is modified, because
# we don't have an interface to save the element back to the document
for element in model(
doc=conv_res.document, element_batch=element_batch
): # Must exhaust!
pass
return conv_res
@abstractmethod
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
pass
@classmethod
@abstractmethod
def get_default_options(cls) -> PipelineOptions:
pass
@classmethod
@abstractmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend):
pass
# def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
# for model in self.build_pipe:
# element_batch = model(element_batch)
#
# yield from element_batch
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
def _apply_on_pages(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
for model in self.build_pipe:
page_batch = model(conv_res, page_batch)
yield from page_batch
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
if not isinstance(conv_res.input._backend, PdfDocumentBackend):
raise RuntimeError(
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
f"Can not convert this with a PDF pipeline. "
f"Please check your format configuration on DocumentConverter."
)
# conv_res.status = ConversionStatus.FAILURE
# return conv_res
total_elapsed_time = 0.0
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
for i in range(0, conv_res.input.page_count):
conv_res.pages.append(Page(page_no=i))
try:
# Iterate batches of pages (page_batch_size) in the doc
for page_batch in chunkify(
conv_res.pages, settings.perf.page_batch_size
):
start_batch_time = time.monotonic()
# 1. Initialise the page resources
init_pages = map(
functools.partial(self.initialize_page, conv_res), page_batch
)
# 2. Run pipeline stages
pipeline_pages = self._apply_on_pages(conv_res, init_pages)
for p in pipeline_pages: # Must exhaust!
pass
end_batch_time = time.monotonic()
total_elapsed_time += end_batch_time - start_batch_time
if (
self.pipeline_options.document_timeout is not None
and total_elapsed_time > self.pipeline_options.document_timeout
):
_log.warning(
f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds"
)
conv_res.status = ConversionStatus.PARTIAL_SUCCESS
break
_log.debug(
f"Finished converting page batch time={end_batch_time:.3f}"
)
except Exception as e:
conv_res.status = ConversionStatus.FAILURE
trace = "\n".join(traceback.format_exception(e))
_log.warning(
f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
f"{trace}"
)
raise e
finally:
# Always unload the PDF backend, even in case of failure
if conv_res.input._backend:
conv_res.input._backend.unload()
return conv_res
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
status = ConversionStatus.SUCCESS
for page in conv_res.pages:
if page._backend is None or not page._backend.is_valid():
conv_res.errors.append(
ErrorItem(
component_type=DoclingComponentType.DOCUMENT_BACKEND,
module_name=type(page._backend).__name__,
error_message=f"Page {page.page_no} failed to parse.",
)
)
status = ConversionStatus.PARTIAL_SUCCESS
return status
# Initialise and load resources for a page
@abstractmethod
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
pass