
Signed-off-by: Abhishek Kumar <abhishekrocketeer@gmail.com>

Testing: with `--document-timeout=10` the conversion is interrupted (the timeout is checked after each page batch, hence the overshoot past 10 s) and the document is reported as failed; with `--document-timeout=100` or with no timeout at all the document converts normally.

```
(.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --document-timeout=10 --verbose
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf
WARNING:docling.pipeline.base_pipeline:Document processing time (24.555 seconds) exceeded the specified timeout of 10.000 seconds
INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 36.29 sec.
WARNING:docling.cli.main:Document /var/folders/d7/dsfkllxs0xs8x2t4fcjknj4c0000gn/T/tmpl6p08u5i/2206.01062v1.pdf failed to convert.
INFO:docling.cli.main:Processed 1 docs, of which 1 failed
INFO:docling.cli.main:All documents were converted in 36.29 seconds.

(.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --document-timeout=100 --verbose
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf
INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 58.36 sec.
INFO:docling.cli.main:writing Markdown output to 2206.01062v1.md
INFO:docling.cli.main:Processed 1 docs, of which 0 failed
INFO:docling.cli.main:All documents were converted in 58.56 seconds.

(.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --verbose
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf
INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 59.82 sec.
INFO:docling.cli.main:writing Markdown output to 2206.01062v1.md
INFO:docling.cli.main:Processed 1 docs, of which 0 failed
INFO:docling.cli.main:All documents were converted in 59.88 seconds.
```

The new `--document-timeout` option also shows up in the CLI help:

```
(.venv) mario@Abhisheks-MacBook-Air docling % docling

 Usage: docling [OPTIONS] source

Arguments:
  input_sources  source   PDF files to convert. Can be local file / directory
                          paths or URL.  [default: None]  [required]

Options:
  --from [docx|pptx|html|image|pdf|asciidoc|md|xlsx]
                          Specify input formats to convert from. Defaults to
                          all formats.  [default: None]
  --to [md|json|html|text|doctags]
                          Specify output formats. Defaults to Markdown.
                          [default: None]
  --image-export-mode [placeholder|embedded|referenced]
                          Image export mode for the document (only in case of
                          JSON, Markdown or HTML). With `placeholder`, only
                          the position of the image is marked in the output.
                          In `embedded` mode, the image is embedded as base64
                          encoded string. In `referenced` mode, the image is
                          exported in PNG format and referenced from the main
                          exported document.  [default: embedded]
  --ocr / --no-ocr        If enabled, the bitmap content will be processed
                          using OCR.  [default: ocr]
  --force-ocr / --no-force-ocr
                          Replace any existing text with OCR generated text
                          over the full content.  [default: no-force-ocr]
  --ocr-engine [easyocr|tesseract_cli|tesseract|ocrmac|rapidocr]
                          The OCR engine to use.  [default: easyocr]
  --ocr-lang TEXT         Provide a comma-separated list of languages used by
                          the OCR engine. Note that each OCR engine has
                          different values for the language names.
                          [default: None]
  --pdf-backend [pypdfium2|dlparse_v1|dlparse_v2]
                          The PDF backend to use.  [default: dlparse_v2]
  --table-mode [fast|accurate]
                          The mode to use in the table structure model.
                          [default: fast]
  --artifacts-path PATH   If provided, the location of the model artifacts.
                          [default: None]
  --abort-on-error / --no-abort-on-error
                          If enabled, the bitmap content will be processed
                          using OCR.  [default: no-abort-on-error]
  --output PATH           Output directory where results are saved.
                          [default: .]
  --verbose, -v INTEGER   Set the verbosity level. -v for info logging, -vv
                          for debug logging.  [default: 0]
  --debug-visualize-cells / --no-debug-visualize-cells
                          Enable debug output which visualizes the PDF cells
                          [default: no-debug-visualize-cells]
  --debug-visualize-ocr / --no-debug-visualize-ocr
                          Enable debug output which visualizes the OCR cells
                          [default: no-debug-visualize-ocr]
  --debug-visualize-layout / --no-debug-visualize-layout
                          Enable debug output which visualizes the layout
                          clusters  [default: no-debug-visualize-layout]
  --debug-visualize-tables / --no-debug-visualize-tables
                          Enable debug output which visualizes the table
                          cells  [default: no-debug-visualize-tables]
  --version               Show version information.
  --document-timeout FLOAT
                          The timeout for processing each document, in
                          seconds.  [default: None]
  --help                  Show this message and exit.
```
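The same timeout can also be set programmatically. A minimal sketch, assuming the public `DocumentConverter` / `PdfPipelineOptions` API (this snippet is illustrative and not part of the diff):

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# document_timeout is the new PipelineOptions field that the CLI flag above maps to.
pipeline_options = PdfPipelineOptions(document_timeout=10.0)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)

result = converter.convert("https://arxiv.org/pdf/2206.01062", raises_on_error=False)

# When the timeout is hit, the pipeline stops after the current page batch and
# reports PARTIAL_SUCCESS, which the CLI surfaces as a failed conversion.
print(result.status)
```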
`docling/pipeline/base_pipeline.py` (204 lines, 7.7 KiB, Python):
```python
import functools
import logging
import time
import traceback
from abc import ABC, abstractmethod
from typing import Callable, Iterable, List

from docling_core.types.doc import DoclingDocument, NodeItem

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import (
    ConversionStatus,
    DoclingComponentType,
    ErrorItem,
    Page,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BaseEnrichmentModel
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import chunkify

_log = logging.getLogger(__name__)


class BasePipeline(ABC):
    def __init__(self, pipeline_options: PipelineOptions):
        self.pipeline_options = pipeline_options
        self.build_pipe: List[Callable] = []
        self.enrichment_pipe: List[BaseEnrichmentModel] = []

    def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
        conv_res = ConversionResult(input=in_doc)

        _log.info(f"Processing document {in_doc.file.name}")
        try:
            with TimeRecorder(
                conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
            ):
                # These steps are building and assembling the structure of the
                # output DoclingDocument
                conv_res = self._build_document(conv_res)
                conv_res = self._assemble_document(conv_res)
                # From this stage, all operations should rely only on conv_res.output
                conv_res = self._enrich_document(conv_res)
                conv_res.status = self._determine_status(conv_res)
        except Exception as e:
            conv_res.status = ConversionStatus.FAILURE
            if raises_on_error:
                raise e

        return conv_res

    @abstractmethod
    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        pass

    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
        return conv_res

    def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:

        def _filter_elements(
            doc: DoclingDocument, model: BaseEnrichmentModel
        ) -> Iterable[NodeItem]:
            for element, _level in doc.iterate_items():
                if model.is_processable(doc=doc, element=element):
                    yield element

        with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
            for model in self.enrichment_pipe:
                for element_batch in chunkify(
                    _filter_elements(conv_res.document, model),
                    settings.perf.elements_batch_size,
                ):
                    # TODO: currently we assume the element itself is modified, because
                    # we don't have an interface to save the element back to the document
                    for element in model(
                        doc=conv_res.document, element_batch=element_batch
                    ):  # Must exhaust!
                        pass

        return conv_res

    @abstractmethod
    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        pass

    @classmethod
    @abstractmethod
    def get_default_options(cls) -> PipelineOptions:
        pass

    @classmethod
    @abstractmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        pass

    # def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
    #     for model in self.build_pipe:
    #         element_batch = model(element_batch)
    #
    #     yield from element_batch


class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.

    def _apply_on_pages(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        for model in self.build_pipe:
            page_batch = model(conv_res, page_batch)

        yield from page_batch

    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:

        if not isinstance(conv_res.input._backend, PdfDocumentBackend):
            raise RuntimeError(
                f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
                f"Can not convert this with a PDF pipeline. "
                f"Please check your format configuration on DocumentConverter."
            )
            # conv_res.status = ConversionStatus.FAILURE
            # return conv_res

        total_elapsed_time = 0.0
        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):

            for i in range(0, conv_res.input.page_count):
                conv_res.pages.append(Page(page_no=i))

            try:
                # Iterate batches of pages (page_batch_size) in the doc
                for page_batch in chunkify(
                    conv_res.pages, settings.perf.page_batch_size
                ):
                    start_batch_time = time.monotonic()

                    # 1. Initialise the page resources
                    init_pages = map(
                        functools.partial(self.initialize_page, conv_res), page_batch
                    )

                    # 2. Run pipeline stages
                    pipeline_pages = self._apply_on_pages(conv_res, init_pages)

                    for p in pipeline_pages:  # Must exhaust!
                        pass

                    end_batch_time = time.monotonic()
                    total_elapsed_time += end_batch_time - start_batch_time
                    if (
                        self.pipeline_options.document_timeout is not None
                        and total_elapsed_time > self.pipeline_options.document_timeout
                    ):
                        _log.warning(
                            f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds"
                        )
                        conv_res.status = ConversionStatus.PARTIAL_SUCCESS
                        break

                    _log.debug(
                        f"Finished converting page batch time={end_batch_time:.3f}"
                    )

            except Exception as e:
                conv_res.status = ConversionStatus.FAILURE
                trace = "\n".join(traceback.format_exception(e))
                _log.warning(
                    f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
                    f"{trace}"
                )
                raise e

            finally:
                # Always unload the PDF backend, even in case of failure
                if conv_res.input._backend:
                    conv_res.input._backend.unload()

        return conv_res

    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        status = ConversionStatus.SUCCESS
        for page in conv_res.pages:
            if page._backend is None or not page._backend.is_valid():
                conv_res.errors.append(
                    ErrorItem(
                        component_type=DoclingComponentType.DOCUMENT_BACKEND,
                        module_name=type(page._backend).__name__,
                        error_message=f"Page {page.page_no} failed to parse.",
                    )
                )
                status = ConversionStatus.PARTIAL_SUCCESS

        return status

    # Initialise and load resources for a page
    @abstractmethod
    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
        pass
```
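For context, the timeout check lives in `PaginatedPipeline._build_document`, so any concrete pipeline that subclasses it inherits the behaviour. Below is a minimal, hypothetical sketch of such a subclass (the class name, the pass-through build stage, and the exact `initialize_page` body are illustrative assumptions, not code from this PR):

```python
# Hypothetical example, not part of the repository: a do-nothing pipeline that
# satisfies the abstract hooks of PaginatedPipeline and therefore inherits the
# document_timeout handling shown above.
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PipelineOptions
from docling.pipeline.base_pipeline import PaginatedPipeline


class NoOpPdfPipeline(PaginatedPipeline):
    def __init__(self, pipeline_options: PipelineOptions):
        super().__init__(pipeline_options)
        # build_pipe stages receive (conv_res, page_batch) and yield pages;
        # a single pass-through stage stands in for real page models here.
        self.build_pipe = [lambda conv_res, pages: pages]

    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
        # Attach a page backend so _determine_status sees the page as valid
        # (assumes the PDF backend exposes load_page, as in docling's backends).
        page._backend = conv_res.input._backend.load_page(page.page_no)
        return page

    @classmethod
    def get_default_options(cls) -> PipelineOptions:
        return PipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        # _build_document raises for non-PDF backends, so restrict accordingly.
        return isinstance(backend, PdfDocumentBackend)
```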