feat: Add pipeline timings and toggle visualization, establish debug settings (#183)
* Add settings to turn visualization on or off Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add profiling code to all models Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Refactor and fix profiling codes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Visualization codes output PNG to debug dir Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes for time logging Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Optimize imports Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update lockfile Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add start_timestamps to ProfilingItem Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -19,6 +19,7 @@ from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BaseEnrichmentModel
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
from docling.utils.utils import chunkify
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@@ -35,13 +36,16 @@ class BasePipeline(ABC):
|
||||
|
||||
_log.info(f"Processing document {in_doc.file.name}")
|
||||
try:
|
||||
# These steps are building and assembling the structure of the
|
||||
# output DoclingDocument
|
||||
conv_res = self._build_document(in_doc, conv_res)
|
||||
conv_res = self._assemble_document(in_doc, conv_res)
|
||||
# From this stage, all operations should rely only on conv_res.output
|
||||
conv_res = self._enrich_document(in_doc, conv_res)
|
||||
conv_res.status = self._determine_status(in_doc, conv_res)
|
||||
with TimeRecorder(
|
||||
conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
|
||||
):
|
||||
# These steps are building and assembling the structure of the
|
||||
# output DoclingDocument
|
||||
conv_res = self._build_document(conv_res)
|
||||
conv_res = self._assemble_document(conv_res)
|
||||
# From this stage, all operations should rely only on conv_res.output
|
||||
conv_res = self._enrich_document(conv_res)
|
||||
conv_res.status = self._determine_status(conv_res)
|
||||
except Exception as e:
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
if raises_on_error:
|
||||
@@ -50,19 +54,13 @@ class BasePipeline(ABC):
|
||||
return conv_res
|
||||
|
||||
@abstractmethod
|
||||
def _build_document(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionResult:
|
||||
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||
pass
|
||||
|
||||
def _assemble_document(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionResult:
|
||||
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||
return conv_res
|
||||
|
||||
def _enrich_document(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionResult:
|
||||
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||
|
||||
def _filter_elements(
|
||||
doc: DoclingDocument, model: BaseEnrichmentModel
|
||||
@@ -71,24 +69,23 @@ class BasePipeline(ABC):
|
||||
if model.is_processable(doc=doc, element=element):
|
||||
yield element
|
||||
|
||||
for model in self.enrichment_pipe:
|
||||
for element_batch in chunkify(
|
||||
_filter_elements(conv_res.document, model),
|
||||
settings.perf.elements_batch_size,
|
||||
):
|
||||
# TODO: currently we assume the element itself is modified, because
|
||||
# we don't have an interface to save the element back to the document
|
||||
for element in model(
|
||||
doc=conv_res.document, element_batch=element_batch
|
||||
): # Must exhaust!
|
||||
pass
|
||||
with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
|
||||
for model in self.enrichment_pipe:
|
||||
for element_batch in chunkify(
|
||||
_filter_elements(conv_res.document, model),
|
||||
settings.perf.elements_batch_size,
|
||||
):
|
||||
# TODO: currently we assume the element itself is modified, because
|
||||
# we don't have an interface to save the element back to the document
|
||||
for element in model(
|
||||
doc=conv_res.document, element_batch=element_batch
|
||||
): # Must exhaust!
|
||||
pass
|
||||
|
||||
return conv_res
|
||||
|
||||
@abstractmethod
|
||||
def _determine_status(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionStatus:
|
||||
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
@@ -110,66 +107,68 @@ class BasePipeline(ABC):
|
||||
|
||||
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
|
||||
def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
def _apply_on_pages(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
for model in self.build_pipe:
|
||||
page_batch = model(page_batch)
|
||||
page_batch = model(conv_res, page_batch)
|
||||
|
||||
yield from page_batch
|
||||
|
||||
def _build_document(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionResult:
|
||||
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||
|
||||
if not isinstance(in_doc._backend, PdfDocumentBackend):
|
||||
if not isinstance(conv_res.input._backend, PdfDocumentBackend):
|
||||
raise RuntimeError(
|
||||
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
|
||||
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
|
||||
f"Can not convert this with a PDF pipeline. "
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
# conv_res.status = ConversionStatus.FAILURE
|
||||
# return conv_res
|
||||
|
||||
for i in range(0, in_doc.page_count):
|
||||
conv_res.pages.append(Page(page_no=i))
|
||||
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
||||
|
||||
try:
|
||||
# Iterate batches of pages (page_batch_size) in the doc
|
||||
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
|
||||
start_pb_time = time.time()
|
||||
for i in range(0, conv_res.input.page_count):
|
||||
conv_res.pages.append(Page(page_no=i))
|
||||
|
||||
# 1. Initialise the page resources
|
||||
init_pages = map(
|
||||
functools.partial(self.initialize_page, in_doc), page_batch
|
||||
try:
|
||||
# Iterate batches of pages (page_batch_size) in the doc
|
||||
for page_batch in chunkify(
|
||||
conv_res.pages, settings.perf.page_batch_size
|
||||
):
|
||||
start_pb_time = time.time()
|
||||
|
||||
# 1. Initialise the page resources
|
||||
init_pages = map(
|
||||
functools.partial(self.initialize_page, conv_res), page_batch
|
||||
)
|
||||
|
||||
# 2. Run pipeline stages
|
||||
pipeline_pages = self._apply_on_pages(conv_res, init_pages)
|
||||
|
||||
for p in pipeline_pages: # Must exhaust!
|
||||
pass
|
||||
|
||||
end_pb_time = time.time() - start_pb_time
|
||||
_log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
|
||||
|
||||
except Exception as e:
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
trace = "\n".join(traceback.format_exception(e))
|
||||
_log.warning(
|
||||
f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
|
||||
f"{trace}"
|
||||
)
|
||||
raise e
|
||||
|
||||
# 2. Run pipeline stages
|
||||
pipeline_pages = self._apply_on_pages(init_pages)
|
||||
|
||||
for p in pipeline_pages: # Must exhaust!
|
||||
pass
|
||||
|
||||
end_pb_time = time.time() - start_pb_time
|
||||
_log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
|
||||
|
||||
except Exception as e:
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
trace = "\n".join(traceback.format_exception(e))
|
||||
_log.warning(
|
||||
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
|
||||
f"{trace}"
|
||||
)
|
||||
raise e
|
||||
|
||||
finally:
|
||||
# Always unload the PDF backend, even in case of failure
|
||||
if in_doc._backend:
|
||||
in_doc._backend.unload()
|
||||
finally:
|
||||
# Always unload the PDF backend, even in case of failure
|
||||
if conv_res.input._backend:
|
||||
conv_res.input._backend.unload()
|
||||
|
||||
return conv_res
|
||||
|
||||
def _determine_status(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionStatus:
|
||||
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
||||
status = ConversionStatus.SUCCESS
|
||||
for page in conv_res.pages:
|
||||
if page._backend is None or not page._backend.is_valid():
|
||||
@@ -186,5 +185,5 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
|
||||
# Initialise and load resources for a page
|
||||
@abstractmethod
|
||||
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
|
||||
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
||||
pass
|
||||
|
||||
Reference in New Issue
Block a user