feat: Add pipeline timings and toggle visualization, establish debug settings (#183)

* Add settings to turn visualization on or off

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add profiling code to all models

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Refactor and fix profiling code

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Visualization code outputs PNGs to debug dir

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes for time logging

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Optimize imports

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update lockfile

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add start_timestamps to ProfilingItem

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Author: Christoph Auer
Date: 2024-10-30 15:04:19 +01:00 (committed by GitHub)
Parent: 94a5290789
Commit: 2a2c65bf4f
23 changed files with 998 additions and 771 deletions
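
For orientation, turning the new visualization toggles on could look like the sketch below. The flag and field names are assumed from this PR's debug settings and may differ slightly; verify against docling.datamodel.settings.

    from docling.datamodel.settings import settings
    from docling.document_converter import DocumentConverter

    # Assumed debug fields introduced by this PR; names may differ slightly.
    settings.debug.visualize_layout = True        # write layout overlays as PNGs
    settings.debug.visualize_tables = True        # write table-structure overlays
    settings.debug.debug_output_path = "./debug"  # directory for the PNG output

    conv_res = DocumentConverter().convert("sample.pdf")
    print(conv_res.status)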

File: docling/pipeline/base_pipeline.py

@@ -19,6 +19,7 @@ from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BaseEnrichmentModel
+from docling.utils.profiling import ProfilingScope, TimeRecorder
 from docling.utils.utils import chunkify

 _log = logging.getLogger(__name__)
@@ -35,13 +36,16 @@ class BasePipeline(ABC):
         _log.info(f"Processing document {in_doc.file.name}")
         try:
-            # These steps are building and assembling the structure of the
-            # output DoclingDocument
-            conv_res = self._build_document(in_doc, conv_res)
-            conv_res = self._assemble_document(in_doc, conv_res)
-            # From this stage, all operations should rely only on conv_res.output
-            conv_res = self._enrich_document(in_doc, conv_res)
-            conv_res.status = self._determine_status(in_doc, conv_res)
+            with TimeRecorder(
+                conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
+            ):
+                # These steps are building and assembling the structure of the
+                # output DoclingDocument
+                conv_res = self._build_document(conv_res)
+                conv_res = self._assemble_document(conv_res)
+                # From this stage, all operations should rely only on conv_res.output
+                conv_res = self._enrich_document(conv_res)
+                conv_res.status = self._determine_status(conv_res)
         except Exception as e:
             conv_res.status = ConversionStatus.FAILURE
             if raises_on_error:
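
TimeRecorder here wraps the whole build/assemble/enrich sequence and books the elapsed time under the "pipeline_total" key. A minimal sketch of the assumed semantics, not the actual implementation in docling.utils.profiling:

    import time
    from enum import Enum

    class ProfilingScope(str, Enum):
        PAGE = "page"
        DOCUMENT = "document"

    class TimeRecorder:
        # Sketch only: on exit, append elapsed seconds and the start
        # timestamp to conv_res.timings[key], as the diff's usage implies.
        def __init__(self, conv_res, key, scope=ProfilingScope.PAGE):
            self.conv_res, self.key, self.scope = conv_res, key, scope

        def __enter__(self):
            self.start = time.monotonic()
            return self

        def __exit__(self, *exc):
            item = self.conv_res.timings.setdefault(
                self.key, {"scope": self.scope, "times": [], "start_timestamps": []}
            )
            item["times"].append(time.monotonic() - self.start)
            item["start_timestamps"].append(self.start)

The real ProfilingItem is richer (the last commit in this PR adds start_timestamps to it), but the context-manager shape is the relevant part.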
@@ -50,19 +54,13 @@ class BasePipeline(ABC):
         return conv_res

     @abstractmethod
-    def _build_document(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionResult:
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
         pass

-    def _assemble_document(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionResult:
+    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
         return conv_res

-    def _enrich_document(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionResult:
+    def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:

         def _filter_elements(
             doc: DoclingDocument, model: BaseEnrichmentModel
@@ -71,24 +69,23 @@ class BasePipeline(ABC):
                 if model.is_processable(doc=doc, element=element):
                     yield element

-        for model in self.enrichment_pipe:
-            for element_batch in chunkify(
-                _filter_elements(conv_res.document, model),
-                settings.perf.elements_batch_size,
-            ):
-                # TODO: currently we assume the element itself is modified, because
-                # we don't have an interface to save the element back to the document
-                for element in model(
-                    doc=conv_res.document, element_batch=element_batch
-                ):  # Must exhaust!
-                    pass
+        with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
+            for model in self.enrichment_pipe:
+                for element_batch in chunkify(
+                    _filter_elements(conv_res.document, model),
+                    settings.perf.elements_batch_size,
+                ):
+                    # TODO: currently we assume the element itself is modified, because
+                    # we don't have an interface to save the element back to the document
+                    for element in model(
+                        doc=conv_res.document, element_batch=element_batch
+                    ):  # Must exhaust!
+                        pass

         return conv_res

     @abstractmethod
-    def _determine_status(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionStatus:
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
         pass

     @classmethod
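
chunkify, imported from docling.utils.utils in the first hunk, feeds each enrichment model bounded batches of elements. A sketch of the assumed batching behavior:

    from itertools import islice
    from typing import Iterable, Iterator, List, TypeVar

    T = TypeVar("T")

    def chunkify(iterable: Iterable[T], size: int) -> Iterator[List[T]]:
        # Yield successive lists of at most `size` items from the iterable.
        it = iter(iterable)
        while batch := list(islice(it, size)):
            yield batch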
@@ -110,66 +107,68 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.

-    def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def _apply_on_pages(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
         for model in self.build_pipe:
-            page_batch = model(page_batch)
+            page_batch = model(conv_res, page_batch)

         yield from page_batch

-    def _build_document(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionResult:
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:

-        if not isinstance(in_doc._backend, PdfDocumentBackend):
+        if not isinstance(conv_res.input._backend, PdfDocumentBackend):
             raise RuntimeError(
-                f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
+                f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
                 f"Can not convert this with a PDF pipeline. "
                 f"Please check your format configuration on DocumentConverter."
             )
             # conv_res.status = ConversionStatus.FAILURE
             # return conv_res

-        for i in range(0, in_doc.page_count):
-            conv_res.pages.append(Page(page_no=i))
+        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
+            for i in range(0, conv_res.input.page_count):
+                conv_res.pages.append(Page(page_no=i))

-        try:
-            # Iterate batches of pages (page_batch_size) in the doc
-            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
-                start_pb_time = time.time()
-                # 1. Initialise the page resources
-                init_pages = map(
-                    functools.partial(self.initialize_page, in_doc), page_batch
-                )
+            try:
+                # Iterate batches of pages (page_batch_size) in the doc
+                for page_batch in chunkify(
+                    conv_res.pages, settings.perf.page_batch_size
+                ):
+                    start_pb_time = time.time()

+                    # 1. Initialise the page resources
+                    init_pages = map(
+                        functools.partial(self.initialize_page, conv_res), page_batch
+                    )

-                # 2. Run pipeline stages
-                pipeline_pages = self._apply_on_pages(init_pages)
+                    # 2. Run pipeline stages
+                    pipeline_pages = self._apply_on_pages(conv_res, init_pages)

-                for p in pipeline_pages:  # Must exhaust!
-                    pass
+                    for p in pipeline_pages:  # Must exhaust!
+                        pass

-                end_pb_time = time.time() - start_pb_time
-                _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
+                    end_pb_time = time.time() - start_pb_time
+                    _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")

-        except Exception as e:
-            conv_res.status = ConversionStatus.FAILURE
-            trace = "\n".join(traceback.format_exception(e))
-            _log.warning(
-                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
-                f"{trace}"
-            )
-            raise e
+            except Exception as e:
+                conv_res.status = ConversionStatus.FAILURE
+                trace = "\n".join(traceback.format_exception(e))
+                _log.warning(
+                    f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
+                    f"{trace}"
+                )
+                raise e

-        finally:
-            # Always unload the PDF backend, even in case of failure
-            if in_doc._backend:
-                in_doc._backend.unload()
+            finally:
+                # Always unload the PDF backend, even in case of failure
+                if conv_res.input._backend:
+                    conv_res.input._backend.unload()

         return conv_res

-    def _determine_status(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionStatus:
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
         status = ConversionStatus.SUCCESS
         for page in conv_res.pages:
             if page._backend is None or not page._backend.is_valid():
@@ -186,5 +185,5 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.

     # Initialise and load resources for a page
     @abstractmethod
-    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
+    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
         pass
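
With profiling enabled, callers can read the recorded document-scope timings off the ConversionResult. A usage sketch follows; the profile_pipeline_timings flag and the timings mapping are assumed from this PR, so exact field names should be checked against the source.

    from docling.datamodel.settings import settings
    from docling.document_converter import DocumentConverter

    settings.debug.profile_pipeline_timings = True  # enable TimeRecorder collection

    conv_res = DocumentConverter().convert("sample.pdf")

    # "doc_build", "doc_enrich" and "pipeline_total" are the document-scope
    # keys recorded in this diff; ProfilingItem.times is assumed to hold
    # elapsed seconds, with matching start_timestamps.
    for key in ("doc_build", "doc_enrich", "pipeline_total"):
        item = conv_res.timings.get(key)
        if item is not None:
            print(f"{key}: {sum(item.times):.3f}s")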