feat: Add pipeline timings and toggle visualization, establish debug settings (#183)

* Add settings to turn visualization on or off
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add profiling code to all models
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Refactor and fix profiling code
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Visualization code outputs PNG to debug dir
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fixes for time logging
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Optimize imports
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update lockfile
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add start_timestamps to ProfilingItem
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2024-10-30 15:04:19 +01:00
parent 94a5290789
commit 2a2c65bf4f
23 changed files with 998 additions and 771 deletions
@@ -19,6 +19,7 @@ from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BaseEnrichmentModel
+from docling.utils.profiling import ProfilingScope, TimeRecorder
 from docling.utils.utils import chunkify

 _log = logging.getLogger(__name__)
@@ -35,13 +36,16 @@ class BasePipeline(ABC):

        _log.info(f"Processing document {in_doc.file.name}")
        try:
-            # These steps are building and assembling the structure of the
-            # output DoclingDocument
-            conv_res = self._build_document(in_doc, conv_res)
-            conv_res = self._assemble_document(in_doc, conv_res)
-            # From this stage, all operations should rely only on conv_res.output
-            conv_res = self._enrich_document(in_doc, conv_res)
-            conv_res.status = self._determine_status(in_doc, conv_res)
+            with TimeRecorder(
+                conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
+            ):
+                # These steps are building and assembling the structure of the
+                # output DoclingDocument
+                conv_res = self._build_document(conv_res)
+                conv_res = self._assemble_document(conv_res)
+                # From this stage, all operations should rely only on conv_res.output
+                conv_res = self._enrich_document(conv_res)
+                conv_res.status = self._determine_status(conv_res)
        except Exception as e:
            conv_res.status = ConversionStatus.FAILURE
            if raises_on_error:
@@ -50,19 +54,13 @@ class BasePipeline(ABC):
        return conv_res

    @abstractmethod
-    def _build_document(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionResult:
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        pass

-    def _assemble_document(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionResult:
+    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
        return conv_res

-    def _enrich_document(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionResult:
+    def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:

        def _filter_elements(
            doc: DoclingDocument, model: BaseEnrichmentModel
@@ -71,24 +69,23 @@ class BasePipeline(ABC):
                if model.is_processable(doc=doc, element=element):
                    yield element

-        for model in self.enrichment_pipe:
-            for element_batch in chunkify(
-                _filter_elements(conv_res.document, model),
-                settings.perf.elements_batch_size,
-            ):
-                # TODO: currently we assume the element itself is modified, because
-                # we don't have an interface to save the element back to the document
-                for element in model(
-                    doc=conv_res.document, element_batch=element_batch
-                ):  # Must exhaust!
-                    pass
+        with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
+            for model in self.enrichment_pipe:
+                for element_batch in chunkify(
+                    _filter_elements(conv_res.document, model),
+                    settings.perf.elements_batch_size,
+                ):
+                    # TODO: currently we assume the element itself is modified, because
+                    # we don't have an interface to save the element back to the document
+                    for element in model(
+                        doc=conv_res.document, element_batch=element_batch
+                    ):  # Must exhaust!
+                        pass

        return conv_res

    @abstractmethod
-    def _determine_status(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionStatus:
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        pass

    @classmethod
@@ -110,66 +107,68 @@ class BasePipeline(ABC):

 class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.

-    def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def _apply_on_pages(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
        for model in self.build_pipe:
-            page_batch = model(page_batch)
+            page_batch = model(conv_res, page_batch)

        yield from page_batch

-    def _build_document(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionResult:
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:

-        if not isinstance(in_doc._backend, PdfDocumentBackend):
+        if not isinstance(conv_res.input._backend, PdfDocumentBackend):
            raise RuntimeError(
-                f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
+                f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
                f"Can not convert this with a PDF pipeline. "
                f"Please check your format configuration on DocumentConverter."
            )
            # conv_res.status = ConversionStatus.FAILURE
            # return conv_res

-        for i in range(0, in_doc.page_count):
-            conv_res.pages.append(Page(page_no=i))
+        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):

-        try:
-            # Iterate batches of pages (page_batch_size) in the doc
-            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
-                start_pb_time = time.time()
+            for i in range(0, conv_res.input.page_count):
+                conv_res.pages.append(Page(page_no=i))

-                # 1. Initialise the page resources
-                init_pages = map(
-                    functools.partial(self.initialize_page, in_doc), page_batch
+            try:
+                # Iterate batches of pages (page_batch_size) in the doc
+                for page_batch in chunkify(
+                    conv_res.pages, settings.perf.page_batch_size
+                ):
+                    start_pb_time = time.time()
+
+                    # 1. Initialise the page resources
+                    init_pages = map(
+                        functools.partial(self.initialize_page, conv_res), page_batch
+                    )
+
+                    # 2. Run pipeline stages
+                    pipeline_pages = self._apply_on_pages(conv_res, init_pages)
+
+                    for p in pipeline_pages:  # Must exhaust!
+                        pass
+
+                    end_pb_time = time.time() - start_pb_time
+                    _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
+
+            except Exception as e:
+                conv_res.status = ConversionStatus.FAILURE
+                trace = "\n".join(traceback.format_exception(e))
+                _log.warning(
+                    f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
+                    f"{trace}"
                )
+                raise e

-                # 2. Run pipeline stages
-                pipeline_pages = self._apply_on_pages(init_pages)
-
-                for p in pipeline_pages:  # Must exhaust!
-                    pass
-
-                end_pb_time = time.time() - start_pb_time
-                _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
-
-        except Exception as e:
-            conv_res.status = ConversionStatus.FAILURE
-            trace = "\n".join(traceback.format_exception(e))
-            _log.warning(
-                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
-                f"{trace}"
-            )
-            raise e
-
-        finally:
-            # Always unload the PDF backend, even in case of failure
-            if in_doc._backend:
-                in_doc._backend.unload()
+            finally:
+                # Always unload the PDF backend, even in case of failure
+                if conv_res.input._backend:
+                    conv_res.input._backend.unload()

        return conv_res

-    def _determine_status(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionStatus:
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        status = ConversionStatus.SUCCESS
        for page in conv_res.pages:
            if page._backend is None or not page._backend.is_valid():
@@ -186,5 +185,5 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.

    # Initialise and load resources for a page
    @abstractmethod
-    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
+    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
        pass
@@ -5,9 +5,10 @@ from docling.backend.abstract_backend import (
    DeclarativeDocumentBackend,
 )
 from docling.datamodel.base_models import ConversionStatus
-from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.pipeline.base_pipeline import BasePipeline
+from docling.utils.profiling import ProfilingScope, TimeRecorder

 _log = logging.getLogger(__name__)

@@ -22,13 +23,11 @@ class SimplePipeline(BasePipeline):
    def __init__(self, pipeline_options: PipelineOptions):
        super().__init__(pipeline_options)

-    def _build_document(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionResult:
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:

-        if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
+        if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
            raise RuntimeError(
-                f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
+                f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
                f"Can not convert this with simple pipeline. "
                f"Please check your format configuration on DocumentConverter."
            )
@@ -38,13 +37,11 @@ class SimplePipeline(BasePipeline):
        # Instead of running a page-level pipeline to build up the document structure,
        # the backend is expected to be of type DeclarativeDocumentBackend, which can output
        # a DoclingDocument straight.
-
-        conv_res.document = in_doc._backend.convert()
+        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
+            conv_res.document = conv_res.input._backend.convert()
        return conv_res

-    def _determine_status(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionStatus:
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        # This is called only if the previous steps didn't raise.
        # Since we don't have anything else to evaluate, we can
        # safely return SUCCESS.
@@ -7,7 +7,7 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import AssembledUnit, Page
-from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    PdfPipelineOptions,
@@ -27,6 +27,7 @@ from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
+from docling.utils.profiling import ProfilingScope, TimeRecorder

 _log = logging.getLogger(__name__)

@@ -119,73 +120,75 @@ class StandardPdfPipeline(PaginatedPipeline):
            )
        return None

-    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
-        page._backend = doc._backend.load_page(page.page_no)  # type: ignore
-        if page._backend is not None and page._backend.is_valid():
-            page.size = page._backend.get_size()
+    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
+        with TimeRecorder(conv_res, "page_init"):
+            page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
+            if page._backend is not None and page._backend.is_valid():
+                page.size = page._backend.get_size()

        return page

-    def _assemble_document(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionResult:
+    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
        all_elements = []
        all_headers = []
        all_body = []

-        for p in conv_res.pages:
-            if p.assembled is not None:
-                for el in p.assembled.body:
-                    all_body.append(el)
-                for el in p.assembled.headers:
-                    all_headers.append(el)
-                for el in p.assembled.elements:
-                    all_elements.append(el)
+        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
+            for p in conv_res.pages:
+                if p.assembled is not None:
+                    for el in p.assembled.body:
+                        all_body.append(el)
+                    for el in p.assembled.headers:
+                        all_headers.append(el)
+                    for el in p.assembled.elements:
+                        all_elements.append(el)

-        conv_res.assembled = AssembledUnit(
-            elements=all_elements, headers=all_headers, body=all_body
-        )
+            conv_res.assembled = AssembledUnit(
+                elements=all_elements, headers=all_headers, body=all_body
+            )

-        conv_res.document = self.glm_model(conv_res)
+            conv_res.document = self.glm_model(conv_res)

-        # Generate page images in the output
-        if self.pipeline_options.generate_page_images:
-            for page in conv_res.pages:
-                assert page.image is not None
-                page_no = page.page_no + 1
-                conv_res.document.pages[page_no].image = ImageRef.from_pil(
-                    page.image, dpi=int(72 * self.pipeline_options.images_scale)
-                )
-
-        # Generate images of the requested element types
-        if (
-            self.pipeline_options.generate_picture_images
-            or self.pipeline_options.generate_table_images
-        ):
-            scale = self.pipeline_options.images_scale
-            for element, _level in conv_res.document.iterate_items():
-                if not isinstance(element, DocItem) or len(element.prov) == 0:
-                    continue
-                if (
-                    isinstance(element, PictureItem)
-                    and self.pipeline_options.generate_picture_images
-                ) or (
-                    isinstance(element, TableItem)
-                    and self.pipeline_options.generate_table_images
-                ):
-                    page_ix = element.prov[0].page_no - 1
-                    page = conv_res.pages[page_ix]
-                    assert page.size is not None
+            # Generate page images in the output
+            if self.pipeline_options.generate_page_images:
+                for page in conv_res.pages:
                    assert page.image is not None
-
-                    crop_bbox = (
-                        element.prov[0]
-                        .bbox.scaled(scale=scale)
-                        .to_top_left_origin(page_height=page.size.height * scale)
+                    page_no = page.page_no + 1
+                    conv_res.document.pages[page_no].image = ImageRef.from_pil(
+                        page.image, dpi=int(72 * self.pipeline_options.images_scale)
                    )

-                    cropped_im = page.image.crop(crop_bbox.as_tuple())
-                    element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale))
+            # Generate images of the requested element types
+            if (
+                self.pipeline_options.generate_picture_images
+                or self.pipeline_options.generate_table_images
+            ):
+                scale = self.pipeline_options.images_scale
+                for element, _level in conv_res.document.iterate_items():
+                    if not isinstance(element, DocItem) or len(element.prov) == 0:
+                        continue
+                    if (
+                        isinstance(element, PictureItem)
+                        and self.pipeline_options.generate_picture_images
+                    ) or (
+                        isinstance(element, TableItem)
+                        and self.pipeline_options.generate_table_images
+                    ):
+                        page_ix = element.prov[0].page_no - 1
+                        page = conv_res.pages[page_ix]
+                        assert page.size is not None
+                        assert page.image is not None
+
+                        crop_bbox = (
+                            element.prov[0]
+                            .bbox.scaled(scale=scale)
+                            .to_top_left_origin(page_height=page.size.height * scale)
+                        )
+
+                        cropped_im = page.image.crop(crop_bbox.as_tuple())
+                        element.image = ImageRef.from_pil(
+                            cropped_im, dpi=int(72 * scale)
+                        )

        return conv_res