feat: allow computing page images on-demand with scale and cache them (#36)

* feat: allow computing page images on-demand and cache them Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * feat: expose scale for export of page images and document elements Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix comment Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2024-08-20 13:27:19 +02:00
parent c253dd743a
commit 78347bf679
9 changed files with 104 additions and 77 deletions
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@@ -84,7 +84,9 @@ class DoclingParsePageBackend(PdfPageBackend):
            cell_counter += 1
        def draw_clusters_and_cells():
-            image = self.get_page_image()
+            image = (
                self.get_page_image()
            )  # make new image to avoid drawing on the saved ones
            draw = ImageDraw.Draw(image)
            for c in cells:
                x0, y0, x1, y1 = c.bbox.as_tuple()
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@@ -134,7 +134,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
            return merged_cells
        def draw_clusters_and_cells():
-            image = self.get_page_image()
+            image = (
                self.get_page_image()
            )  # make new image to avoid drawing on the saved ones
            draw = ImageDraw.Draw(image)
            for c in cells:
                x0, y0, x1, y1 = c.bbox.as_tuple()
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -1,10 +1,12 @@
 import copy
 import warnings
 from enum import Enum, auto
 from io import BytesIO
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
 from PIL.Image import Image
-from pydantic import BaseModel, ConfigDict, model_validator
+from pydantic import BaseModel, ConfigDict, Field, model_validator
 from typing_extensions import Self
 from docling.backend.abstract_backend import PdfPageBackend
@@ -234,14 +236,30 @@ class Page(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)
    page_no: int
-    page_hash: str = None
+    page_hash: Optional[str] = None
-    size: PageSize = None
+    size: Optional[PageSize] = None
    image: Image = None
    cells: List[Cell] = None
    predictions: PagePredictions = PagePredictions()
-    assembled: AssembledUnit = None
+    assembled: Optional[AssembledUnit] = None
-    _backend: PdfPageBackend = None  # Internal PDF backend
+    _backend: Optional[PdfPageBackend] = (
        None  # Internal PDF backend. By default it is cleared during assembling.
    )
    _default_image_scale: float = 1.0  # Default image scale for external usage.
    _image_cache: Dict[float, Image] = (
        {}
    )  # Cache of images in different scales. By default it is cleared during assembling.
    def get_image(self, scale: float = 1.0) -> Optional[Image]:
        """Return the page image rendered at ``scale``, caching it per scale.

        Args:
            scale: Scale factor forwarded to the backend's ``get_page_image``.

        Returns:
            The image stored in ``_image_cache`` for ``scale``. When no backend
            is attached, only the cache is consulted and ``None`` is returned
            on a miss.
        """
        if self._backend is None:
            # Nothing can be rendered without a backend; serve what was cached.
            return self._image_cache.get(scale)
        if scale not in self._image_cache:
            # First request at this scale: render once and remember the result.
            self._image_cache[scale] = self._backend.get_page_image(scale=scale)
        return self._image_cache[scale]
    @property
    def image(self) -> Optional[Image]:
        """Page image at the default scale, as returned by ``get_image``."""
        default_scale = self._default_image_scale
        return self.get_image(scale=default_scale)
 class DocumentStream(BaseModel):
@@ -268,6 +286,19 @@ class PipelineOptions(BaseModel):
 class AssembleOptions(BaseModel):
-    keep_page_images: bool = (
+    keep_page_images: Annotated[
-        False  # False: page images are removed in the assemble step
+        bool,
-    )
+        Field(
            deprecated="`keep_page_images` is deprecated, set the value of `images_scale` instead"
        ),
    ] = False  # False: page images are removed in the assemble step
    images_scale: Optional[float] = None  # if set, the scale for generated images
    @model_validator(mode="after")
    def set_page_images_from_deprecated(self) -> Self:
        """Map the deprecated ``keep_page_images`` flag onto ``images_scale``.

        If ``keep_page_images`` is truthy and ``images_scale`` is still
        ``None``, ``images_scale`` is set to 1.0. An explicitly provided
        ``images_scale`` is never overwritten.

        Returns:
            The validated model instance itself.
        """
        fallback_scale = 1.0
        with warnings.catch_warnings():
            # Silence DeprecationWarning while the deprecated field is read
            # here (presumably emitted by pydantic on access — confirm).
            warnings.simplefilter("ignore", DeprecationWarning)
            legacy_keep_requested = self.keep_page_images
            if legacy_keep_requested and self.images_scale is None:
                self.images_scale = fallback_scale
        return self
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -1,7 +1,7 @@
 import logging
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
+from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
 from docling_core.types import BaseCell, BaseText
 from docling_core.types import BoundingBox as DsBoundingBox
@@ -21,6 +21,7 @@ from docling.datamodel.base_models import (
    DocumentStream,
    FigureElement,
    Page,
    PageElement,
    TableElement,
    TextElement,
 )
@@ -302,6 +303,20 @@ class ConvertedDocument(BaseModel):
        else:
            return ""
    def render_element_images(
        self, element_types: Tuple[Type[PageElement], ...] = (FigureElement,)
    ):
        """Yield assembled elements of the given types with their cropped images.

        For each element in ``self.assembled.elements`` that is an instance of
        one of ``element_types``, the element's cluster bounding box is scaled
        by the page's ``_default_image_scale``, converted to a top-left origin
        and used to crop the page's ``image``.

        Args:
            element_types: Classes passed to ``isinstance`` to select elements.

        Yields:
            ``(element, cropped_image)`` pairs, in the order of
            ``self.assembled.elements``.

        NOTE(review): this relies on ``page.image`` and ``page.size`` being
        available; presumably the page images must have been kept during
        conversion — confirm against the converter options.
        """
        for element in self.assembled.elements:
            if not isinstance(element, element_types):
                continue
            # ``page_no`` is used directly as the index into ``self.pages``.
            page = self.pages[element.page_no]
            scale = page._default_image_scale
            # Scale the box and the page height together so the crop matches
            # the image rendered at the default scale.
            crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
                page_height=page.size.height * scale
            )
            yield element, page.image.crop(crop_bbox.as_tuple())
 class DocumentConversionInput(BaseModel):
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -188,10 +188,8 @@ class DocumentConverter:
                    # Free up mem resources before moving on with next batch
                    # Remove page images (can be disabled)
-                    if not self.assemble_options.keep_page_images:
+                    if self.assemble_options.images_scale is None:
-                        assembled_page.image = (
+                        assembled_page._image_cache = {}
                            None  # Comment this if you want to visualize page images
                        )
                    # Unload backend
                    assembled_page._backend.unload()
@@ -231,7 +229,15 @@ class DocumentConverter:
    # Generate the page image and store it in the page object
    def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
-        page.image = page._backend.get_page_image()
+        # default scale
        page.get_image(scale=1.0)
        # user requested scales
        if self.assemble_options.images_scale is not None:
            page._default_image_scale = self.assemble_options.images_scale
            page.get_image(
                scale=self.assemble_options.images_scale
            )  # this will trigger storing the image in the internal cache
        return page
@@ -247,7 +253,7 @@ class DocumentConverter:
                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
            image.show()
-        # draw_text_boxes(page.image, cells)
+        # draw_text_boxes(page.get_image(scale=1.0), cells)
        return page
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@@ -30,7 +30,7 @@ class EasyOcrModel:
        for page in page_batch:
            # rects = page._fpage.
-            high_res_image = page._backend.get_page_image(scale=self.scale)
+            high_res_image = page.get_image(scale=self.scale)
            im = numpy.array(high_res_image)
            result = self.reader.readtext(im)
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -267,7 +267,9 @@ class LayoutModel:
    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        for page in page_batch:
            clusters = []
-            for ix, pred_item in enumerate(self.layout_predictor.predict(page.image)):
+            for ix, pred_item in enumerate(
                self.layout_predictor.predict(page.get_image(scale=1.0))
            ):
                cluster = Cluster(
                    id=ix,
                    label=pred_item["label"],
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -34,7 +34,9 @@ class TableStructureModel:
            self.scale = 2.0  # Scale up table input images to 144 dpi
    def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
-        image = page._backend.get_page_image()
+        image = (
            page._backend.get_page_image()
        )  # make new image to avoid drawing on the saved ones
        draw = ImageDraw.Draw(image)
        for table_element in tbl_list:
@@ -94,13 +96,7 @@ class TableStructureModel:
                "width": page.size.width * self.scale,
                "height": page.size.height * self.scale,
            }
-            # add image to page input.
+            page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
            if self.scale == 1.0:
                page_input["image"] = numpy.asarray(page.image)
            else:  # render new page image on the fly at desired scale
                page_input["image"] = numpy.asarray(
                    page._backend.get_page_image(scale=self.scale)
                )
            table_clusters, table_bboxes = zip(*in_tables)
--- a/examples/export_figures.py
+++ b/examples/export_figures.py
@@ -15,44 +15,7 @@ from docling.document_converter import DocumentConverter
 _log = logging.getLogger(__name__)
-
+IMAGE_RESOLUTION_SCALE = 2.0
 def export_page_images(doc: ConvertedDocument, output_dir: Path):
    """Save every page image of ``doc`` as a PNG file inside ``output_dir``.

    The directory is created if missing. Files are named
    ``<input file stem>-<page number>.png`` with 1-based page numbers.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    stem = doc.input.file.stem
    for current_page in doc.pages:
        # ``page_no`` is shifted by one so file names start at 1.
        human_page_no = current_page.page_no + 1
        target_path = output_dir / f"{stem}-{human_page_no}.png"
        with target_path.open("wb") as fp:
            current_page.image.save(fp, format="PNG")
 def export_element_images(
    doc: ConvertedDocument,
    output_dir: Path,
    allowed_element_types: Tuple[PageElement] = (FigureElement,),
 ):
    """Crop each selected document element out of its page image and save it.

    Elements of ``doc.assembled.elements`` that are instances of
    ``allowed_element_types`` are cropped from the image of the page they
    belong to and written to ``output_dir`` as
    ``<input file stem>-element-<index>.png``, where the index is the
    element's position in ``doc.assembled.elements``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    stem = doc.input.file.stem
    for position, item in enumerate(doc.assembled.elements):
        if not isinstance(item, allowed_element_types):
            continue
        # ``page_no`` is used directly as the index into ``doc.pages``.
        owning_page = doc.pages[item.page_no]
        # Convert the box to a top-left origin before cropping the image.
        top_left_box = item.cluster.bbox.to_top_left_origin(
            page_height=owning_page.size.height
        )
        snippet = owning_page.image.crop(top_left_box.as_tuple())
        target_path = output_dir / f"{stem}-element-{position}.png"
        with target_path.open("wb") as fp:
            snippet.save(fp, "PNG")
 def main():
@@ -61,13 +24,16 @@ def main():
    input_doc_paths = [
        Path("./test/data/2206.01062.pdf"),
    ]
    output_dir = Path("./scratch")
    input_files = DocumentConversionInput.from_paths(input_doc_paths)
    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
    # scale=1 corresponds to a standard 72 DPI image
    assemble_options = AssembleOptions()
-    assemble_options.keep_page_images = True
+    assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
    doc_converter = DocumentConverter(assemble_options=assemble_options)
@@ -75,23 +41,30 @@ def main():
    converted_docs = doc_converter.convert(input_files)
    output_dir.mkdir(parents=True, exist_ok=True)
    for doc in converted_docs:
        if doc.status != ConversionStatus.SUCCESS:
            _log.info(f"Document {doc.input.file} failed to convert.")
            continue
-        # Export page images
+        doc_filename = doc.input.file.stem
        export_page_images(doc, output_dir=Path("./scratch"))
-        # Export figures
+        # Export page images
-        # export_element_images(doc, output_dir=Path("./scratch"), allowed_element_types=(FigureElement,))
+        for page in doc.pages:
            page_no = page.page_no + 1
            page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
            with page_image_filename.open("wb") as fp:
                page.image.save(fp, format="PNG")
        # Export figures and tables
-        export_element_images(
+        for element, image in doc.render_element_images(
-            doc,
+            element_types=(FigureElement, TableElement)
-            output_dir=Path("./scratch"),
+        ):
-            allowed_element_types=(FigureElement, TableElement),
+            element_image_filename = (
                output_dir / f"{doc_filename}-element-{element.id}.png"
            )
            with element_image_filename.open("wb") as fp:
                image.save(fp, "PNG")
    end_time = time.time() - start_time