feat: allow computing page images on-demand with scale and cache them (#36)

* feat: allow computing page images on-demand and cache them

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* feat: expose scale for export of page images and document elements

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix comment

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-08-20 13:27:19 +02:00 committed by GitHub
parent c253dd743a
commit 78347bf679
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 104 additions and 77 deletions

View File

@ -84,7 +84,9 @@ class DoclingParsePageBackend(PdfPageBackend):
cell_counter += 1 cell_counter += 1
def draw_clusters_and_cells(): def draw_clusters_and_cells():
image = self.get_page_image() image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image) draw = ImageDraw.Draw(image)
for c in cells: for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple() x0, y0, x1, y1 = c.bbox.as_tuple()

View File

@ -134,7 +134,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
return merged_cells return merged_cells
def draw_clusters_and_cells(): def draw_clusters_and_cells():
image = self.get_page_image() image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image) draw = ImageDraw.Draw(image)
for c in cells: for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple() x0, y0, x1, y1 = c.bbox.as_tuple()

View File

@ -1,10 +1,12 @@
import copy import copy
import warnings
from enum import Enum, auto from enum import Enum, auto
from io import BytesIO from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
from PIL.Image import Image from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, model_validator from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self
from docling.backend.abstract_backend import PdfPageBackend from docling.backend.abstract_backend import PdfPageBackend
@ -234,14 +236,30 @@ class Page(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True) model_config = ConfigDict(arbitrary_types_allowed=True)
page_no: int page_no: int
page_hash: str = None page_hash: Optional[str] = None
size: PageSize = None size: Optional[PageSize] = None
image: Image = None
cells: List[Cell] = None cells: List[Cell] = None
predictions: PagePredictions = PagePredictions() predictions: PagePredictions = PagePredictions()
assembled: AssembledUnit = None assembled: Optional[AssembledUnit] = None
_backend: PdfPageBackend = None # Internal PDF backend _backend: Optional[PdfPageBackend] = (
None # Internal PDF backend. By default it is cleared during assembling.
)
_default_image_scale: float = 1.0 # Default image scale for external usage.
_image_cache: Dict[float, Image] = (
{}
) # Cache of images in different scales. By default it is cleared during assembling.
def get_image(self, scale: float = 1.0) -> Optional[Image]:
    """Return the page image rendered at ``scale``, caching the result.

    Args:
        scale: Scale passed to the backend's ``get_page_image``; it is also
            the key under which the rendered image is cached.

    Returns:
        The cached image for ``scale``, rendering and caching it first if a
        backend is attached. Without a backend only the cache is consulted,
        and ``None`` is returned when no image is cached for ``scale``.
    """
    if self._backend is None:
        # Nothing new can be rendered without a backend; serve cached images only.
        return self._image_cache.get(scale)
    if scale not in self._image_cache:
        self._image_cache[scale] = self._backend.get_page_image(scale=scale)
    return self._image_cache[scale]
@property
def image(self) -> Optional[Image]:
    """Page image at this page's default scale (``_default_image_scale``).

    Delegates to ``get_image``, so the result is rendered on demand and
    cached, and may be ``None`` under the same conditions as ``get_image``.
    """
    default_scale = self._default_image_scale
    return self.get_image(scale=default_scale)
class DocumentStream(BaseModel): class DocumentStream(BaseModel):
@ -268,6 +286,19 @@ class PipelineOptions(BaseModel):
class AssembleOptions(BaseModel): class AssembleOptions(BaseModel):
keep_page_images: bool = ( keep_page_images: Annotated[
False # False: page images are removed in the assemble step bool,
) Field(
deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
),
] = False # False: page images are removed in the assemble step
images_scale: Optional[float] = None # if set, the scale for generated images
@model_validator(mode="after")
def set_page_images_from_deprecated(self) -> Self:
    """Map the deprecated ``keep_page_images`` flag onto ``images_scale``.

    If ``keep_page_images`` is set and no explicit ``images_scale`` was
    given, ``images_scale`` is set to 1.0. An explicit ``images_scale`` is
    never overwritten.

    Returns:
        The validated model instance.
    """
    fallback_scale = 1.0
    # The deprecated field is read inside this context so that any
    # DeprecationWarning raised by the access is not shown to the user.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        if self.keep_page_images and self.images_scale is None:
            self.images_scale = fallback_scale
    return self

View File

@ -1,7 +1,7 @@
import logging import logging
from io import BytesIO from io import BytesIO
from pathlib import Path, PurePath from pathlib import Path, PurePath
from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
from docling_core.types import BaseCell, BaseText from docling_core.types import BaseCell, BaseText
from docling_core.types import BoundingBox as DsBoundingBox from docling_core.types import BoundingBox as DsBoundingBox
@ -21,6 +21,7 @@ from docling.datamodel.base_models import (
DocumentStream, DocumentStream,
FigureElement, FigureElement,
Page, Page,
PageElement,
TableElement, TableElement,
TextElement, TextElement,
) )
@ -302,6 +303,20 @@ class ConvertedDocument(BaseModel):
else: else:
return "" return ""
def render_element_images(
    self, element_types: Tuple[Type[PageElement], ...] = (FigureElement,)
):
    """Yield ``(element, image)`` pairs for assembled elements of given types.

    Each image is the element's cluster bounding box cropped out of the
    image of the page the element belongs to, at that page's default image
    scale.

    Args:
        element_types: Element classes to render; used as the second
            argument of ``isinstance``.

    Yields:
        A tuple of the matching element and its cropped page image.

    NOTE(review): ``Page.image`` is typed ``Optional``; this method assumes
    the page image is available and does not guard against ``None`` —
    confirm that callers keep page images before using it.
    """
    for element in self.assembled.elements:
        if not isinstance(element, element_types):
            continue
        page = self.pages[element.page_no]
        scale = page._default_image_scale
        # Scale the bbox and the page height by the same factor so the crop
        # region matches the resolution of the rendered page image.
        crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
            page_height=page.size.height * scale
        )
        yield element, page.image.crop(crop_bbox.as_tuple())
class DocumentConversionInput(BaseModel): class DocumentConversionInput(BaseModel):

View File

@ -188,10 +188,8 @@ class DocumentConverter:
# Free up mem resources before moving on with next batch # Free up mem resources before moving on with next batch
# Remove page images (can be disabled) # Remove page images (can be disabled)
if not self.assemble_options.keep_page_images: if self.assemble_options.images_scale is None:
assembled_page.image = ( assembled_page._image_cache = {}
None # Comment this if you want to visualize page images
)
# Unload backend # Unload backend
assembled_page._backend.unload() assembled_page._backend.unload()
@ -231,7 +229,15 @@ class DocumentConverter:
# Generate the page image and store it in the page object # Generate the page image and store it in the page object
def populate_page_images(self, doc: InputDocument, page: Page) -> Page: def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
page.image = page._backend.get_page_image() # default scale
page.get_image(scale=1.0)
# user requested scales
if self.assemble_options.images_scale is not None:
page._default_image_scale = self.assemble_options.images_scale
page.get_image(
scale=self.assemble_options.images_scale
) # this will trigger storing the image in the internal cache
return page return page
@ -247,7 +253,7 @@ class DocumentConverter:
draw.rectangle([(x0, y0), (x1, y1)], outline="red") draw.rectangle([(x0, y0), (x1, y1)], outline="red")
image.show() image.show()
# draw_text_boxes(page.image, cells) # draw_text_boxes(page.get_image(scale=1.0), cells)
return page return page

View File

@ -30,7 +30,7 @@ class EasyOcrModel:
for page in page_batch: for page in page_batch:
# rects = page._fpage. # rects = page._fpage.
high_res_image = page._backend.get_page_image(scale=self.scale) high_res_image = page.get_image(scale=self.scale)
im = numpy.array(high_res_image) im = numpy.array(high_res_image)
result = self.reader.readtext(im) result = self.reader.readtext(im)

View File

@ -267,7 +267,9 @@ class LayoutModel:
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch: for page in page_batch:
clusters = [] clusters = []
for ix, pred_item in enumerate(self.layout_predictor.predict(page.image)): for ix, pred_item in enumerate(
self.layout_predictor.predict(page.get_image(scale=1.0))
):
cluster = Cluster( cluster = Cluster(
id=ix, id=ix,
label=pred_item["label"], label=pred_item["label"],

View File

@ -34,7 +34,9 @@ class TableStructureModel:
self.scale = 2.0 # Scale up table input images to 144 dpi self.scale = 2.0 # Scale up table input images to 144 dpi
def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]): def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
image = page._backend.get_page_image() image = (
page._backend.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image) draw = ImageDraw.Draw(image)
for table_element in tbl_list: for table_element in tbl_list:
@ -94,13 +96,7 @@ class TableStructureModel:
"width": page.size.width * self.scale, "width": page.size.width * self.scale,
"height": page.size.height * self.scale, "height": page.size.height * self.scale,
} }
# add image to page input. page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
if self.scale == 1.0:
page_input["image"] = numpy.asarray(page.image)
else: # render new page image on the fly at desired scale
page_input["image"] = numpy.asarray(
page._backend.get_page_image(scale=self.scale)
)
table_clusters, table_bboxes = zip(*in_tables) table_clusters, table_bboxes = zip(*in_tables)

View File

@ -15,44 +15,7 @@ from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
IMAGE_RESOLUTION_SCALE = 2.0
def export_page_images(
doc: ConvertedDocument,
output_dir: Path,
):
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = doc.input.file.stem
for page in doc.pages:
page_no = page.page_no + 1
page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
with page_image_filename.open("wb") as fp:
page.image.save(fp, format="PNG")
def export_element_images(
doc: ConvertedDocument,
output_dir: Path,
allowed_element_types: Tuple[PageElement] = (FigureElement,),
):
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = doc.input.file.stem
for element_ix, element in enumerate(doc.assembled.elements):
if isinstance(element, allowed_element_types):
page_ix = element.page_no
crop_bbox = element.cluster.bbox.to_top_left_origin(
page_height=doc.pages[page_ix].size.height
)
cropped_im = doc.pages[page_ix].image.crop(crop_bbox.as_tuple())
element_image_filename = (
output_dir / f"{doc_filename}-element-{element_ix}.png"
)
with element_image_filename.open("wb") as fp:
cropped_im.save(fp, "PNG")
def main(): def main():
@ -61,13 +24,16 @@ def main():
input_doc_paths = [ input_doc_paths = [
Path("./test/data/2206.01062.pdf"), Path("./test/data/2206.01062.pdf"),
] ]
output_dir = Path("./scratch")
input_files = DocumentConversionInput.from_paths(input_doc_paths) input_files = DocumentConversionInput.from_paths(input_doc_paths)
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory. # will destroy them for cleaning up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
# scale=1 corresponds to a standard 72 DPI image
assemble_options = AssembleOptions() assemble_options = AssembleOptions()
assemble_options.keep_page_images = True assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
doc_converter = DocumentConverter(assemble_options=assemble_options) doc_converter = DocumentConverter(assemble_options=assemble_options)
@ -75,23 +41,30 @@ def main():
converted_docs = doc_converter.convert(input_files) converted_docs = doc_converter.convert(input_files)
output_dir.mkdir(parents=True, exist_ok=True)
for doc in converted_docs: for doc in converted_docs:
if doc.status != ConversionStatus.SUCCESS: if doc.status != ConversionStatus.SUCCESS:
_log.info(f"Document {doc.input.file} failed to convert.") _log.info(f"Document {doc.input.file} failed to convert.")
continue continue
# Export page images doc_filename = doc.input.file.stem
export_page_images(doc, output_dir=Path("./scratch"))
# Export figures # Export page images
# export_element_images(doc, output_dir=Path("./scratch"), allowed_element_types=(FigureElement,)) for page in doc.pages:
page_no = page.page_no + 1
page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
with page_image_filename.open("wb") as fp:
page.image.save(fp, format="PNG")
# Export figures and tables # Export figures and tables
export_element_images( for element, image in doc.render_element_images(
doc, element_types=(FigureElement, TableElement)
output_dir=Path("./scratch"), ):
allowed_element_types=(FigureElement, TableElement), element_image_filename = (
output_dir / f"{doc_filename}-element-{element.id}.png"
) )
with element_image_filename.open("wb") as fp:
image.save(fp, "PNG")
end_time = time.time() - start_time end_time = time.time() - start_time