feat!: Docling v2 (#117)
--------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Maxim Lysak <mly@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Maxim Lysak <mly@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
57
docling/models/page_preprocessing_model.py
Normal file
57
docling/models/page_preprocessing_model.py
Normal file
@@ -0,0 +1,57 @@
|
||||
from typing import Iterable, Optional
|
||||
|
||||
from PIL import ImageDraw
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.models.base_model import BasePageModel
|
||||
|
||||
|
||||
class PagePreprocessingOptions(BaseModel):
    """Options consumed by ``PagePreprocessingModel``."""

    # Additional scale at which the page image is rendered, on top of the
    # default scale of 1.0. ``None`` means no additional rendering is requested.
    images_scale: Optional[float]
|
||||
|
||||
|
||||
class PagePreprocessingModel(BasePageModel):
    """Page-level step that renders page images and extracts text cells.

    For each page it requests the page image through ``page.get_image`` (at
    scale 1.0 and, if configured, at ``options.images_scale``) and then stores
    the text cells reported by the page backend in ``page.cells``.
    """

    def __init__(self, options: PagePreprocessingOptions):
        """Keep a reference to the preprocessing options.

        Args:
            options: Settings for this step; only ``images_scale`` is read.
        """
        self.options = options

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Preprocess pages lazily, yielding them in input order.

        Args:
            page_batch: The pages to preprocess.

        Yields:
            Each page after its images have been requested and its cells
            have been populated.
        """
        for page in page_batch:
            page = self._populate_page_images(page)
            page = self._parse_page_cells(page)
            yield page

    # Generate the page image and store it in the page object
    def _populate_page_images(self, page: Page) -> Page:
        """Request the page image at the default and the configured scale.

        Args:
            page: The page whose image(s) should be generated.

        Returns:
            The same page object that was passed in.
        """
        # default scale
        page.get_image(
            scale=1.0
        )  # puts the page image on the image cache at default scale

        images_scale = self.options.images_scale
        # user requested scales
        if images_scale is not None:
            # Record the requested scale on the page before rendering at it.
            page._default_image_scale = images_scale
            page.get_image(
                scale=images_scale
            )  # this will trigger storing the image in the internal cache

        return page

    # Extract and populate the page cells and store it in the page object
    def _parse_page_cells(self, page: Page) -> Page:
        """Fill ``page.cells`` with the text cells from the page backend.

        Args:
            page: The page to populate; its ``_backend`` must be set.

        Returns:
            The same page object, with ``cells`` replaced by a new list.

        Raises:
            AssertionError: If ``page._backend`` is ``None``.
        """
        assert page._backend is not None

        # Materialize the backend's cells into a list owned by the page.
        page.cells = list(page._backend.get_text_cells())

        return page

    @staticmethod
    def _draw_text_boxes(image, cells) -> None:
        """Debug aid: outline each cell's bounding box in red and show the image.

        Not called during normal processing. To inspect a page manually, call
        ``self._draw_text_boxes(page.get_image(scale=1.0), page.cells)``.

        Args:
            image: The image to draw on; it is modified in place.
                NOTE(review): if this is the page's cached image, the
                outlines would persist in the cache - confirm before use.
            cells: Items exposing ``bbox.as_tuple()`` that unpacks to
                ``(x0, y0, x1, y1)``.
        """
        draw = ImageDraw.Draw(image)
        for c in cells:
            x0, y0, x1, y1 = c.bbox.as_tuple()
            draw.rectangle([(x0, y0), (x1, y1)], outline="red")
        image.show()
|
||||
Reference in New Issue
Block a user