
* Upgraded Layout Postprocessing, sending old code back to ERZ Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Implement hierachical cluster layout processing Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Pass nested cluster processing through full pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Pass nested clusters through GLM as payload Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Move to_docling_document from ds-glm to this repo Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Clean up imports again Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * feat(Accelerator): Introduce options to control the num_threads and device from API, envvars, CLI. - Introduce the AcceleratorOptions, AcceleratorDevice and use them to set the device where the models run. - Introduce the accelerator_utils with function to decide the device and resolve the AUTO setting. - Refactor the way how the docling-ibm-models are called to match the new init signature of models. - Translate the accelerator options to the specific inputs for third-party models. - Extend the docling CLI with parameters to set the num_threads and device. - Add new unit tests. - Write new example how to use the accelerator options. * fix: Improve the pydantic objects in the pipeline_options and imports. Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * fix: TableStructureModel: Refactor the artifacts path to use the new structure for fast/accurate model Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * Updated test ground-truth Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Updated test ground-truth (again), bugfix for empty layout Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: Do proper check to set the device in EasyOCR, RapidOCR. Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * fix: Correct the way to set GPU for EasyOCR, RapidOCR Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * fix: Ocr AccleratorDevice Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * Merge pull request #556 from DS4SD/cau/layout-processing-improvement feat: layout processing improvements and bugfixes * Update lockfile Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update HF model ref, reset test generate Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Repin to release package versions Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Many layout processing improvements, add document index type Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update pinnings to docling-core Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update test GT Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix table box snapping Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes for cluster pre-ordering Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Introduce OCR confidence, propagate to orphan in post-processing Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix form and key value area groups Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Adjust confidence in EasyOcr Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Roll back CLI changes from main Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update test GT Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update docling-core pinning Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Annoying fixes for historical python versions Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Updated test GT for legacy Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Comment cleanup Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> Co-authored-by: Nikos Livathinos <nli@zurich.ibm.com>
235 lines
9.6 KiB
Python
235 lines
9.6 KiB
Python
import copy
|
|
import logging
|
|
import random
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Iterable, List
|
|
|
|
from docling_core.types.doc import CoordOrigin, DocItemLabel
|
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
|
from PIL import Image, ImageDraw, ImageFont
|
|
|
|
from docling.datamodel.base_models import (
|
|
BoundingBox,
|
|
Cell,
|
|
Cluster,
|
|
LayoutPrediction,
|
|
Page,
|
|
)
|
|
from docling.datamodel.document import ConversionResult
|
|
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
|
|
from docling.datamodel.settings import settings
|
|
from docling.models.base_model import BasePageModel
|
|
from docling.utils.accelerator_utils import decide_device
|
|
from docling.utils.layout_postprocessor import LayoutPostprocessor
|
|
from docling.utils.profiling import TimeRecorder
|
|
|
|
_log = logging.getLogger(__name__)
|
|
|
|
|
|
class LayoutModel(BasePageModel):
|
|
|
|
TEXT_ELEM_LABELS = [
|
|
DocItemLabel.TEXT,
|
|
DocItemLabel.FOOTNOTE,
|
|
DocItemLabel.CAPTION,
|
|
DocItemLabel.CHECKBOX_UNSELECTED,
|
|
DocItemLabel.CHECKBOX_SELECTED,
|
|
DocItemLabel.SECTION_HEADER,
|
|
DocItemLabel.PAGE_HEADER,
|
|
DocItemLabel.PAGE_FOOTER,
|
|
DocItemLabel.CODE,
|
|
DocItemLabel.LIST_ITEM,
|
|
# "Formula",
|
|
]
|
|
PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
|
|
|
|
TABLE_LABELS = [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
|
|
FIGURE_LABEL = DocItemLabel.PICTURE
|
|
FORMULA_LABEL = DocItemLabel.FORMULA
|
|
CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
|
|
|
|
def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
|
|
device = decide_device(accelerator_options.device)
|
|
|
|
self.layout_predictor = LayoutPredictor(
|
|
artifact_path=str(artifacts_path),
|
|
device=device,
|
|
num_threads=accelerator_options.num_threads,
|
|
)
|
|
|
|
def draw_clusters_and_cells_side_by_side(
|
|
self, conv_res, page, clusters, mode_prefix: str, show: bool = False
|
|
):
|
|
"""
|
|
Draws a page image side by side with clusters filtered into two categories:
|
|
- Left: Clusters excluding FORM, KEY_VALUE_REGION, and PICTURE.
|
|
- Right: Clusters including FORM, KEY_VALUE_REGION, and PICTURE.
|
|
Includes label names and confidence scores for each cluster.
|
|
"""
|
|
label_to_color = {
|
|
DocItemLabel.TEXT: (255, 255, 153), # Light Yellow
|
|
DocItemLabel.CAPTION: (255, 204, 153), # Light Orange
|
|
DocItemLabel.LIST_ITEM: (153, 153, 255), # Light Purple
|
|
DocItemLabel.FORMULA: (192, 192, 192), # Gray
|
|
DocItemLabel.TABLE: (255, 204, 204), # Light Pink
|
|
DocItemLabel.PICTURE: (255, 204, 164), # Light Beige
|
|
DocItemLabel.SECTION_HEADER: (255, 153, 153), # Light Red
|
|
DocItemLabel.PAGE_HEADER: (204, 255, 204), # Light Green
|
|
DocItemLabel.PAGE_FOOTER: (
|
|
204,
|
|
255,
|
|
204,
|
|
), # Light Green (same as Page-Header)
|
|
DocItemLabel.TITLE: (255, 153, 153), # Light Red (same as Section-Header)
|
|
DocItemLabel.FOOTNOTE: (200, 200, 255), # Light Blue
|
|
DocItemLabel.DOCUMENT_INDEX: (220, 220, 220), # Light Gray
|
|
DocItemLabel.CODE: (125, 125, 125), # Gray
|
|
DocItemLabel.CHECKBOX_SELECTED: (255, 182, 193), # Pale Green
|
|
DocItemLabel.CHECKBOX_UNSELECTED: (255, 182, 193), # Light Pink
|
|
DocItemLabel.FORM: (200, 255, 255), # Light Cyan
|
|
DocItemLabel.KEY_VALUE_REGION: (183, 65, 14), # Rusty orange
|
|
}
|
|
# Filter clusters for left and right images
|
|
exclude_labels = {
|
|
DocItemLabel.FORM,
|
|
DocItemLabel.KEY_VALUE_REGION,
|
|
DocItemLabel.PICTURE,
|
|
}
|
|
left_clusters = [c for c in clusters if c.label not in exclude_labels]
|
|
right_clusters = [c for c in clusters if c.label in exclude_labels]
|
|
# Create a deep copy of the original image for both sides
|
|
left_image = copy.deepcopy(page.image)
|
|
right_image = copy.deepcopy(page.image)
|
|
|
|
# Function to draw clusters on an image
|
|
def draw_clusters(image, clusters):
|
|
draw = ImageDraw.Draw(image, "RGBA")
|
|
# Create a smaller font for the labels
|
|
try:
|
|
font = ImageFont.truetype("arial.ttf", 12)
|
|
except OSError:
|
|
# Fallback to default font if arial is not available
|
|
font = ImageFont.load_default()
|
|
for c_tl in clusters:
|
|
all_clusters = [c_tl, *c_tl.children]
|
|
for c in all_clusters:
|
|
# Draw cells first (underneath)
|
|
cell_color = (0, 0, 0, 40) # Transparent black for cells
|
|
for tc in c.cells:
|
|
cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
|
|
draw.rectangle(
|
|
[(cx0, cy0), (cx1, cy1)],
|
|
outline=None,
|
|
fill=cell_color,
|
|
)
|
|
# Draw cluster rectangle
|
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
|
cluster_fill_color = (*list(label_to_color.get(c.label)), 70)
|
|
cluster_outline_color = (*list(label_to_color.get(c.label)), 255)
|
|
draw.rectangle(
|
|
[(x0, y0), (x1, y1)],
|
|
outline=cluster_outline_color,
|
|
fill=cluster_fill_color,
|
|
)
|
|
# Add label name and confidence
|
|
label_text = f"{c.label.name} ({c.confidence:.2f})"
|
|
# Create semi-transparent background for text
|
|
text_bbox = draw.textbbox((x0, y0), label_text, font=font)
|
|
text_bg_padding = 2
|
|
draw.rectangle(
|
|
[
|
|
(
|
|
text_bbox[0] - text_bg_padding,
|
|
text_bbox[1] - text_bg_padding,
|
|
),
|
|
(
|
|
text_bbox[2] + text_bg_padding,
|
|
text_bbox[3] + text_bg_padding,
|
|
),
|
|
],
|
|
fill=(255, 255, 255, 180), # Semi-transparent white
|
|
)
|
|
# Draw text
|
|
draw.text(
|
|
(x0, y0),
|
|
label_text,
|
|
fill=(0, 0, 0, 255), # Solid black
|
|
font=font,
|
|
)
|
|
|
|
# Draw clusters on both images
|
|
draw_clusters(left_image, left_clusters)
|
|
draw_clusters(right_image, right_clusters)
|
|
# Combine the images side by side
|
|
combined_width = left_image.width * 2
|
|
combined_height = left_image.height
|
|
combined_image = Image.new("RGB", (combined_width, combined_height))
|
|
combined_image.paste(left_image, (0, 0))
|
|
combined_image.paste(right_image, (left_image.width, 0))
|
|
if show:
|
|
combined_image.show()
|
|
else:
|
|
out_path: Path = (
|
|
Path(settings.debug.debug_output_path)
|
|
/ f"debug_{conv_res.input.file.stem}"
|
|
)
|
|
out_path.mkdir(parents=True, exist_ok=True)
|
|
out_file = out_path / f"{mode_prefix}_layout_page_{page.page_no:05}.png"
|
|
combined_image.save(str(out_file), format="png")
|
|
|
|
def __call__(
|
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
|
) -> Iterable[Page]:
|
|
|
|
for page in page_batch:
|
|
assert page._backend is not None
|
|
if not page._backend.is_valid():
|
|
yield page
|
|
else:
|
|
with TimeRecorder(conv_res, "layout"):
|
|
assert page.size is not None
|
|
|
|
clusters = []
|
|
for ix, pred_item in enumerate(
|
|
self.layout_predictor.predict(page.get_image(scale=1.0))
|
|
):
|
|
label = DocItemLabel(
|
|
pred_item["label"]
|
|
.lower()
|
|
.replace(" ", "_")
|
|
.replace("-", "_")
|
|
) # Temporary, until docling-ibm-model uses docling-core types
|
|
cluster = Cluster(
|
|
id=ix,
|
|
label=label,
|
|
confidence=pred_item["confidence"],
|
|
bbox=BoundingBox.model_validate(pred_item),
|
|
cells=[],
|
|
)
|
|
clusters.append(cluster)
|
|
|
|
if settings.debug.visualize_raw_layout:
|
|
self.draw_clusters_and_cells_side_by_side(
|
|
conv_res, page, clusters, mode_prefix="raw"
|
|
)
|
|
|
|
# Apply postprocessing
|
|
|
|
processed_clusters, processed_cells = LayoutPostprocessor(
|
|
page.cells, clusters, page.size
|
|
).postprocess()
|
|
# processed_clusters, processed_cells = clusters, page.cells
|
|
|
|
page.cells = processed_cells
|
|
page.predictions.layout = LayoutPrediction(
|
|
clusters=processed_clusters
|
|
)
|
|
|
|
if settings.debug.visualize_layout:
|
|
self.draw_clusters_and_cells_side_by_side(
|
|
conv_res, page, processed_clusters, mode_prefix="postprocessed"
|
|
)
|
|
|
|
yield page
|