From 5e30381c0dd3b4f9e3b2d8af3863ed51fa51194a Mon Sep 17 00:00:00 2001 From: Matteo <43417658+Matteo-Omenetti@users.noreply.github.com> Date: Tue, 11 Mar 2025 09:15:28 +0000 Subject: [PATCH] perf: New revision code formula model and document picture classifier (#1140) * new version code formula model Signed-off-by: Matteo-Omenetti * new version document picture classifier Signed-off-by: Matteo-Omenetti * new code formula model Signed-off-by: Matteo-Omenetti * restored original code formula test pdf Signed-off-by: Matteo-Omenetti --------- Signed-off-by: Matteo-Omenetti Co-authored-by: Matteo-Omenetti --- docling/models/code_formula_model.py | 89 +++++++++++++++++-- docling/models/document_picture_classifier.py | 2 +- 2 files changed, 85 insertions(+), 6 deletions(-) diff --git a/docling/models/code_formula_model.py b/docling/models/code_formula_model.py index 1a0f0bf..10426c2 100644 --- a/docling/models/code_formula_model.py +++ b/docling/models/code_formula_model.py @@ -1,4 +1,5 @@ import re +from collections import Counter from pathlib import Path from typing import Iterable, List, Literal, Optional, Tuple, Union @@ -11,7 +12,7 @@ from docling_core.types.doc import ( TextItem, ) from docling_core.types.doc.labels import CodeLanguageLabel -from PIL import Image +from PIL import Image, ImageOps from pydantic import BaseModel from docling.datamodel.base_models import ItemAndImageEnrichmentElement @@ -65,7 +66,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel): _model_repo_folder = "ds4sd--CodeFormula" elements_batch_size = 5 images_scale = 1.66 # = 120 dpi, aligned with training data resolution - expansion_factor = 0.03 + expansion_factor = 0.18 def __init__( self, @@ -124,7 +125,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel): repo_id="ds4sd/CodeFormula", force_download=force, local_dir=local_dir, - revision="v1.0.1", + revision="v1.0.2", ) return Path(download_path) @@ -175,7 +176,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel): - The second element is the extracted language if a match is found; otherwise, `None`. """ - pattern = r"^<_([^>]+)_>\s*(.*)" + pattern = r"^<_([^_>]+)_>\s(.*)" match = re.match(pattern, input_string, flags=re.DOTALL) if match: language = str(match.group(1)) # the captured programming language @@ -206,6 +207,82 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel): except ValueError: return CodeLanguageLabel.UNKNOWN + def _get_most_frequent_edge_color(self, pil_img: Image.Image): + """ + Compute the most frequent color along the outer edges of a PIL image. + + Parameters + ---------- + pil_img : Image.Image + A PIL Image in any mode (L, RGB, RGBA, etc.). + + Returns + ------- + (int) or (tuple): The most common edge color as a scalar (for grayscale) or + tuple (for RGB/RGBA). + """ + # Convert to NumPy array for easy pixel access + img_np = np.array(pil_img) + + if img_np.ndim == 2: + # Grayscale-like image: shape (H, W) + # Extract edges: top row, bottom row, left col, right col + top = img_np[0, :] # shape (W,) + bottom = img_np[-1, :] # shape (W,) + left = img_np[:, 0] # shape (H,) + right = img_np[:, -1] # shape (H,) + + # Concatenate all edges + edges = np.concatenate([top, bottom, left, right]) + + # Count frequencies + freq = Counter(edges.tolist()) + most_common_value, _ = freq.most_common(1)[0] + return int(most_common_value) # single channel color + + else: + # Color image: shape (H, W, C) + top = img_np[0, :, :] # shape (W, C) + bottom = img_np[-1, :, :] # shape (W, C) + left = img_np[:, 0, :] # shape (H, C) + right = img_np[:, -1, :] # shape (H, C) + + # Concatenate edges along first axis + edges = np.concatenate([top, bottom, left, right], axis=0) + + # Convert each color to a tuple for counting + edges_as_tuples = [tuple(pixel) for pixel in edges] + freq = Counter(edges_as_tuples) + most_common_value, _ = freq.most_common(1)[0] + return most_common_value # e.g. (R, G, B) or (R, G, B, A) + + def _pad_with_most_frequent_edge_color( + self, img: Union[Image.Image, np.ndarray], padding: Tuple[int, int, int, int] + ): + """ + Pads an image (PIL or NumPy array) using the most frequent edge color. + + Parameters + ---------- + img : Union[Image.Image, np.ndarray] + The original image. + padding : tuple + Padding (left, top, right, bottom) in pixels. + + Returns + ------- + Image.Image: A new PIL image with the specified padding. + """ + if isinstance(img, np.ndarray): + pil_img = Image.fromarray(img) + else: + pil_img = img + + most_freq_color = self._get_most_frequent_edge_color(pil_img) + + padded_img = ImageOps.expand(pil_img, border=padding, fill=most_freq_color) + return padded_img + def __call__( self, doc: DoclingDocument, @@ -238,7 +315,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel): assert isinstance(el.item, TextItem) elements.append(el.item) labels.append(el.item.label) - images.append(el.image) + images.append( + self._pad_with_most_frequent_edge_color(el.image, (20, 10, 20, 10)) + ) outputs = self.code_formula_model.predict(images, labels) diff --git a/docling/models/document_picture_classifier.py b/docling/models/document_picture_classifier.py index 6e71246..f51d735 100644 --- a/docling/models/document_picture_classifier.py +++ b/docling/models/document_picture_classifier.py @@ -113,7 +113,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel): repo_id="ds4sd/DocumentFigureClassifier", force_download=force, local_dir=local_dir, - revision="v1.0.0", + revision="v1.0.1", ) return Path(download_path)