fix: Correct scaling of debug visualizations, tune OCR (#700)

* fix: Correct scaling of debug visualizations, tune OCR

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* chore: remove unused imports

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* chore: Update docling-core

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-01-08 12:26:44 +01:00 committed by GitHub
parent ead396ab40
commit 5cb4cf6f19
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 766 additions and 839 deletions

View File

@ -138,18 +138,31 @@ class BaseOcrModel(BasePageModel):
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
    """Overlay OCR regions and page cells on a copy of the page image.

    Bounding boxes are multiplied by the image/page size ratio before
    drawing so that they line up with the pixels of the page image.

    Args:
        conv_res: Not referenced in the part of the method visible here.
        page: Page object providing ``image``, ``size`` and ``cells``.
        ocr_rects: Iterable of rectangles exposing ``as_tuple()`` that
            yields ``(x0, y0, x1, y1)``.
        show: Not referenced in the part of the method visible here.
    """
    # Work on a copy so the stored page image is left untouched.
    image = copy.deepcopy(page.image)
    scale_x = image.width / page.size.width
    scale_y = image.height / page.size.height

    draw = ImageDraw.Draw(image, "RGBA")

    # Draw OCR rectangles as yellow filled rect
    shade_color = (255, 255, 0, 40)  # transparent yellow
    for rect in ocr_rects:
        x0, y0, x1, y1 = rect.as_tuple()
        # Each axis uses its own factor; scaling y0 with scale_x would
        # misplace the top edge whenever the two factors differ.
        x0 *= scale_x
        x1 *= scale_x
        y0 *= scale_y
        y1 *= scale_y
        draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)

    # Draw OCR and programmatic cells
    for tc in page.cells:
        x0, y0, x1, y1 = tc.bbox.as_tuple()
        x0 *= scale_x
        x1 *= scale_x
        y0 *= scale_y
        y1 *= scale_y
        # OCR-derived cells are outlined in magenta, all others in gray.
        color = "magenta" if isinstance(tc, OcrCell) else "gray"
        draw.rectangle([(x0, y0), (x1, y1)], outline=color)

View File

@ -67,29 +67,9 @@ class LayoutModel(BasePageModel):
- Right: Clusters including FORM, KEY_VALUE_REGION, and PICTURE.
Includes label names and confidence scores for each cluster.
"""
label_to_color = {
DocItemLabel.TEXT: (255, 255, 153), # Light Yellow
DocItemLabel.CAPTION: (255, 204, 153), # Light Orange
DocItemLabel.LIST_ITEM: (153, 153, 255), # Light Purple
DocItemLabel.FORMULA: (192, 192, 192), # Gray
DocItemLabel.TABLE: (255, 204, 204), # Light Pink
DocItemLabel.PICTURE: (255, 204, 164), # Light Beige
DocItemLabel.SECTION_HEADER: (255, 153, 153), # Light Red
DocItemLabel.PAGE_HEADER: (204, 255, 204), # Light Green
DocItemLabel.PAGE_FOOTER: (
204,
255,
204,
), # Light Green (same as Page-Header)
DocItemLabel.TITLE: (255, 153, 153), # Light Red (same as Section-Header)
DocItemLabel.FOOTNOTE: (200, 200, 255), # Light Blue
DocItemLabel.DOCUMENT_INDEX: (220, 220, 220), # Light Gray
DocItemLabel.CODE: (125, 125, 125), # Gray
DocItemLabel.CHECKBOX_SELECTED: (255, 182, 193), # Pale Green
DocItemLabel.CHECKBOX_UNSELECTED: (255, 182, 193), # Light Pink
DocItemLabel.FORM: (200, 255, 255), # Light Cyan
DocItemLabel.KEY_VALUE_REGION: (183, 65, 14), # Rusty orange
}
scale_x = page.image.width / page.size.width
scale_y = page.image.height / page.size.height
# Filter clusters for left and right images
exclude_labels = {
DocItemLabel.FORM,
@ -118,6 +98,11 @@ class LayoutModel(BasePageModel):
cell_color = (0, 0, 0, 40) # Transparent black for cells
for tc in c.cells:
cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
# Scale each axis by its own factor so the cell box stays aligned with
# the image even when scale_x and scale_y differ.
cx0 *= scale_x
cx1 *= scale_x
cy0 *= scale_y
cy1 *= scale_y
draw.rectangle(
[(cx0, cy0), (cx1, cy1)],
outline=None,
@ -125,8 +110,16 @@ class LayoutModel(BasePageModel):
)
# Draw cluster rectangle
x0, y0, x1, y1 = c.bbox.as_tuple()
cluster_fill_color = (*list(label_to_color.get(c.label)), 70)
cluster_outline_color = (*list(label_to_color.get(c.label)), 255)
# Scale each axis by its own factor so the cluster box stays aligned with
# the image even when scale_x and scale_y differ.
x0 *= scale_x
x1 *= scale_x
y0 *= scale_y
y1 *= scale_y
cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
cluster_outline_color = (
*list(DocItemLabel.get_color(c.label)),
255,
)
draw.rectangle(
[(x0, y0), (x1, y1)],
outline=cluster_outline_color,

View File

@ -66,23 +66,43 @@ class TableStructureModel(BasePageModel):
show: bool = False,
):
assert page._backend is not None
assert page.size is not None
image = (
page._backend.get_page_image()
) # make new image to avoid drawing on the saved ones
scale_x = image.width / page.size.width
scale_y = image.height / page.size.height
draw = ImageDraw.Draw(image)
for table_element in tbl_list:
x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
# Vertical coordinates use scale_y and horizontal ones scale_x, so the
# table outline matches the image when the two factors differ.
y0 *= scale_y
y1 *= scale_y
x0 *= scale_x
x1 *= scale_x
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
for cell in table_element.cluster.cells:
x0, y0, x1, y1 = cell.bbox.as_tuple()
# Scale each axis by its own factor so the cell box stays aligned with
# the image even when scale_x and scale_y differ.
x0 *= scale_x
x1 *= scale_x
y0 *= scale_y
y1 *= scale_y
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
for tc in table_element.table_cells:
if tc.bbox is not None:
x0, y0, x1, y1 = tc.bbox.as_tuple()
# Scale each axis by its own factor so the table-cell box stays aligned
# with the image even when scale_x and scale_y differ.
x0 *= scale_x
x1 *= scale_x
y0 *= scale_y
y1 *= scale_y
if tc.column_header:
width = 3
else:

1525
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -25,7 +25,7 @@ packages = [{include = "docling"}]
# actual dependencies:
######################
python = "^3.9"
docling-core = { version = "^2.12.1", extras = ["chunking"] }
docling-core = { version = "^2.13.1", extras = ["chunking"] }
pydantic = "^2.0.0"
docling-ibm-models = "^3.1.0"
deepsearch-glm = "^1.0.0"