fix: Correct scaling of debug visualizations, tune OCR (#700)

* fix: Correct scaling of debug visualizations, tune OCR

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* chore: remove unused imports

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* chore: Update docling-core

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-01-08 12:26:44 +01:00 committed by GitHub
parent ead396ab40
commit 5cb4cf6f19
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 766 additions and 839 deletions

View File

@ -138,18 +138,31 @@ class BaseOcrModel(BasePageModel):
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False): def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
image = copy.deepcopy(page.image) image = copy.deepcopy(page.image)
scale_x = image.width / page.size.width
scale_y = image.height / page.size.height
draw = ImageDraw.Draw(image, "RGBA") draw = ImageDraw.Draw(image, "RGBA")
# Draw OCR rectangles as yellow filled rect # Draw OCR rectangles as yellow filled rect
for rect in ocr_rects: for rect in ocr_rects:
x0, y0, x1, y1 = rect.as_tuple() x0, y0, x1, y1 = rect.as_tuple()
y0 *= scale_y
y1 *= scale_y
x0 *= scale_x
x1 *= scale_x
shade_color = (255, 255, 0, 40) # transparent yellow shade_color = (255, 255, 0, 40) # transparent yellow
draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None) draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)
# Draw OCR and programmatic cells # Draw OCR and programmatic cells
for tc in page.cells: for tc in page.cells:
x0, y0, x1, y1 = tc.bbox.as_tuple() x0, y0, x1, y1 = tc.bbox.as_tuple()
color = "red" y0 *= scale_y
y1 *= scale_y
x0 *= scale_x
x1 *= scale_x
color = "gray"
if isinstance(tc, OcrCell): if isinstance(tc, OcrCell):
color = "magenta" color = "magenta"
draw.rectangle([(x0, y0), (x1, y1)], outline=color) draw.rectangle([(x0, y0), (x1, y1)], outline=color)

View File

@ -67,29 +67,9 @@ class LayoutModel(BasePageModel):
- Right: Clusters including FORM, KEY_VALUE_REGION, and PICTURE. - Right: Clusters including FORM, KEY_VALUE_REGION, and PICTURE.
Includes label names and confidence scores for each cluster. Includes label names and confidence scores for each cluster.
""" """
label_to_color = { scale_x = page.image.width / page.size.width
DocItemLabel.TEXT: (255, 255, 153), # Light Yellow scale_y = page.image.height / page.size.height
DocItemLabel.CAPTION: (255, 204, 153), # Light Orange
DocItemLabel.LIST_ITEM: (153, 153, 255), # Light Purple
DocItemLabel.FORMULA: (192, 192, 192), # Gray
DocItemLabel.TABLE: (255, 204, 204), # Light Pink
DocItemLabel.PICTURE: (255, 204, 164), # Light Beige
DocItemLabel.SECTION_HEADER: (255, 153, 153), # Light Red
DocItemLabel.PAGE_HEADER: (204, 255, 204), # Light Green
DocItemLabel.PAGE_FOOTER: (
204,
255,
204,
), # Light Green (same as Page-Header)
DocItemLabel.TITLE: (255, 153, 153), # Light Red (same as Section-Header)
DocItemLabel.FOOTNOTE: (200, 200, 255), # Light Blue
DocItemLabel.DOCUMENT_INDEX: (220, 220, 220), # Light Gray
DocItemLabel.CODE: (125, 125, 125), # Gray
DocItemLabel.CHECKBOX_SELECTED: (255, 182, 193), # Light Pink (same as Checkbox-Unselected)
DocItemLabel.CHECKBOX_UNSELECTED: (255, 182, 193), # Light Pink
DocItemLabel.FORM: (200, 255, 255), # Light Cyan
DocItemLabel.KEY_VALUE_REGION: (183, 65, 14), # Rusty orange
}
# Filter clusters for left and right images # Filter clusters for left and right images
exclude_labels = { exclude_labels = {
DocItemLabel.FORM, DocItemLabel.FORM,
@ -118,6 +98,11 @@ class LayoutModel(BasePageModel):
cell_color = (0, 0, 0, 40) # Transparent black for cells cell_color = (0, 0, 0, 40) # Transparent black for cells
for tc in c.cells: for tc in c.cells:
cx0, cy0, cx1, cy1 = tc.bbox.as_tuple() cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
cx0 *= scale_x
cx1 *= scale_x
cy0 *= scale_y
cy1 *= scale_y
draw.rectangle( draw.rectangle(
[(cx0, cy0), (cx1, cy1)], [(cx0, cy0), (cx1, cy1)],
outline=None, outline=None,
@ -125,8 +110,16 @@ class LayoutModel(BasePageModel):
) )
# Draw cluster rectangle # Draw cluster rectangle
x0, y0, x1, y1 = c.bbox.as_tuple() x0, y0, x1, y1 = c.bbox.as_tuple()
cluster_fill_color = (*list(label_to_color.get(c.label)), 70) x0 *= scale_x
cluster_outline_color = (*list(label_to_color.get(c.label)), 255) x1 *= scale_x
y0 *= scale_y
y1 *= scale_y
cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
cluster_outline_color = (
*list(DocItemLabel.get_color(c.label)),
255,
)
draw.rectangle( draw.rectangle(
[(x0, y0), (x1, y1)], [(x0, y0), (x1, y1)],
outline=cluster_outline_color, outline=cluster_outline_color,

View File

@ -66,23 +66,43 @@ class TableStructureModel(BasePageModel):
show: bool = False, show: bool = False,
): ):
assert page._backend is not None assert page._backend is not None
assert page.size is not None
image = ( image = (
page._backend.get_page_image() page._backend.get_page_image()
) # make new image to avoid drawing on the saved ones ) # make new image to avoid drawing on the saved ones
scale_x = image.width / page.size.width
scale_y = image.height / page.size.height
draw = ImageDraw.Draw(image) draw = ImageDraw.Draw(image)
for table_element in tbl_list: for table_element in tbl_list:
x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple() x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
y0 *= scale_y
y1 *= scale_y
x0 *= scale_x
x1 *= scale_x
draw.rectangle([(x0, y0), (x1, y1)], outline="red") draw.rectangle([(x0, y0), (x1, y1)], outline="red")
for cell in table_element.cluster.cells: for cell in table_element.cluster.cells:
x0, y0, x1, y1 = cell.bbox.as_tuple() x0, y0, x1, y1 = cell.bbox.as_tuple()
x0 *= scale_x
x1 *= scale_x
y0 *= scale_y
y1 *= scale_y
draw.rectangle([(x0, y0), (x1, y1)], outline="green") draw.rectangle([(x0, y0), (x1, y1)], outline="green")
for tc in table_element.table_cells: for tc in table_element.table_cells:
if tc.bbox is not None: if tc.bbox is not None:
x0, y0, x1, y1 = tc.bbox.as_tuple() x0, y0, x1, y1 = tc.bbox.as_tuple()
x0 *= scale_x
x1 *= scale_x
y0 *= scale_y
y1 *= scale_y
if tc.column_header: if tc.column_header:
width = 3 width = 3
else: else:

1525
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -25,7 +25,7 @@ packages = [{include = "docling"}]
# actual dependencies: # actual dependencies:
###################### ######################
python = "^3.9" python = "^3.9"
docling-core = { version = "^2.12.1", extras = ["chunking"] } docling-core = { version = "^2.13.1", extras = ["chunking"] }
pydantic = "^2.0.0" pydantic = "^2.0.0"
docling-ibm-models = "^3.1.0" docling-ibm-models = "^3.1.0"
deepsearch-glm = "^1.0.0" deepsearch-glm = "^1.0.0"