fix: added extraction of byte images in Excel (#804)

* fix(msexcel): ignore Mypy checking for _find_images_in_sheet function

Signed-off-by: Jiun An Tsai <andrew@247365-Macbook.local>

* fixed some issues

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* reformatted the code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* pinned pillow in pyproject

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

---------

Signed-off-by: Jiun An Tsai <andrew@247365-Macbook.local>
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Jiun An Tsai <andrew@247365-Macbook.local>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar 2025-01-24 18:48:02 +01:00 committed by GitHub
parent 16a218d871
commit a458e298ca
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 90 additions and 47 deletions

View File

@ -26,6 +26,7 @@ _log = logging.getLogger(__name__)
from typing import Any, List from typing import Any, List
from PIL import Image as PILImage
from pydantic import BaseModel from pydantic import BaseModel
@ -325,49 +326,61 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
self, doc: DoclingDocument, sheet: Worksheet self, doc: DoclingDocument, sheet: Worksheet
) -> DoclingDocument: ) -> DoclingDocument:
# FIXME: mypy does not agree with _images ... # Iterate over byte images in the sheet
for idx, image in enumerate(sheet._images): # type: ignore
try:
pil_image = PILImage.open(image.ref)
doc.add_picture(
parent=self.parents[0],
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
)
except:
_log.error("could not extract the image from excel sheets")
""" """
# Iterate over images in the sheet for idx, chart in enumerate(sheet._charts): # type: ignore
for idx, image in enumerate(sheet._images): # Access embedded images try:
chart_path = f"chart_{idx + 1}.png"
_log.info(
f"Chart found, but dynamic rendering is required for: {chart_path}"
)
image_bytes = BytesIO(image.ref.blob) _log.info(f"Chart {idx + 1}:")
pil_image = Image.open(image_bytes)
doc.add_picture( # Chart type
parent=self.parents[0], # _log.info(f"Type: {type(chart).__name__}")
image=ImageRef.from_pil(image=pil_image, dpi=72), print(f"Type: {type(chart).__name__}")
caption=None,
)
"""
# FIXME: mypy does not agree with _charts ... # Extract series data
""" for series_idx, series in enumerate(chart.series):
for idx, chart in enumerate(sheet._charts): # Access embedded charts #_log.info(f"Series {series_idx + 1}:")
chart_path = f"chart_{idx + 1}.png" print(f"Series {series_idx + 1} type: {type(series).__name__}")
_log.info( #print(f"x-values: {series.xVal}")
f"Chart found, but dynamic rendering is required for: {chart_path}" #print(f"y-values: {series.yVal}")
)
_log.info(f"Chart {idx + 1}:") print(f"xval type: {type(series.xVal).__name__}")
# Chart type xvals = []
_log.info(f"Type: {type(chart).__name__}") for _ in series.xVal.numLit.pt:
print(f"xval type: {type(_).__name__}")
if hasattr(_, 'v'):
xvals.append(_.v)
# Title print(f"x-values: {xvals}")
if chart.title:
_log.info(f"Title: {chart.title}")
else:
_log.info("No title")
# Data series yvals = []
for series in chart.series: for _ in series.yVal:
_log.info(" => series ...") if hasattr(_, 'v'):
_log.info(f"Data Series: {series.title}") yvals.append(_.v)
_log.info(f"Values: {series.values}")
_log.info(f"Categories: {series.categories}")
# Position print(f"y-values: {yvals}")
# _log.info(f"Anchor Cell: {chart.anchor}")
except Exception as exc:
print(exc)
continue
""" """
return doc return doc

4
poetry.lock generated
View File

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. # This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
[[package]] [[package]]
name = "aiohappyeyeballs" name = "aiohappyeyeballs"
@ -7751,4 +7751,4 @@ tesserocr = ["tesserocr"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.9" python-versions = "^3.9"
content-hash = "7fcfc061454f229745d6f305e1fa593468a684059717195c6ae4174bec13d362" content-hash = "08d30cee8d77f9beee32d5dbec1643367ecae2b4c4b47b57fcb337711471eb5c"

View File

@ -56,6 +56,7 @@ onnxruntime = [
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" }, { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" } { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
] ]
pillow = "^10.0.0"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"} black = {extras = ["jupyter"], version = "^24.4.2"}

View File

@ -8,3 +8,4 @@ item-0 at level 0: unspecified: group _root_
item-7 at level 1: section: group sheet: Sheet3 item-7 at level 1: section: group sheet: Sheet3
item-8 at level 2: table with [7x3] item-8 at level 2: table with [7x3]
item-9 at level 2: table with [7x3] item-9 at level 2: table with [7x3]
item-10 at level 2: picture

File diff suppressed because one or more lines are too long

View File

@ -49,3 +49,5 @@
| 3 | 6 | 7 | | 3 | 6 | 7 |
| 8 | 9 | 9 | | 8 | 9 | 9 |
| 10 | 9 | 9 | | 10 | 9 | 9 |
<!-- image -->

Binary file not shown.

View File

@ -53,7 +53,7 @@ def test_e2e_xlsx_conversions():
converter = get_converter() converter = get_converter()
for xlsx_path in xlsx_paths: for xlsx_path in xlsx_paths:
# print(f"converting {xlsx_path}") print(f"converting {xlsx_path}")
gt_path = ( gt_path = (
xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name