fix: added extraction of byte-images in excel (#804)
* fix(msexcel): ignore Mypy checking for _find_images_in_sheet function
  Signed-off-by: Jiun An Tsai <andrew@247365-Macbook.local>
* fixed some issues
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* reformatted the code
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* pinned pillow in pyproject
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---------
Signed-off-by: Jiun An Tsai <andrew@247365-Macbook.local>
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Jiun An Tsai <andrew@247365-Macbook.local>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
16a218d871
commit
a458e298ca
@ -26,6 +26,7 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
from typing import Any, List
|
||||
|
||||
from PIL import Image as PILImage
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
@ -325,49 +326,61 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
||||
self, doc: DoclingDocument, sheet: Worksheet
|
||||
) -> DoclingDocument:
|
||||
|
||||
# FIXME: mypy does not agree with _images ...
|
||||
# Iterate over byte images in the sheet
|
||||
for idx, image in enumerate(sheet._images): # type: ignore
|
||||
|
||||
try:
|
||||
pil_image = PILImage.open(image.ref)
|
||||
|
||||
doc.add_picture(
|
||||
parent=self.parents[0],
|
||||
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
||||
caption=None,
|
||||
)
|
||||
except:
|
||||
_log.error("could not extract the image from excel sheets")
|
||||
|
||||
"""
|
||||
# Iterate over images in the sheet
|
||||
for idx, image in enumerate(sheet._images): # Access embedded images
|
||||
for idx, chart in enumerate(sheet._charts): # type: ignore
|
||||
try:
|
||||
chart_path = f"chart_{idx + 1}.png"
|
||||
_log.info(
|
||||
f"Chart found, but dynamic rendering is required for: {chart_path}"
|
||||
)
|
||||
|
||||
image_bytes = BytesIO(image.ref.blob)
|
||||
pil_image = Image.open(image_bytes)
|
||||
|
||||
doc.add_picture(
|
||||
parent=self.parents[0],
|
||||
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
||||
caption=None,
|
||||
)
|
||||
"""
|
||||
|
||||
# FIXME: mypy does not agree with _charts ...
|
||||
"""
|
||||
for idx, chart in enumerate(sheet._charts): # Access embedded charts
|
||||
chart_path = f"chart_{idx + 1}.png"
|
||||
_log.info(
|
||||
f"Chart found, but dynamic rendering is required for: {chart_path}"
|
||||
)
|
||||
|
||||
_log.info(f"Chart {idx + 1}:")
|
||||
|
||||
# Chart type
|
||||
_log.info(f"Type: {type(chart).__name__}")
|
||||
|
||||
# Title
|
||||
if chart.title:
|
||||
_log.info(f"Title: {chart.title}")
|
||||
else:
|
||||
_log.info("No title")
|
||||
|
||||
# Data series
|
||||
for series in chart.series:
|
||||
_log.info(" => series ...")
|
||||
_log.info(f"Data Series: {series.title}")
|
||||
_log.info(f"Values: {series.values}")
|
||||
_log.info(f"Categories: {series.categories}")
|
||||
_log.info(f"Chart {idx + 1}:")
|
||||
|
||||
# Position
|
||||
# _log.info(f"Anchor Cell: {chart.anchor}")
|
||||
# Chart type
|
||||
# _log.info(f"Type: {type(chart).__name__}")
|
||||
print(f"Type: {type(chart).__name__}")
|
||||
|
||||
# Extract series data
|
||||
for series_idx, series in enumerate(chart.series):
|
||||
#_log.info(f"Series {series_idx + 1}:")
|
||||
print(f"Series {series_idx + 1} type: {type(series).__name__}")
|
||||
#print(f"x-values: {series.xVal}")
|
||||
#print(f"y-values: {series.yVal}")
|
||||
|
||||
print(f"xval type: {type(series.xVal).__name__}")
|
||||
|
||||
xvals = []
|
||||
for _ in series.xVal.numLit.pt:
|
||||
print(f"xval type: {type(_).__name__}")
|
||||
if hasattr(_, 'v'):
|
||||
xvals.append(_.v)
|
||||
|
||||
print(f"x-values: {xvals}")
|
||||
|
||||
yvals = []
|
||||
for _ in series.yVal:
|
||||
if hasattr(_, 'v'):
|
||||
yvals.append(_.v)
|
||||
|
||||
print(f"y-values: {yvals}")
|
||||
|
||||
except Exception as exc:
|
||||
print(exc)
|
||||
continue
|
||||
"""
|
||||
|
||||
return doc
|
||||
|
4
poetry.lock
generated
4
poetry.lock
generated
@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiohappyeyeballs"
|
||||
@ -7751,4 +7751,4 @@ tesserocr = ["tesserocr"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "7fcfc061454f229745d6f305e1fa593468a684059717195c6ae4174bec13d362"
|
||||
content-hash = "08d30cee8d77f9beee32d5dbec1643367ecae2b4c4b47b57fcb337711471eb5c"
|
||||
|
@ -56,6 +56,7 @@ onnxruntime = [
|
||||
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
|
||||
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
|
||||
]
|
||||
pillow = "^10.0.0"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
black = {extras = ["jupyter"], version = "^24.4.2"}
|
||||
|
@ -7,4 +7,5 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-6 at level 2: table with [5x3]
|
||||
item-7 at level 1: section: group sheet: Sheet3
|
||||
item-8 at level 2: table with [7x3]
|
||||
item-9 at level 2: table with [7x3]
|
||||
item-9 at level 2: table with [7x3]
|
||||
item-10 at level 2: picture
|
File diff suppressed because one or more lines are too long
@ -48,4 +48,6 @@
|
||||
| 3 | 4 | 5 |
|
||||
| 3 | 6 | 7 |
|
||||
| 8 | 9 | 9 |
|
||||
| 10 | 9 | 9 |
|
||||
| 10 | 9 | 9 |
|
||||
|
||||
<!-- image -->
|
Binary file not shown.
@ -53,7 +53,7 @@ def test_e2e_xlsx_conversions():
|
||||
converter = get_converter()
|
||||
|
||||
for xlsx_path in xlsx_paths:
|
||||
# print(f"converting {xlsx_path}")
|
||||
print(f"converting {xlsx_path}")
|
||||
|
||||
gt_path = (
|
||||
xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name
|
||||
|
Loading…
Reference in New Issue
Block a user