diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 88a315d..7c71690 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -1,4 +1,5 @@
import logging
+import traceback
from io import BytesIO
from pathlib import Path
from typing import Final, Optional, Union, cast
@@ -137,7 +138,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.analyze_tag(cast(Tag, element), doc)
except Exception as exc_child:
_log.error(
- f"Error processing child from tag {tag.name}: {exc_child!r}"
+ f"Error processing child from tag {tag.name}:\n{traceback.format_exc()}"
)
raise exc_child
elif isinstance(element, NavigableString) and not isinstance(
@@ -390,46 +391,64 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
_log.debug(f"list-item has no text: {element}")
@staticmethod
- def parse_table_data(element: Tag) -> Optional[TableData]:
+ def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
nested_tables = element.find("table")
if nested_tables is not None:
_log.debug("Skipping nested table.")
return None
- # Count the number of rows (number of
elements)
- num_rows = len(element("tr"))
-
- # Find the number of columns (taking into account colspan)
+ # Find the number of rows and columns (taking into account spans)
+ num_rows = 0
num_cols = 0
for row in element("tr"):
col_count = 0
+ is_row_header = True
if not isinstance(row, Tag):
continue
for cell in row(["td", "th"]):
if not isinstance(row, Tag):
continue
- val = cast(Tag, cell).get("colspan", "1")
+ cell_tag = cast(Tag, cell)
+ val = cell_tag.get("colspan", "1")
colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
col_count += colspan
+ if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
+ is_row_header = False
num_cols = max(num_cols, col_count)
+ if not is_row_header:
+ num_rows += 1
+
+ _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
# Iterate over the rows in the table
- for row_idx, row in enumerate(element("tr")):
+ start_row_span = 0
+ row_idx = -1
+ for row in element("tr"):
if not isinstance(row, Tag):
continue
# For each row, find all the column cells (both and | )
cells = row(["td", "th"])
- # Check if each cell in the row is a header -> means it is a column header
+ # Check if cell is in a column header or row header
col_header = True
+ row_header = True
for html_cell in cells:
- if isinstance(html_cell, Tag) and html_cell.name == "td":
- col_header = False
+ if isinstance(html_cell, Tag):
+ if html_cell.name == "td":
+ col_header = False
+ row_header = False
+ elif html_cell.get("rowspan") is None:
+ row_header = False
+ if not row_header:
+ row_idx += 1
+ start_row_span = 0
+ else:
+ start_row_span += 1
# Extract the text content of each cell
col_idx = 0
@@ -460,19 +479,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if isinstance(row_val, str) and row_val.isnumeric()
else 1
)
-
- while grid[row_idx][col_idx] is not None:
+ if row_header:
+ row_span -= 1
+ while (
+ col_idx < num_cols
+ and grid[row_idx + start_row_span][col_idx] is not None
+ ):
col_idx += 1
- for r in range(row_span):
+ for r in range(start_row_span, start_row_span + row_span):
for c in range(col_span):
- grid[row_idx + r][col_idx + c] = text
+ if row_idx + r < num_rows and col_idx + c < num_cols:
+ grid[row_idx + r][col_idx + c] = text
table_cell = TableCell(
text=text,
row_span=row_span,
col_span=col_span,
- start_row_offset_idx=row_idx,
- end_row_offset_idx=row_idx + row_span,
+ start_row_offset_idx=start_row_span + row_idx,
+ end_row_offset_idx=start_row_span + row_idx + row_span,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span,
column_header=col_header,
diff --git a/tests/data/html/example_8.html b/tests/data/html/example_8.html
new file mode 100644
index 0000000..bcd771b
--- /dev/null
+++ b/tests/data/html/example_8.html
@@ -0,0 +1,145 @@
+
+
+
+
+
+
+
+
+
+ Pivot table with with 1 row header
+
+
+
+ Year |
+ Month |
+ Revenue |
+ Cost |
+
+
+ 2025 |
+
+
+ January |
+ $134 |
+ $162 |
+
+
+ February |
+ $150 |
+ $155 |
+
+
+ March |
+ $160 |
+ $143 |
+
+
+ April |
+ $210 |
+ $150 |
+
+
+ May |
+ $280 |
+ $120 |
+
+
+
+ Pivot table with 2 row headers
+
+
+
+ Year |
+ Quarter |
+ Month |
+ Revenue |
+ Cost |
+
+
+ 2025 |
+ Q1 |
+
+
+ January |
+ $134 |
+ $162 |
+
+
+ February |
+ $150 |
+ $155 |
+
+
+ March |
+ $160 |
+ $143 |
+
+
+ Q2 |
+
+
+ April |
+ $210 |
+ $150 |
+
+
+ May |
+ $280 |
+ $120 |
+
+
+
+ Equivalent pivot table
+
+
+
+ Year |
+ Quarter |
+ Month |
+ Revenue |
+ Cost |
+
+
+ 2025 |
+ Q1 |
+
+
+ January |
+ $134 |
+ $162 |
+
+
+ February |
+ $150 |
+ $155 |
+
+
+ March |
+ $160 |
+ $143 |
+
+
+ Q2 |
+
+
+ April |
+ $210 |
+ $150 |
+
+
+ May |
+ $280 |
+ $120 |
+
+
+
+
+
+
\ No newline at end of file
|