fix(HTML): handle row spans in header rows (#1536)

* chore(HTML): log the stacktrace of errors

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* fix(HTML): handle row headers like in pivot tables

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-05-09 15:14:32 +02:00 committed by GitHub
parent 3220a592e7
commit 776e7ecf9a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 186 additions and 17 deletions

View File

@ -1,4 +1,5 @@
import logging
import traceback
from io import BytesIO
from pathlib import Path
from typing import Final, Optional, Union, cast
@ -137,7 +138,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.analyze_tag(cast(Tag, element), doc)
except Exception as exc_child:
_log.error(
f"Error processing child from tag {tag.name}: {exc_child!r}"
f"Error processing child from tag {tag.name}:\n{traceback.format_exc()}"
)
raise exc_child
elif isinstance(element, NavigableString) and not isinstance(
@ -390,46 +391,64 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
_log.debug(f"list-item has no text: {element}")
@staticmethod
def parse_table_data(element: Tag) -> Optional[TableData]:
def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
nested_tables = element.find("table")
if nested_tables is not None:
_log.debug("Skipping nested table.")
return None
# Count the number of rows (number of <tr> elements)
num_rows = len(element("tr"))
# Find the number of columns (taking into account colspan)
# Find the number of rows and columns (taking into account spans)
num_rows = 0
num_cols = 0
for row in element("tr"):
col_count = 0
is_row_header = True
if not isinstance(row, Tag):
continue
for cell in row(["td", "th"]):
if not isinstance(row, Tag):
continue
val = cast(Tag, cell).get("colspan", "1")
cell_tag = cast(Tag, cell)
val = cell_tag.get("colspan", "1")
colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
col_count += colspan
if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
is_row_header = False
num_cols = max(num_cols, col_count)
if not is_row_header:
num_rows += 1
_log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
# Iterate over the rows in the table
for row_idx, row in enumerate(element("tr")):
start_row_span = 0
row_idx = -1
for row in element("tr"):
if not isinstance(row, Tag):
continue
# For each row, find all the column cells (both <td> and <th>)
cells = row(["td", "th"])
# Check if each cell in the row is a header -> means it is a column header
# Check if cell is in a column header or row header
col_header = True
row_header = True
for html_cell in cells:
if isinstance(html_cell, Tag) and html_cell.name == "td":
col_header = False
if isinstance(html_cell, Tag):
if html_cell.name == "td":
col_header = False
row_header = False
elif html_cell.get("rowspan") is None:
row_header = False
if not row_header:
row_idx += 1
start_row_span = 0
else:
start_row_span += 1
# Extract the text content of each cell
col_idx = 0
@ -460,19 +479,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if isinstance(row_val, str) and row_val.isnumeric()
else 1
)
while grid[row_idx][col_idx] is not None:
if row_header:
row_span -= 1
while (
col_idx < num_cols
and grid[row_idx + start_row_span][col_idx] is not None
):
col_idx += 1
for r in range(row_span):
for r in range(start_row_span, start_row_span + row_span):
for c in range(col_span):
grid[row_idx + r][col_idx + c] = text
if row_idx + r < num_rows and col_idx + c < num_cols:
grid[row_idx + r][col_idx + c] = text
table_cell = TableCell(
text=text,
row_span=row_span,
col_span=col_span,
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + row_span,
start_row_offset_idx=start_row_span + row_idx,
end_row_offset_idx=start_row_span + row_idx + row_span,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span,
column_header=col_header,

View File

@ -0,0 +1,145 @@
<!DOCTYPE html>
<html>
<head>
<style>
table,
th,
td {
border: 1px solid black;
}
</style>
</head>
<body>
<h2>Pivot table with 1 row header</h2>
<table>
<tr>
<th>Year</th>
<th>Month</th>
<th>Revenue</th>
<th>Cost</th>
</tr>
<tr>
<th rowspan="6">2025</th>
</tr>
<tr>
<td>January</td>
<td>$134</td>
<td>$162</td>
</tr>
<tr>
<td>February</td>
<td>$150</td>
<td>$155</td>
</tr>
<tr>
<td>March</td>
<td>$160</td>
<td>$143</td>
</tr>
<tr>
<td>April</td>
<td>$210</td>
<td>$150</td>
</tr>
<tr>
<td>May</td>
<td>$280</td>
<td>$120</td>
</tr>
</table>
<h2>Pivot table with 2 row headers</h2>
<table>
<tr>
<th>Year</th>
<th>Quarter</th>
<th>Month</th>
<th>Revenue</th>
<th>Cost</th>
</tr>
<tr>
<th rowspan="7">2025</th>
<th rowspan="4">Q1</th>
</tr>
<tr>
<td>January</td>
<td>$134</td>
<td>$162</td>
</tr>
<tr>
<td>February</td>
<td>$150</td>
<td>$155</td>
</tr>
<tr>
<td>March</td>
<td>$160</td>
<td>$143</td>
</tr>
<tr>
<th rowspan="3">Q2</th>
</tr>
<tr>
<td>April</td>
<td>$210</td>
<td>$150</td>
</tr>
<tr>
<td>May</td>
<td>$280</td>
<td>$120</td>
</tr>
</table>
<h2>Equivalent pivot table</h2>
<table>
<tr>
<th>Year</th>
<th>Quarter</th>
<th>Month</th>
<th>Revenue</th>
<th>Cost</th>
</tr>
<tr>
<th rowspan="8">2025</th>
<th rowspan="4">Q1</th>
</tr>
<tr>
<td>January</td>
<td>$134</td>
<td>$162</td>
</tr>
<tr>
<td>February</td>
<td>$150</td>
<td>$155</td>
</tr>
<tr>
<td>March</td>
<td>$160</td>
<td>$143</td>
</tr>
<tr>
<th rowspan="3">Q2</th>
</tr>
<tr>
<td>April</td>
<td>$210</td>
<td>$150</td>
</tr>
<tr>
<td>May</td>
<td>$280</td>
<td>$120</td>
</tr>
</table>
</body>
</html>