fix(HTML): handle row spans in header rows (#1536)

* chore(HTML): log the stacktrace of errors

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* fix(HTML): handle row headers like in pivot tables

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-05-09 15:14:32 +02:00 committed by GitHub
parent 3220a592e7
commit 776e7ecf9a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 186 additions and 17 deletions

View File

@ -1,4 +1,5 @@
import logging import logging
import traceback
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Final, Optional, Union, cast from typing import Final, Optional, Union, cast
@ -137,7 +138,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.analyze_tag(cast(Tag, element), doc) self.analyze_tag(cast(Tag, element), doc)
except Exception as exc_child: except Exception as exc_child:
_log.error( _log.error(
f"Error processing child from tag {tag.name}: {exc_child!r}" f"Error processing child from tag {tag.name}:\n{traceback.format_exc()}"
) )
raise exc_child raise exc_child
elif isinstance(element, NavigableString) and not isinstance( elif isinstance(element, NavigableString) and not isinstance(
@ -390,46 +391,64 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
_log.debug(f"list-item has no text: {element}") _log.debug(f"list-item has no text: {element}")
@staticmethod @staticmethod
def parse_table_data(element: Tag) -> Optional[TableData]: def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
nested_tables = element.find("table") nested_tables = element.find("table")
if nested_tables is not None: if nested_tables is not None:
_log.debug("Skipping nested table.") _log.debug("Skipping nested table.")
return None return None
# Count the number of rows (number of <tr> elements) # Find the number of rows and columns (taking into account spans)
num_rows = len(element("tr")) num_rows = 0
# Find the number of columns (taking into account colspan)
num_cols = 0 num_cols = 0
for row in element("tr"): for row in element("tr"):
col_count = 0 col_count = 0
is_row_header = True
if not isinstance(row, Tag): if not isinstance(row, Tag):
continue continue
for cell in row(["td", "th"]): for cell in row(["td", "th"]):
if not isinstance(row, Tag): if not isinstance(row, Tag):
continue continue
val = cast(Tag, cell).get("colspan", "1") cell_tag = cast(Tag, cell)
val = cell_tag.get("colspan", "1")
colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1 colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
col_count += colspan col_count += colspan
if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
is_row_header = False
num_cols = max(num_cols, col_count) num_cols = max(num_cols, col_count)
if not is_row_header:
num_rows += 1
_log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)] grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[]) data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
# Iterate over the rows in the table # Iterate over the rows in the table
for row_idx, row in enumerate(element("tr")): start_row_span = 0
row_idx = -1
for row in element("tr"):
if not isinstance(row, Tag): if not isinstance(row, Tag):
continue continue
# For each row, find all the column cells (both <td> and <th>) # For each row, find all the column cells (both <td> and <th>)
cells = row(["td", "th"]) cells = row(["td", "th"])
# Check if each cell in the row is a header -> means it is a column header # Check if cell is in a column header or row header
col_header = True col_header = True
row_header = True
for html_cell in cells: for html_cell in cells:
if isinstance(html_cell, Tag) and html_cell.name == "td": if isinstance(html_cell, Tag):
col_header = False if html_cell.name == "td":
col_header = False
row_header = False
elif html_cell.get("rowspan") is None:
row_header = False
if not row_header:
row_idx += 1
start_row_span = 0
else:
start_row_span += 1
# Extract the text content of each cell # Extract the text content of each cell
col_idx = 0 col_idx = 0
@ -460,19 +479,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if isinstance(row_val, str) and row_val.isnumeric() if isinstance(row_val, str) and row_val.isnumeric()
else 1 else 1
) )
if row_header:
while grid[row_idx][col_idx] is not None: row_span -= 1
while (
col_idx < num_cols
and grid[row_idx + start_row_span][col_idx] is not None
):
col_idx += 1 col_idx += 1
for r in range(row_span): for r in range(start_row_span, start_row_span + row_span):
for c in range(col_span): for c in range(col_span):
grid[row_idx + r][col_idx + c] = text if row_idx + r < num_rows and col_idx + c < num_cols:
grid[row_idx + r][col_idx + c] = text
table_cell = TableCell( table_cell = TableCell(
text=text, text=text,
row_span=row_span, row_span=row_span,
col_span=col_span, col_span=col_span,
start_row_offset_idx=row_idx, start_row_offset_idx=start_row_span + row_idx,
end_row_offset_idx=row_idx + row_span, end_row_offset_idx=start_row_span + row_idx + row_span,
start_col_offset_idx=col_idx, start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span, end_col_offset_idx=col_idx + col_span,
column_header=col_header, column_header=col_header,

View File

@ -0,0 +1,145 @@
<!DOCTYPE html>
<html>
<head>
<style>
table,
th,
td {
border: 1px solid black;
}
</style>
</head>
<body>
<h2>Pivot table with 1 row header</h2>
<table>
<tr>
<th>Year</th>
<th>Month</th>
<th>Revenue</th>
<th>Cost</th>
</tr>
<tr>
<th rowspan="6">2025</th>
</tr>
<tr>
<td>January</td>
<td>$134</td>
<td>$162</td>
</tr>
<tr>
<td>February</td>
<td>$150</td>
<td>$155</td>
</tr>
<tr>
<td>March</td>
<td>$160</td>
<td>$143</td>
</tr>
<tr>
<td>April</td>
<td>$210</td>
<td>$150</td>
</tr>
<tr>
<td>May</td>
<td>$280</td>
<td>$120</td>
</tr>
</table>
<h2>Pivot table with 2 row headers</h2>
<table>
<tr>
<th>Year</th>
<th>Quarter</th>
<th>Month</th>
<th>Revenue</th>
<th>Cost</th>
</tr>
<tr>
<th rowspan="7">2025</th>
<th rowspan="4">Q1</th>
</tr>
<tr>
<td>January</td>
<td>$134</td>
<td>$162</td>
</tr>
<tr>
<td>February</td>
<td>$150</td>
<td>$155</td>
</tr>
<tr>
<td>March</td>
<td>$160</td>
<td>$143</td>
</tr>
<tr>
<th rowspan="3">Q2</th>
</tr>
<tr>
<td>April</td>
<td>$210</td>
<td>$150</td>
</tr>
<tr>
<td>May</td>
<td>$280</td>
<td>$120</td>
</tr>
</table>
<h2>Equivalent pivot table</h2>
<table>
<tr>
<th>Year</th>
<th>Quarter</th>
<th>Month</th>
<th>Revenue</th>
<th>Cost</th>
</tr>
<tr>
<th rowspan="8">2025</th>
<th rowspan="4">Q1</th>
</tr>
<tr>
<td>January</td>
<td>$134</td>
<td>$162</td>
</tr>
<tr>
<td>February</td>
<td>$150</td>
<td>$155</td>
</tr>
<tr>
<td>March</td>
<td>$160</td>
<td>$143</td>
</tr>
<tr>
<th rowspan="3">Q2</th>
</tr>
<tr>
<td>April</td>
<td>$210</td>
<td>$150</td>
</tr>
<tr>
<td>May</td>
<td>$280</td>
<td>$120</td>
</tr>
</table>
</body>
</html>