fix(HTML): handle row spans in header rows (#1536)
* chore(HTML): log the stacktrace of errors Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * fix(HTML): handle row headers like in pivot tables Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
3220a592e7
commit
776e7ecf9a
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
import traceback
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Final, Optional, Union, cast
|
||||
@ -137,7 +138,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.analyze_tag(cast(Tag, element), doc)
|
||||
except Exception as exc_child:
|
||||
_log.error(
|
||||
f"Error processing child from tag {tag.name}: {exc_child!r}"
|
||||
f"Error processing child from tag {tag.name}:\n{traceback.format_exc()}"
|
||||
)
|
||||
raise exc_child
|
||||
elif isinstance(element, NavigableString) and not isinstance(
|
||||
@ -390,46 +391,64 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
_log.debug(f"list-item has no text: {element}")
|
||||
|
||||
@staticmethod
|
||||
def parse_table_data(element: Tag) -> Optional[TableData]:
|
||||
def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
|
||||
nested_tables = element.find("table")
|
||||
if nested_tables is not None:
|
||||
_log.debug("Skipping nested table.")
|
||||
return None
|
||||
|
||||
# Count the number of rows (number of <tr> elements)
|
||||
num_rows = len(element("tr"))
|
||||
|
||||
# Find the number of columns (taking into account colspan)
|
||||
# Find the number of rows and columns (taking into account spans)
|
||||
num_rows = 0
|
||||
num_cols = 0
|
||||
for row in element("tr"):
|
||||
col_count = 0
|
||||
is_row_header = True
|
||||
if not isinstance(row, Tag):
|
||||
continue
|
||||
for cell in row(["td", "th"]):
|
||||
if not isinstance(row, Tag):
|
||||
continue
|
||||
val = cast(Tag, cell).get("colspan", "1")
|
||||
cell_tag = cast(Tag, cell)
|
||||
val = cell_tag.get("colspan", "1")
|
||||
colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
|
||||
col_count += colspan
|
||||
if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
|
||||
is_row_header = False
|
||||
num_cols = max(num_cols, col_count)
|
||||
if not is_row_header:
|
||||
num_rows += 1
|
||||
|
||||
_log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
|
||||
|
||||
grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
||||
|
||||
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
||||
|
||||
# Iterate over the rows in the table
|
||||
for row_idx, row in enumerate(element("tr")):
|
||||
start_row_span = 0
|
||||
row_idx = -1
|
||||
for row in element("tr"):
|
||||
if not isinstance(row, Tag):
|
||||
continue
|
||||
|
||||
# For each row, find all the column cells (both <td> and <th>)
|
||||
cells = row(["td", "th"])
|
||||
|
||||
# Check if each cell in the row is a header -> means it is a column header
|
||||
# Check if cell is in a column header or row header
|
||||
col_header = True
|
||||
row_header = True
|
||||
for html_cell in cells:
|
||||
if isinstance(html_cell, Tag) and html_cell.name == "td":
|
||||
col_header = False
|
||||
if isinstance(html_cell, Tag):
|
||||
if html_cell.name == "td":
|
||||
col_header = False
|
||||
row_header = False
|
||||
elif html_cell.get("rowspan") is None:
|
||||
row_header = False
|
||||
if not row_header:
|
||||
row_idx += 1
|
||||
start_row_span = 0
|
||||
else:
|
||||
start_row_span += 1
|
||||
|
||||
# Extract the text content of each cell
|
||||
col_idx = 0
|
||||
@ -460,19 +479,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if isinstance(row_val, str) and row_val.isnumeric()
|
||||
else 1
|
||||
)
|
||||
|
||||
while grid[row_idx][col_idx] is not None:
|
||||
if row_header:
|
||||
row_span -= 1
|
||||
while (
|
||||
col_idx < num_cols
|
||||
and grid[row_idx + start_row_span][col_idx] is not None
|
||||
):
|
||||
col_idx += 1
|
||||
for r in range(row_span):
|
||||
for r in range(start_row_span, start_row_span + row_span):
|
||||
for c in range(col_span):
|
||||
grid[row_idx + r][col_idx + c] = text
|
||||
if row_idx + r < num_rows and col_idx + c < num_cols:
|
||||
grid[row_idx + r][col_idx + c] = text
|
||||
|
||||
table_cell = TableCell(
|
||||
text=text,
|
||||
row_span=row_span,
|
||||
col_span=col_span,
|
||||
start_row_offset_idx=row_idx,
|
||||
end_row_offset_idx=row_idx + row_span,
|
||||
start_row_offset_idx=start_row_span + row_idx,
|
||||
end_row_offset_idx=start_row_span + row_idx + row_span,
|
||||
start_col_offset_idx=col_idx,
|
||||
end_col_offset_idx=col_idx + col_span,
|
||||
column_header=col_header,
|
||||
|
145
tests/data/html/example_8.html
Normal file
145
tests/data/html/example_8.html
Normal file
@ -0,0 +1,145 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<style>
|
||||
table,
|
||||
th,
|
||||
td {
|
||||
border: 1px solid black;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<h2>Pivot table with with 1 row header</h2>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th>Year</th>
|
||||
<th>Month</th>
|
||||
<th>Revenue</th>
|
||||
<th>Cost</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th rowspan="6">2025</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>January</td>
|
||||
<td>$134</td>
|
||||
<td>$162</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>February</td>
|
||||
<td>$150</td>
|
||||
<td>$155</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>March</td>
|
||||
<td>$160</td>
|
||||
<td>$143</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>April</td>
|
||||
<td>$210</td>
|
||||
<td>$150</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>May</td>
|
||||
<td>$280</td>
|
||||
<td>$120</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<h2>Pivot table with 2 row headers</h2>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th>Year</th>
|
||||
<th>Quarter</th>
|
||||
<th>Month</th>
|
||||
<th>Revenue</th>
|
||||
<th>Cost</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th rowspan="7">2025</th>
|
||||
<th rowspan="4">Q1</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>January</td>
|
||||
<td>$134</td>
|
||||
<td>$162</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>February</td>
|
||||
<td>$150</td>
|
||||
<td>$155</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>March</td>
|
||||
<td>$160</td>
|
||||
<td>$143</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th rowspan="3">Q2</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>April</td>
|
||||
<td>$210</td>
|
||||
<td>$150</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>May</td>
|
||||
<td>$280</td>
|
||||
<td>$120</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<h2>Equivalent pivot table</h2>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th>Year</th>
|
||||
<th>Quarter</th>
|
||||
<th>Month</th>
|
||||
<th>Revenue</th>
|
||||
<th>Cost</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th rowspan="8">2025</th>
|
||||
<th rowspan="4">Q1</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>January</td>
|
||||
<td>$134</td>
|
||||
<td>$162</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>February</td>
|
||||
<td>$150</td>
|
||||
<td>$155</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>March</td>
|
||||
<td>$160</td>
|
||||
<td>$143</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th rowspan="3">Q2</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>April</td>
|
||||
<td>$210</td>
|
||||
<td>$150</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>May</td>
|
||||
<td>$280</td>
|
||||
<td>$120</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
Loading…
Reference in New Issue
Block a user