fix(HTML): handle row spans in header rows (#1536)
* chore(HTML): log the stacktrace of errors

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* fix(HTML): handle row headers like in pivot tables

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
3220a592e7
commit
776e7ecf9a
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
import traceback
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Final, Optional, Union, cast
|
from typing import Final, Optional, Union, cast
|
||||||
@ -137,7 +138,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.analyze_tag(cast(Tag, element), doc)
|
self.analyze_tag(cast(Tag, element), doc)
|
||||||
except Exception as exc_child:
|
except Exception as exc_child:
|
||||||
_log.error(
|
_log.error(
|
||||||
f"Error processing child from tag {tag.name}: {exc_child!r}"
|
f"Error processing child from tag {tag.name}:\n{traceback.format_exc()}"
|
||||||
)
|
)
|
||||||
raise exc_child
|
raise exc_child
|
||||||
elif isinstance(element, NavigableString) and not isinstance(
|
elif isinstance(element, NavigableString) and not isinstance(
|
||||||
@ -390,46 +391,64 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
_log.debug(f"list-item has no text: {element}")
|
_log.debug(f"list-item has no text: {element}")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_table_data(element: Tag) -> Optional[TableData]:
|
def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
|
||||||
nested_tables = element.find("table")
|
nested_tables = element.find("table")
|
||||||
if nested_tables is not None:
|
if nested_tables is not None:
|
||||||
_log.debug("Skipping nested table.")
|
_log.debug("Skipping nested table.")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Count the number of rows (number of <tr> elements)
|
# Find the number of rows and columns (taking into account spans)
|
||||||
num_rows = len(element("tr"))
|
num_rows = 0
|
||||||
|
|
||||||
# Find the number of columns (taking into account colspan)
|
|
||||||
num_cols = 0
|
num_cols = 0
|
||||||
for row in element("tr"):
|
for row in element("tr"):
|
||||||
col_count = 0
|
col_count = 0
|
||||||
|
is_row_header = True
|
||||||
if not isinstance(row, Tag):
|
if not isinstance(row, Tag):
|
||||||
continue
|
continue
|
||||||
for cell in row(["td", "th"]):
|
for cell in row(["td", "th"]):
|
||||||
if not isinstance(row, Tag):
|
if not isinstance(row, Tag):
|
||||||
continue
|
continue
|
||||||
val = cast(Tag, cell).get("colspan", "1")
|
cell_tag = cast(Tag, cell)
|
||||||
|
val = cell_tag.get("colspan", "1")
|
||||||
colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
|
colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
|
||||||
col_count += colspan
|
col_count += colspan
|
||||||
|
if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
|
||||||
|
is_row_header = False
|
||||||
num_cols = max(num_cols, col_count)
|
num_cols = max(num_cols, col_count)
|
||||||
|
if not is_row_header:
|
||||||
|
num_rows += 1
|
||||||
|
|
||||||
|
_log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
|
||||||
|
|
||||||
grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
||||||
|
|
||||||
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
||||||
|
|
||||||
# Iterate over the rows in the table
|
# Iterate over the rows in the table
|
||||||
for row_idx, row in enumerate(element("tr")):
|
start_row_span = 0
|
||||||
|
row_idx = -1
|
||||||
|
for row in element("tr"):
|
||||||
if not isinstance(row, Tag):
|
if not isinstance(row, Tag):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# For each row, find all the column cells (both <td> and <th>)
|
# For each row, find all the column cells (both <td> and <th>)
|
||||||
cells = row(["td", "th"])
|
cells = row(["td", "th"])
|
||||||
|
|
||||||
# Check if each cell in the row is a header -> means it is a column header
|
# Check if cell is in a column header or row header
|
||||||
col_header = True
|
col_header = True
|
||||||
|
row_header = True
|
||||||
for html_cell in cells:
|
for html_cell in cells:
|
||||||
if isinstance(html_cell, Tag) and html_cell.name == "td":
|
if isinstance(html_cell, Tag):
|
||||||
col_header = False
|
if html_cell.name == "td":
|
||||||
|
col_header = False
|
||||||
|
row_header = False
|
||||||
|
elif html_cell.get("rowspan") is None:
|
||||||
|
row_header = False
|
||||||
|
if not row_header:
|
||||||
|
row_idx += 1
|
||||||
|
start_row_span = 0
|
||||||
|
else:
|
||||||
|
start_row_span += 1
|
||||||
|
|
||||||
# Extract the text content of each cell
|
# Extract the text content of each cell
|
||||||
col_idx = 0
|
col_idx = 0
|
||||||
@ -460,19 +479,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if isinstance(row_val, str) and row_val.isnumeric()
|
if isinstance(row_val, str) and row_val.isnumeric()
|
||||||
else 1
|
else 1
|
||||||
)
|
)
|
||||||
|
if row_header:
|
||||||
while grid[row_idx][col_idx] is not None:
|
row_span -= 1
|
||||||
|
while (
|
||||||
|
col_idx < num_cols
|
||||||
|
and grid[row_idx + start_row_span][col_idx] is not None
|
||||||
|
):
|
||||||
col_idx += 1
|
col_idx += 1
|
||||||
for r in range(row_span):
|
for r in range(start_row_span, start_row_span + row_span):
|
||||||
for c in range(col_span):
|
for c in range(col_span):
|
||||||
grid[row_idx + r][col_idx + c] = text
|
if row_idx + r < num_rows and col_idx + c < num_cols:
|
||||||
|
grid[row_idx + r][col_idx + c] = text
|
||||||
|
|
||||||
table_cell = TableCell(
|
table_cell = TableCell(
|
||||||
text=text,
|
text=text,
|
||||||
row_span=row_span,
|
row_span=row_span,
|
||||||
col_span=col_span,
|
col_span=col_span,
|
||||||
start_row_offset_idx=row_idx,
|
start_row_offset_idx=start_row_span + row_idx,
|
||||||
end_row_offset_idx=row_idx + row_span,
|
end_row_offset_idx=start_row_span + row_idx + row_span,
|
||||||
start_col_offset_idx=col_idx,
|
start_col_offset_idx=col_idx,
|
||||||
end_col_offset_idx=col_idx + col_span,
|
end_col_offset_idx=col_idx + col_span,
|
||||||
column_header=col_header,
|
column_header=col_header,
|
||||||
|
145
tests/data/html/example_8.html
Normal file
145
tests/data/html/example_8.html
Normal file
@ -0,0 +1,145 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
|
||||||
|
<head>
|
||||||
|
<style>
|
||||||
|
table,
|
||||||
|
th,
|
||||||
|
td {
|
||||||
|
border: 1px solid black;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<h2>Pivot table with 1 row header</h2>
|
||||||
|
|
||||||
|
<table>
|
||||||
|
<tr>
|
||||||
|
<th>Year</th>
|
||||||
|
<th>Month</th>
|
||||||
|
<th>Revenue</th>
|
||||||
|
<th>Cost</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th rowspan="6">2025</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>January</td>
|
||||||
|
<td>$134</td>
|
||||||
|
<td>$162</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>February</td>
|
||||||
|
<td>$150</td>
|
||||||
|
<td>$155</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>March</td>
|
||||||
|
<td>$160</td>
|
||||||
|
<td>$143</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>April</td>
|
||||||
|
<td>$210</td>
|
||||||
|
<td>$150</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>May</td>
|
||||||
|
<td>$280</td>
|
||||||
|
<td>$120</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<h2>Pivot table with 2 row headers</h2>
|
||||||
|
|
||||||
|
<table>
|
||||||
|
<tr>
|
||||||
|
<th>Year</th>
|
||||||
|
<th>Quarter</th>
|
||||||
|
<th>Month</th>
|
||||||
|
<th>Revenue</th>
|
||||||
|
<th>Cost</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th rowspan="7">2025</th>
|
||||||
|
<th rowspan="4">Q1</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>January</td>
|
||||||
|
<td>$134</td>
|
||||||
|
<td>$162</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>February</td>
|
||||||
|
<td>$150</td>
|
||||||
|
<td>$155</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>March</td>
|
||||||
|
<td>$160</td>
|
||||||
|
<td>$143</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th rowspan="3">Q2</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>April</td>
|
||||||
|
<td>$210</td>
|
||||||
|
<td>$150</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>May</td>
|
||||||
|
<td>$280</td>
|
||||||
|
<td>$120</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<h2>Equivalent pivot table</h2>
|
||||||
|
|
||||||
|
<table>
|
||||||
|
<tr>
|
||||||
|
<th>Year</th>
|
||||||
|
<th>Quarter</th>
|
||||||
|
<th>Month</th>
|
||||||
|
<th>Revenue</th>
|
||||||
|
<th>Cost</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th rowspan="8">2025</th>
|
||||||
|
<th rowspan="4">Q1</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>January</td>
|
||||||
|
<td>$134</td>
|
||||||
|
<td>$162</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>February</td>
|
||||||
|
<td>$150</td>
|
||||||
|
<td>$155</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>March</td>
|
||||||
|
<td>$160</td>
|
||||||
|
<td>$143</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th rowspan="3">Q2</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>April</td>
|
||||||
|
<td>$210</td>
|
||||||
|
<td>$150</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>May</td>
|
||||||
|
<td>$280</td>
|
||||||
|
<td>$120</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
|
||||||
|
</html>
|
Loading…
Reference in New Issue
Block a user