diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 88a315d..7c71690 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1,4 +1,5 @@ import logging +import traceback from io import BytesIO from pathlib import Path from typing import Final, Optional, Union, cast @@ -137,7 +138,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.analyze_tag(cast(Tag, element), doc) except Exception as exc_child: _log.error( - f"Error processing child from tag {tag.name}: {exc_child!r}" + f"Error processing child from tag {tag.name}:\n{traceback.format_exc()}" ) raise exc_child elif isinstance(element, NavigableString) and not isinstance( @@ -390,46 +391,64 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): _log.debug(f"list-item has no text: {element}") @staticmethod - def parse_table_data(element: Tag) -> Optional[TableData]: + def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901 nested_tables = element.find("table") if nested_tables is not None: _log.debug("Skipping nested table.") return None - # Count the number of rows (number of elements) - num_rows = len(element("tr")) - - # Find the number of columns (taking into account colspan) + # Find the number of rows and columns (taking into account spans) + num_rows = 0 num_cols = 0 for row in element("tr"): col_count = 0 + is_row_header = True if not isinstance(row, Tag): continue for cell in row(["td", "th"]): if not isinstance(row, Tag): continue - val = cast(Tag, cell).get("colspan", "1") + cell_tag = cast(Tag, cell) + val = cell_tag.get("colspan", "1") colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1 col_count += colspan + if cell_tag.name == "td" or cell_tag.get("rowspan") is None: + is_row_header = False num_cols = max(num_cols, col_count) + if not is_row_header: + num_rows += 1 + + _log.debug(f"The table has {num_rows} rows and {num_cols} cols.") grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)] data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[]) # Iterate over the rows in the table - for row_idx, row in enumerate(element("tr")): + start_row_span = 0 + row_idx = -1 + for row in element("tr"): if not isinstance(row, Tag): continue # For each row, find all the column cells (both and ) cells = row(["td", "th"]) - # Check if each cell in the row is a header -> means it is a column header + # Check if cell is in a column header or row header col_header = True + row_header = True for html_cell in cells: - if isinstance(html_cell, Tag) and html_cell.name == "td": - col_header = False + if isinstance(html_cell, Tag): + if html_cell.name == "td": + col_header = False + row_header = False + elif html_cell.get("rowspan") is None: + row_header = False + if not row_header: + row_idx += 1 + start_row_span = 0 + else: + start_row_span += 1 # Extract the text content of each cell col_idx = 0 @@ -460,19 +479,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if isinstance(row_val, str) and row_val.isnumeric() else 1 ) - - while grid[row_idx][col_idx] is not None: + if row_header: + row_span -= 1 + while ( + col_idx < num_cols + and grid[row_idx + start_row_span][col_idx] is not None + ): col_idx += 1 - for r in range(row_span): + for r in range(start_row_span, start_row_span + row_span): for c in range(col_span): - grid[row_idx + r][col_idx + c] = text + if row_idx + r < num_rows and col_idx + c < num_cols: + grid[row_idx + r][col_idx + c] = text table_cell = TableCell( text=text, row_span=row_span, col_span=col_span, - start_row_offset_idx=row_idx, - end_row_offset_idx=row_idx + row_span, + start_row_offset_idx=start_row_span + row_idx, + end_row_offset_idx=start_row_span + row_idx + row_span, start_col_offset_idx=col_idx, end_col_offset_idx=col_idx + col_span, column_header=col_header, diff --git a/tests/data/html/example_8.html b/tests/data/html/example_8.html new file mode 100644 index 0000000..bcd771b --- /dev/null +++ b/tests/data/html/example_8.html @@ -0,0 +1,145 @@ + + + + + + + + + +

Pivot table with with 1 row header

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
YearMonthRevenueCost
2025
January$134$162
February$150$155
March$160$143
April$210$150
May$280$120
+ +

Pivot table with 2 row headers

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
YearQuarterMonthRevenueCost
2025Q1
January$134$162
February$150$155
March$160$143
Q2
April$210$150
May$280$120
+ +

Equivalent pivot table

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
YearQuarterMonthRevenueCost
2025Q1
January$134$162
February$150$155
March$160$143
Q2
April$210$150
May$280$120
+ + + + \ No newline at end of file