refactor: upgrade BeautifulSoup4 with type hints (#999)
* refactor: upgrade BeautifulSoup4 with type hints Upgrade dependency library BeautifulSoup4 to 4.13.3 (with type hints). Refactor backends using BeautifulSoup4 to comply with type hints. Apply style simplifications and improvements for consistency. Remove variables and functions that are never used. Remove code duplication between backends for parsing HTML tables. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * build: allow beautifulsoup4 version 4.12.3 Allow older version of beautifulsoup4 and ensure compatibility. Update library dependencies. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
75db61127c
commit
7450050ace
@@ -4,7 +4,7 @@ from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Final, Optional, Union
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
@@ -12,14 +12,13 @@ from docling_core.types.doc import (
|
||||
GroupItem,
|
||||
GroupLabel,
|
||||
NodeItem,
|
||||
TableCell,
|
||||
TableData,
|
||||
TextItem,
|
||||
)
|
||||
from lxml import etree
|
||||
from typing_extensions import TypedDict, override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
@@ -540,71 +539,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
) -> None:
|
||||
soup = BeautifulSoup(table_xml_component["content"], "html.parser")
|
||||
table_tag = soup.find("table")
|
||||
|
||||
nested_tables = table_tag.find("table")
|
||||
if nested_tables:
|
||||
_log.warning(f"Skipping nested table in {str(self.file)}")
|
||||
if not isinstance(table_tag, Tag):
|
||||
return
|
||||
|
||||
# Count the number of rows (number of <tr> elements)
|
||||
num_rows = len(table_tag.find_all("tr"))
|
||||
|
||||
# Find the number of columns (taking into account colspan)
|
||||
num_cols = 0
|
||||
for row in table_tag.find_all("tr"):
|
||||
col_count = 0
|
||||
for cell in row.find_all(["td", "th"]):
|
||||
colspan = int(cell.get("colspan", 1))
|
||||
col_count += colspan
|
||||
num_cols = max(num_cols, col_count)
|
||||
|
||||
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
||||
|
||||
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
||||
|
||||
# Iterate over the rows in the table
|
||||
for row_idx, row in enumerate(table_tag.find_all("tr")):
|
||||
# For each row, find all the column cells (both <td> and <th>)
|
||||
cells = row.find_all(["td", "th"])
|
||||
|
||||
# Check if each cell in the row is a header -> means it is a column header
|
||||
col_header = True
|
||||
for j, html_cell in enumerate(cells):
|
||||
if html_cell.name == "td":
|
||||
col_header = False
|
||||
|
||||
# Extract and print the text content of each cell
|
||||
col_idx = 0
|
||||
for _, html_cell in enumerate(cells):
|
||||
# extract inline formulas
|
||||
for formula in html_cell.find_all("inline-formula"):
|
||||
math_parts = formula.text.split("$$")
|
||||
if len(math_parts) == 3:
|
||||
math_formula = f"$${math_parts[1]}$$"
|
||||
formula.replaceWith(math_formula)
|
||||
text = html_cell.text
|
||||
|
||||
col_span = int(html_cell.get("colspan", 1))
|
||||
row_span = int(html_cell.get("rowspan", 1))
|
||||
|
||||
while grid[row_idx][col_idx] is not None:
|
||||
col_idx += 1
|
||||
for r in range(row_span):
|
||||
for c in range(col_span):
|
||||
grid[row_idx + r][col_idx + c] = text
|
||||
|
||||
cell = TableCell(
|
||||
text=text,
|
||||
row_span=row_span,
|
||||
col_span=col_span,
|
||||
start_row_offset_idx=row_idx,
|
||||
end_row_offset_idx=row_idx + row_span,
|
||||
start_col_offset_idx=col_idx,
|
||||
end_col_offset_idx=col_idx + col_span,
|
||||
col_header=col_header,
|
||||
row_header=((not col_header) and html_cell.name == "th"),
|
||||
)
|
||||
data.table_cells.append(cell)
|
||||
data = HTMLDocumentBackend.parse_table_data(table_tag)
|
||||
|
||||
# TODO: format label vs caption once styling is supported
|
||||
label = table_xml_component["label"]
|
||||
@@ -616,7 +554,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
else None
|
||||
)
|
||||
|
||||
doc.add_table(data=data, parent=parent, caption=table_caption)
|
||||
if data is not None:
|
||||
doc.add_table(data=data, parent=parent, caption=table_caption)
|
||||
|
||||
return
|
||||
|
||||
@@ -673,7 +612,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
def _walk_linear(
|
||||
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
|
||||
) -> str:
|
||||
# _log.debug(f"Walking on {node.tag} with {len(list(node))} children")
|
||||
skip_tags = ["term"]
|
||||
flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
|
||||
new_parent: NodeItem = parent
|
||||
|
||||
Reference in New Issue
Block a user