refactor: upgrade BeautifulSoup4 with type hints (#999)

* refactor: upgrade BeautifulSoup4 with type hints

Upgrade dependency library BeautifulSoup4 to 4.13.3 (with type hints).
Refactor backends using BeautifulSoup4 to comply with type hints.
Apply style simplifications and improvements for consistency.
Remove variables and functions that are never used.
Remove code duplication between backends for parsing HTML tables.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* build: allow beautifulsoup4 version 4.12.3

Allow older version of beautifulsoup4 and ensure compatibility.
Update library dependencies.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-02-18 11:30:47 +01:00
committed by GitHub
parent 75db61127c
commit 7450050ace
8 changed files with 328 additions and 425 deletions

View File

@@ -14,7 +14,7 @@ from abc import ABC, abstractmethod
from enum import Enum, unique
from io import BytesIO
from pathlib import Path
from typing import Any, Final, Optional, Union
from typing import Final, Optional, Union
from bs4 import BeautifulSoup, Tag
from docling_core.types.doc import (
@@ -1406,6 +1406,10 @@ class XmlTable:
http://oasis-open.org/specs/soextblx.dtd
"""
class ColInfo(TypedDict):
ncols: int
colinfo: list[dict]
class MinColInfoType(TypedDict):
offset: list[int]
colwidth: list[int]
@@ -1425,7 +1429,7 @@ class XmlTable:
self.empty_text = ""
self._soup = BeautifulSoup(input, features="xml")
def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]:
def _create_tg_range(self, tgs: list[ColInfo]) -> dict[int, ColInfoType]:
"""Create a unified range along the table groups.
Args:
@@ -1532,19 +1536,26 @@ class XmlTable:
Returns:
A docling table object.
"""
tgs_align = []
tg_secs = table.find_all("tgroup")
tgs_align: list[XmlTable.ColInfo] = []
tg_secs = table("tgroup")
if tg_secs:
for tg_sec in tg_secs:
ncols = tg_sec.get("cols", None)
if ncols:
ncols = int(ncols)
tg_align = {"ncols": ncols, "colinfo": []}
cs_secs = tg_sec.find_all("colspec")
if not isinstance(tg_sec, Tag):
continue
col_val = tg_sec.get("cols")
ncols = (
int(col_val)
if isinstance(col_val, str) and col_val.isnumeric()
else 1
)
tg_align: XmlTable.ColInfo = {"ncols": ncols, "colinfo": []}
cs_secs = tg_sec("colspec")
if cs_secs:
for cs_sec in cs_secs:
colname = cs_sec.get("colname", None)
colwidth = cs_sec.get("colwidth", None)
if not isinstance(cs_sec, Tag):
continue
colname = cs_sec.get("colname")
colwidth = cs_sec.get("colwidth")
tg_align["colinfo"].append(
{"colname": colname, "colwidth": colwidth}
)
@@ -1565,16 +1576,23 @@ class XmlTable:
table_data: list[TableCell] = []
i_row_global = 0
is_row_empty: bool = True
tg_secs = table.find_all("tgroup")
tg_secs = table("tgroup")
if tg_secs:
for itg, tg_sec in enumerate(tg_secs):
if not isinstance(tg_sec, Tag):
continue
tg_range = tgs_range[itg]
row_secs = tg_sec.find_all(["row", "tr"])
row_secs = tg_sec(["row", "tr"])
if row_secs:
for row_sec in row_secs:
entry_secs = row_sec.find_all(["entry", "td"])
is_header: bool = row_sec.parent.name in ["thead"]
if not isinstance(row_sec, Tag):
continue
entry_secs = row_sec(["entry", "td"])
is_header: bool = (
row_sec.parent is not None
and row_sec.parent.name == "thead"
)
ncols = 0
local_row: list[TableCell] = []
@@ -1582,23 +1600,26 @@ class XmlTable:
if entry_secs:
wrong_nbr_cols = False
for ientry, entry_sec in enumerate(entry_secs):
if not isinstance(entry_sec, Tag):
continue
text = entry_sec.get_text().strip()
# start-end
namest = entry_sec.attrs.get("namest", None)
nameend = entry_sec.attrs.get("nameend", None)
if isinstance(namest, str) and namest.isnumeric():
namest = int(namest)
else:
namest = ientry + 1
namest = entry_sec.get("namest")
nameend = entry_sec.get("nameend")
start = (
int(namest)
if isinstance(namest, str) and namest.isnumeric()
else ientry + 1
)
if isinstance(nameend, str) and nameend.isnumeric():
nameend = int(nameend)
end = int(nameend)
shift = 0
else:
nameend = ientry + 2
end = ientry + 2
shift = 1
if nameend > len(tg_range["cell_offst"]):
if end > len(tg_range["cell_offst"]):
wrong_nbr_cols = True
self.nbr_messages += 1
if self.nbr_messages <= self.max_nbr_messages:
@@ -1608,8 +1629,8 @@ class XmlTable:
break
range_ = [
tg_range["cell_offst"][namest - 1],
tg_range["cell_offst"][nameend - 1] - shift,
tg_range["cell_offst"][start - 1],
tg_range["cell_offst"][end - 1] - shift,
]
# add row and replicate cell if needed
@@ -1668,7 +1689,7 @@ class XmlTable:
A docling table data.
"""
section = self._soup.find("table")
if section is not None:
if isinstance(section, Tag):
table = self._parse_table(section)
if table.num_rows == 0 or table.num_cols == 0:
_log.warning("The parsed USPTO table is empty")