diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 286dfbf..234e5da 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -1,9 +1,9 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Optional, Set, Union
+from typing import Optional, Union, cast
 
-from bs4 import BeautifulSoup, Tag
+from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
 from docling_core.types.doc import (
     DocItemLabel,
     DoclingDocument,
@@ -12,6 +12,7 @@ from docling_core.types.doc import (
     TableCell,
     TableData,
 )
+from typing_extensions import override
 
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
@@ -21,6 +22,7 @@ _log = logging.getLogger(__name__)
 
 
 class HTMLDocumentBackend(DeclarativeDocumentBackend):
+    @override
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
         _log.debug("About to init HTML backend...")
@@ -48,13 +50,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 f"Could not initialize HTML backend for file with hash {self.document_hash}."
             ) from e
 
+    @override
     def is_valid(self) -> bool:
         return self.soup is not None
 
     @classmethod
+    @override
     def supports_pagination(cls) -> bool:
         return False
 
+    @override
     def unload(self):
         if isinstance(self.path_or_stream, BytesIO):
             self.path_or_stream.close()
@@ -62,9 +67,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.path_or_stream = None
 
     @classmethod
-    def supported_formats(cls) -> Set[InputFormat]:
+    @override
+    def supported_formats(cls) -> set[InputFormat]:
         return {InputFormat.HTML}
 
+    @override
     def convert(self) -> DoclingDocument:
         # access self.path_or_stream to load stuff
         origin = DocumentOrigin(
@@ -80,98 +87,78 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             assert self.soup is not None
             content = self.soup.body or self.soup
             # Replace <br> tags with newline characters
-            for br in content.find_all("br"):
-                br.replace_with("\n")
-            doc = self.walk(content, doc)
+            for br in content("br"):
+                br.replace_with(NavigableString("\n"))
+            self.walk(content, doc)
         else:
             raise RuntimeError(
                 f"Cannot convert doc with {self.document_hash} because the backend failed to init."
             )
         return doc
 
-    def walk(self, element: Tag, doc: DoclingDocument):
-        try:
-            # Iterate over elements in the body of the document
-            for idx, element in enumerate(element.children):
+    def walk(self, tag: Tag, doc: DoclingDocument) -> None:
+        # Dispatch each direct Tag child; non-Tag nodes (e.g. text) are skipped here
+        for element in tag.children:
+            if isinstance(element, Tag):
                 try:
-                    self.analyse_element(element, idx, doc)
+                    self.analyze_tag(element, doc)
                 except Exception as exc_child:
-
-                    _log.error(" -> error treating child: ", exc_child)
-                    _log.error(" => element: ", element, "\n")
+                    _log.error(
+                        f"Error processing child from tag {tag.name}: {exc_child}"
+                    )
                     raise exc_child
 
-        except Exception as exc:
-            pass
+        return
 
-        return doc
-
-    def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
-        """
-        if element.name!=None:
-            _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
-        """
-
-        if element.name in self.labels:
-            self.labels[element.name] += 1
+    def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
+        if tag.name in self.labels:
+            self.labels[tag.name] += 1
         else:
-            self.labels[element.name] = 1
+            self.labels[tag.name] = 1
 
-        if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
-            self.handle_header(element, idx, doc)
-        elif element.name in ["p"]:
-            self.handle_paragraph(element, idx, doc)
-        elif element.name in ["pre"]:
-            self.handle_code(element, idx, doc)
-        elif element.name in ["ul", "ol"]:
-            self.handle_list(element, idx, doc)
-        elif element.name in ["li"]:
-            self.handle_listitem(element, idx, doc)
-        elif element.name == "table":
-            self.handle_table(element, idx, doc)
-        elif element.name == "figure":
-            self.handle_figure(element, idx, doc)
-        elif element.name == "img":
-            self.handle_image(element, idx, doc)
+        if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
+            self.handle_header(tag, doc)
+        elif tag.name in ["p"]:
+            self.handle_paragraph(tag, doc)
+        elif tag.name in ["pre"]:
+            self.handle_code(tag, doc)
+        elif tag.name in ["ul", "ol"]:
+            self.handle_list(tag, doc)
+        elif tag.name in ["li"]:
+            self.handle_list_item(tag, doc)
+        elif tag.name == "table":
+            self.handle_table(tag, doc)
+        elif tag.name == "figure":
+            self.handle_figure(tag, doc)
+        elif tag.name == "img":
+            self.handle_image(doc)
         else:
-            self.walk(element, doc)
+            self.walk(tag, doc)
 
-    def get_direct_text(self, item: Tag):
-        """Get the direct text of the <li> element (ignoring nested lists)."""
-        text = item.find(string=True, recursive=False)
-        if isinstance(text, str):
-            return text.strip()
+    def get_text(self, item: PageElement) -> str:
+        """Get the text content of a tag."""
+        parts: list[str] = self.extract_text_recursively(item)
 
-        return ""
+        return "".join(parts) + " "
 
     # Function to recursively extract text from all child nodes
-    def extract_text_recursively(self, item: Tag):
-        result = []
+    def extract_text_recursively(self, item: PageElement) -> list[str]:
+        result: list[str] = []
 
-        if isinstance(item, str):
+        if isinstance(item, NavigableString):
             return [item]
 
-        if item.name not in ["ul", "ol"]:
-            try:
-                # Iterate over the children (and their text and tails)
-                for child in item:
-                    try:
-                        # Recursively get the child's text content
-                        result.extend(self.extract_text_recursively(child))
-                    except:
-                        pass
-            except:
-                _log.warn("item has no children")
-                pass
+        tag = cast(Tag, item)
+        if tag.name not in ["ul", "ol"]:
+            for child in tag:
+                # Recursively get the child's text content
+                result.extend(self.extract_text_recursively(child))
 
-        return "".join(result) + " "
+        return ["".join(result) + " "]
 
-    def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
+    def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles header tags (h1, h2, etc.)."""
         hlevel = int(element.name.replace("h", ""))
-        slevel = hlevel - 1
-
-        label = DocItemLabel.SECTION_HEADER
         text = element.text.strip()
 
         if hlevel == 1:
@@ -197,7 +184,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             elif hlevel < self.level:
 
                 # remove the tail
-                for key, val in self.parents.items():
+                for key in self.parents.keys():
                     if key > hlevel:
                         self.parents[key] = None
                 self.level = hlevel
@@ -208,27 +195,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 level=hlevel,
             )
 
-    def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
+    def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles monospace code snippets (pre)."""
         if element.text is None:
             return
         text = element.text.strip()
-        label = DocItemLabel.CODE
-        if len(text) == 0:
-            return
-        doc.add_code(parent=self.parents[self.level], text=text)
+        if text:
+            doc.add_code(parent=self.parents[self.level], text=text)
 
-    def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
+    def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles paragraph tags (p)."""
         if element.text is None:
             return
         text = element.text.strip()
         label = DocItemLabel.PARAGRAPH
-        if len(text) == 0:
-            return
-        doc.add_text(parent=self.parents[self.level], label=label, text=text)
+        if text:
+            doc.add_text(parent=self.parents[self.level], label=label, text=text)
 
-    def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
+    def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles list tags (ul, ol) and their list items."""
 
         if element.name == "ul":
@@ -250,18 +234,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.parents[self.level + 1] = None
         self.level -= 1
 
-    def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
+    def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles listitem tags (li)."""
-        nested_lists = element.find(["ul", "ol"])
+        nested_list = element.find(["ul", "ol"])
 
         parent_list_label = self.parents[self.level].label
         index_in_list = len(self.parents[self.level].children) + 1
 
-        if nested_lists:
-            name = element.name
+        if nested_list:
             # Text in list item can be hidden within hierarchy, hence
             # we need to extract it recursively
-            text = self.extract_text_recursively(element)
+            text: str = self.get_text(element)
             # Flatten text, remove break lines:
             text = text.replace("\n", "").replace("\r", "")
             text = " ".join(text.split()).strip()
@@ -287,7 +270,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self.parents[self.level + 1] = None
             self.level -= 1
 
-        elif isinstance(element.text, str):
+        elif element.text.strip():
             text = element.text.strip()
 
             marker = ""
@@ -302,59 +285,79 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 parent=self.parents[self.level],
             )
         else:
-            _log.warn("list-item has no text: ", element)
-
-    def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
-        """Handles table tags."""
+            _log.warning(f"list-item has no text: {element}")
 
+    @staticmethod
+    def parse_table_data(element: Tag) -> Optional[TableData]:
         nested_tables = element.find("table")
         if nested_tables is not None:
-            _log.warn("detected nested tables: skipping for now")
-            return
+            _log.warning("Skipping nested table.")
+            return None
 
         # Count the number of rows (number of <tr> elements)
-        num_rows = len(element.find_all("tr"))
+        num_rows = len(element("tr"))
 
         # Find the number of columns (taking into account colspan)
         num_cols = 0
-        for row in element.find_all("tr"):
+        for row in element("tr"):
             col_count = 0
-            for cell in row.find_all(["td", "th"]):
-                colspan = int(cell.get("colspan", 1))
+            if not isinstance(row, Tag):
+                continue
+            for cell in row(["td", "th"]):
+                if not isinstance(cell, Tag):
+                    continue
+                val = cell.get("colspan", "1")
+                colspan = int(val) if (isinstance(val, str) and val.isdecimal()) else 1
                 col_count += colspan
             num_cols = max(num_cols, col_count)
 
-        grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
+        grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
 
         data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
 
         # Iterate over the rows in the table
-        for row_idx, row in enumerate(element.find_all("tr")):
+        for row_idx, row in enumerate(element("tr")):
+            if not isinstance(row, Tag):
+                continue
 
             # For each row, find all the column cells (both <td> and <th>)
-            cells = row.find_all(["td", "th"])
+            cells = row(["td", "th"])
 
             # Check if each cell in the row is a header -> means it is a column header
             col_header = True
-            for j, html_cell in enumerate(cells):
-                if html_cell.name == "td":
+            for html_cell in cells:
+                if isinstance(html_cell, Tag) and html_cell.name == "td":
                     col_header = False
 
+            # Extract the text content of each cell
             col_idx = 0
-            # Extract and print the text content of each cell
-            for _, html_cell in enumerate(cells):
+            for html_cell in cells:
+                if not isinstance(html_cell, Tag):
+                    continue
 
+                # extract inline formulas
+                for formula in html_cell("inline-formula"):
+                    math_parts = formula.text.split("$$")
+                    if len(math_parts) == 3:
+                        math_formula = f"$${math_parts[1]}$$"
+                        formula.replace_with(NavigableString(math_formula))
+
+                # TODO: extract content correctly from table-cells with lists
                 text = html_cell.text
-                try:
-                    text = self.extract_table_cell_text(html_cell)
-                except Exception as exc:
-                    _log.warn("exception: ", exc)
-                    exit(-1)
 
                 # label = html_cell.name
-
-                col_span = int(html_cell.get("colspan", 1))
-                row_span = int(html_cell.get("rowspan", 1))
+                col_val = html_cell.get("colspan", "1")
+                col_span = (
+                    int(col_val)
+                    if isinstance(col_val, str) and col_val.isdecimal()
+                    else 1
+                )
+                row_val = html_cell.get("rowspan", "1")
+                row_span = (
+                    int(row_val)
+                    if isinstance(row_val, str) and row_val.isdecimal()
+                    else 1
+                )
 
                 while grid[row_idx][col_idx] is not None:
                     col_idx += 1
@@ -362,7 +365,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     for c in range(col_span):
                         grid[row_idx + r][col_idx + c] = text
 
-                cell = TableCell(
+                table_cell = TableCell(
                     text=text,
                     row_span=row_span,
                     col_span=col_span,
@@ -373,57 +376,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     col_header=col_header,
                     row_header=((not col_header) and html_cell.name == "th"),
                 )
-                data.table_cells.append(cell)
+                data.table_cells.append(table_cell)
 
-        doc.add_table(data=data, parent=self.parents[self.level])
+        return data
 
-    def get_list_text(self, list_element: Tag, level=0):
+    def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
+        """Handles table tags."""
+
+        table_data = HTMLDocumentBackend.parse_table_data(element)
+
+        if table_data is not None:
+            doc.add_table(data=table_data, parent=self.parents[self.level])
+
+    def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
         """Recursively extract text from <ul> or <ol> with proper indentation."""
         result = []
         bullet_char = "*"  # Default bullet character for unordered lists
 
         if list_element.name == "ol":  # For ordered lists, use numbers
-            for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
+            for i, li in enumerate(list_element("li", recursive=False), 1):
+                if not isinstance(li, Tag):
+                    continue
                 # Add numbering for ordered lists
                 result.append(f"{'    ' * level}{i}. {li.get_text(strip=True)}")
                 # Handle nested lists
                 nested_list = li.find(["ul", "ol"])
-                if nested_list:
+                if isinstance(nested_list, Tag):
                     result.extend(self.get_list_text(nested_list, level + 1))
         elif list_element.name == "ul":  # For unordered lists, use bullet points
-            for li in list_element.find_all("li", recursive=False):
+            for li in list_element("li", recursive=False):
+                if not isinstance(li, Tag):
+                    continue
                 # Add bullet points for unordered lists
                 result.append(
                     f"{'    ' * level}{bullet_char} {li.get_text(strip=True)}"
                 )
                 # Handle nested lists
                 nested_list = li.find(["ul", "ol"])
-                if nested_list:
+                if isinstance(nested_list, Tag):
                     result.extend(self.get_list_text(nested_list, level + 1))
 
         return result
 
-    def extract_table_cell_text(self, cell: Tag):
-        """Extract text from a table cell, including lists with indents."""
-        contains_lists = cell.find(["ul", "ol"])
-        if contains_lists is None:
-            return cell.text
-        else:
-            _log.debug(
-                "should extract the content correctly for table-cells with lists ..."
-            )
-            return cell.text
-
-    def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
+    def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles image tags (img)."""
 
         # Extract the image URI from the <img> tag
         # image_uri = root.xpath('//figure//img/@src')[0]
 
         contains_captions = element.find(["figcaption"])
-        if contains_captions is None:
+        if not isinstance(contains_captions, Tag):
             doc.add_picture(parent=self.parents[self.level], caption=None)
-
         else:
             texts = []
             for item in contains_captions:
@@ -437,6 +440,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 caption=fig_caption,
             )
 
-    def handle_image(self, element: Tag, idx, doc: DoclingDocument):
+    def handle_image(self, doc: DoclingDocument) -> None:
         """Handles image tags (img)."""
         doc.add_picture(parent=self.parents[self.level], caption=None)
diff --git a/docling/backend/xml/jats_backend.py b/docling/backend/xml/jats_backend.py
index 1d7091c..2409961 100755
--- a/docling/backend/xml/jats_backend.py
+++ b/docling/backend/xml/jats_backend.py
@@ -4,7 +4,7 @@ from io import BytesIO
 from pathlib import Path
 from typing import Final, Optional, Union
 
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
 from docling_core.types.doc import (
     DocItemLabel,
     DoclingDocument,
@@ -12,14 +12,13 @@ from docling_core.types.doc import (
     GroupItem,
     GroupLabel,
     NodeItem,
-    TableCell,
-    TableData,
     TextItem,
 )
 from lxml import etree
 from typing_extensions import TypedDict, override
 
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
 
@@ -540,71 +539,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
     ) -> None:
         soup = BeautifulSoup(table_xml_component["content"], "html.parser")
         table_tag = soup.find("table")
-
-        nested_tables = table_tag.find("table")
-        if nested_tables:
-            _log.warning(f"Skipping nested table in {str(self.file)}")
+        if not isinstance(table_tag, Tag):
             return
 
-        # Count the number of rows (number of <tr> elements)
-        num_rows = len(table_tag.find_all("tr"))
-
-        # Find the number of columns (taking into account colspan)
-        num_cols = 0
-        for row in table_tag.find_all("tr"):
-            col_count = 0
-            for cell in row.find_all(["td", "th"]):
-                colspan = int(cell.get("colspan", 1))
-                col_count += colspan
-            num_cols = max(num_cols, col_count)
-
-        grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
-
-        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
-
-        # Iterate over the rows in the table
-        for row_idx, row in enumerate(table_tag.find_all("tr")):
-            # For each row, find all the column cells (both <td> and <th>)
-            cells = row.find_all(["td", "th"])
-
-            # Check if each cell in the row is a header -> means it is a column header
-            col_header = True
-            for j, html_cell in enumerate(cells):
-                if html_cell.name == "td":
-                    col_header = False
-
-            # Extract and print the text content of each cell
-            col_idx = 0
-            for _, html_cell in enumerate(cells):
-                # extract inline formulas
-                for formula in html_cell.find_all("inline-formula"):
-                    math_parts = formula.text.split("$$")
-                    if len(math_parts) == 3:
-                        math_formula = f"$${math_parts[1]}$$"
-                        formula.replaceWith(math_formula)
-                text = html_cell.text
-
-                col_span = int(html_cell.get("colspan", 1))
-                row_span = int(html_cell.get("rowspan", 1))
-
-                while grid[row_idx][col_idx] is not None:
-                    col_idx += 1
-                for r in range(row_span):
-                    for c in range(col_span):
-                        grid[row_idx + r][col_idx + c] = text
-
-                cell = TableCell(
-                    text=text,
-                    row_span=row_span,
-                    col_span=col_span,
-                    start_row_offset_idx=row_idx,
-                    end_row_offset_idx=row_idx + row_span,
-                    start_col_offset_idx=col_idx,
-                    end_col_offset_idx=col_idx + col_span,
-                    col_header=col_header,
-                    row_header=((not col_header) and html_cell.name == "th"),
-                )
-                data.table_cells.append(cell)
+        data = HTMLDocumentBackend.parse_table_data(table_tag)
 
         # TODO: format label vs caption once styling is supported
         label = table_xml_component["label"]
@@ -616,7 +554,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
             else None
         )
 
-        doc.add_table(data=data, parent=parent, caption=table_caption)
+        if data is not None:
+            doc.add_table(data=data, parent=parent, caption=table_caption)
 
         return
 
@@ -673,7 +612,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
     def _walk_linear(
         self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
     ) -> str:
-        # _log.debug(f"Walking on {node.tag} with {len(list(node))} children")
         skip_tags = ["term"]
         flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
         new_parent: NodeItem = parent
diff --git a/docling/backend/xml/uspto_backend.py b/docling/backend/xml/uspto_backend.py
index 21001ab..cf23e04 100644
--- a/docling/backend/xml/uspto_backend.py
+++ b/docling/backend/xml/uspto_backend.py
@@ -14,7 +14,7 @@ from abc import ABC, abstractmethod
 from enum import Enum, unique
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Final, Optional, Union
+from typing import Final, Optional, Union
 
 from bs4 import BeautifulSoup, Tag
 from docling_core.types.doc import (
@@ -1406,6 +1406,10 @@ class XmlTable:
     http://oasis-open.org/specs/soextblx.dtd
     """
 
+    class ColInfo(TypedDict):
+        ncols: int
+        colinfo: list[dict]
+
     class MinColInfoType(TypedDict):
         offset: list[int]
         colwidth: list[int]
@@ -1425,7 +1429,7 @@ class XmlTable:
         self.empty_text = ""
         self._soup = BeautifulSoup(input, features="xml")
 
-    def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]:
+    def _create_tg_range(self, tgs: list[ColInfo]) -> dict[int, ColInfoType]:
         """Create a unified range along the table groups.
 
         Args:
@@ -1532,19 +1536,26 @@ class XmlTable:
         Returns:
             A docling table object.
         """
-        tgs_align = []
-        tg_secs = table.find_all("tgroup")
+        tgs_align: list[XmlTable.ColInfo] = []
+        tg_secs = table("tgroup")
         if tg_secs:
             for tg_sec in tg_secs:
-                ncols = tg_sec.get("cols", None)
-                if ncols:
-                    ncols = int(ncols)
-                tg_align = {"ncols": ncols, "colinfo": []}
-                cs_secs = tg_sec.find_all("colspec")
+                if not isinstance(tg_sec, Tag):
+                    continue
+                col_val = tg_sec.get("cols")
+                ncols = (
+                    int(col_val)
+                    if isinstance(col_val, str) and col_val.isnumeric()
+                    else 1
+                )
+                tg_align: XmlTable.ColInfo = {"ncols": ncols, "colinfo": []}
+                cs_secs = tg_sec("colspec")
                 if cs_secs:
                     for cs_sec in cs_secs:
-                        colname = cs_sec.get("colname", None)
-                        colwidth = cs_sec.get("colwidth", None)
+                        if not isinstance(cs_sec, Tag):
+                            continue
+                        colname = cs_sec.get("colname")
+                        colwidth = cs_sec.get("colwidth")
                         tg_align["colinfo"].append(
                             {"colname": colname, "colwidth": colwidth}
                         )
@@ -1565,16 +1576,23 @@ class XmlTable:
         table_data: list[TableCell] = []
         i_row_global = 0
         is_row_empty: bool = True
-        tg_secs = table.find_all("tgroup")
+        tg_secs = table("tgroup")
         if tg_secs:
             for itg, tg_sec in enumerate(tg_secs):
+                if not isinstance(tg_sec, Tag):
+                    continue
                 tg_range = tgs_range[itg]
-                row_secs = tg_sec.find_all(["row", "tr"])
+                row_secs = tg_sec(["row", "tr"])
 
                 if row_secs:
                     for row_sec in row_secs:
-                        entry_secs = row_sec.find_all(["entry", "td"])
-                        is_header: bool = row_sec.parent.name in ["thead"]
+                        if not isinstance(row_sec, Tag):
+                            continue
+                        entry_secs = row_sec(["entry", "td"])
+                        is_header: bool = (
+                            row_sec.parent is not None
+                            and row_sec.parent.name == "thead"
+                        )
 
                         ncols = 0
                         local_row: list[TableCell] = []
@@ -1582,23 +1600,26 @@ class XmlTable:
                         if entry_secs:
                             wrong_nbr_cols = False
                             for ientry, entry_sec in enumerate(entry_secs):
+                                if not isinstance(entry_sec, Tag):
+                                    continue
                                 text = entry_sec.get_text().strip()
 
                                 # start-end
-                                namest = entry_sec.attrs.get("namest", None)
-                                nameend = entry_sec.attrs.get("nameend", None)
-                                if isinstance(namest, str) and namest.isnumeric():
-                                    namest = int(namest)
-                                else:
-                                    namest = ientry + 1
+                                namest = entry_sec.get("namest")
+                                nameend = entry_sec.get("nameend")
+                                start = (
+                                    int(namest)
+                                    if isinstance(namest, str) and namest.isnumeric()
+                                    else ientry + 1
+                                )
                                 if isinstance(nameend, str) and nameend.isnumeric():
-                                    nameend = int(nameend)
+                                    end = int(nameend)
                                     shift = 0
                                 else:
-                                    nameend = ientry + 2
+                                    end = ientry + 2
                                     shift = 1
 
-                                if nameend > len(tg_range["cell_offst"]):
+                                if end > len(tg_range["cell_offst"]):
                                     wrong_nbr_cols = True
                                     self.nbr_messages += 1
                                     if self.nbr_messages <= self.max_nbr_messages:
@@ -1608,8 +1629,8 @@ class XmlTable:
                                     break
 
                                 range_ = [
-                                    tg_range["cell_offst"][namest - 1],
-                                    tg_range["cell_offst"][nameend - 1] - shift,
+                                    tg_range["cell_offst"][start - 1],
+                                    tg_range["cell_offst"][end - 1] - shift,
                                 ]
 
                                 # add row and replicate cell if needed
@@ -1668,7 +1689,7 @@ class XmlTable:
             A docling table data.
         """
         section = self._soup.find("table")
-        if section is not None:
+        if isinstance(section, Tag):
             table = self._parse_table(section)
             if table.num_rows == 0 or table.num_cols == 0:
                 _log.warning("The parsed USPTO table is empty")
diff --git a/poetry.lock b/poetry.lock
index f1887d7..329e4ae 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
 
 [[package]]
 name = "aiohappyeyeballs"
@@ -282,17 +282,18 @@ testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-ch
 
 [[package]]
 name = "beautifulsoup4"
-version = "4.12.3"
+version = "4.13.3"
 description = "Screen-scraping library"
 optional = false
-python-versions = ">=3.6.0"
+python-versions = ">=3.7.0"
 files = [
-    {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
-    {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
+    {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"},
+    {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"},
 ]
 
 [package.dependencies]
 soupsieve = ">1.2"
+typing-extensions = ">=4.0.0"
 
 [package.extras]
 cchardet = ["cchardet"]
@@ -866,13 +867,13 @@ files = [
 
 [[package]]
 name = "docling-core"
-version = "2.19.0"
+version = "2.19.1"
 description = "A python library to define and validate data types in Docling."
 optional = false
 python-versions = "<4.0,>=3.9"
 files = [
-    {file = "docling_core-2.19.0-py3-none-any.whl", hash = "sha256:caa1e13d98fa9a00608091c386609c75b3560c7291e842c252f0b6f8d5812dbd"},
-    {file = "docling_core-2.19.0.tar.gz", hash = "sha256:ebf3062e31155bb5f0e6132056a2d239a0e6e693a75c5758886909bb9fef461a"},
+    {file = "docling_core-2.19.1-py3-none-any.whl", hash = "sha256:ca7bd4dacd75611c5ea4f205192b71a8f22205e615eff1a16aac7082644d3b2e"},
+    {file = "docling_core-2.19.1.tar.gz", hash = "sha256:e2769b816c669cdf27024dd3b219d3ecaf2161691dd5e8e5e8ce439557ea0928"},
 ]
 
 [package.dependencies]
@@ -1357,13 +1358,13 @@ colorama = ">=0.4"
 
 [[package]]
 name = "griffe-pydantic"
-version = "1.1.0"
+version = "1.1.2"
 description = "Griffe extension for Pydantic."
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "griffe_pydantic-1.1.0-py3-none-any.whl", hash = "sha256:ac9cc2d9b016cf302d8d9f577c9b3ca2793d88060f500d0b2a65f33a4a785cf1"},
-    {file = "griffe_pydantic-1.1.0.tar.gz", hash = "sha256:9c5a701cc485dab087857c1ac960b44671acee5008aaae0752f610b2aa82b068"},
+    {file = "griffe_pydantic-1.1.2-py3-none-any.whl", hash = "sha256:8ad53218ca6e9c24ccec83588eb435f562b30355f641fe336e81b1e00ea05f3c"},
+    {file = "griffe_pydantic-1.1.2.tar.gz", hash = "sha256:381eacd8854a85811522b4f6dc9a1ef0fb5931825081379d70ff3a425b0d4ea1"},
 ]
 
 [package.dependencies]
@@ -7052,18 +7053,18 @@ vision = ["Pillow (>=10.0.1,<=15.0)"]
 
 [[package]]
 name = "transformers"
-version = "4.48.3"
+version = "4.49.0"
 description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
 optional = false
 python-versions = ">=3.9.0"
 files = [
-    {file = "transformers-4.48.3-py3-none-any.whl", hash = "sha256:78697f990f5ef350c23b46bf86d5081ce96b49479ab180b2de7687267de8fd36"},
-    {file = "transformers-4.48.3.tar.gz", hash = "sha256:a5e8f1e9a6430aa78215836be70cecd3f872d99eeda300f41ad6cc841724afdb"},
+    {file = "transformers-4.49.0-py3-none-any.whl", hash = "sha256:6b4fded1c5fee04d384b1014495b4235a2b53c87503d7d592423c06128cbbe03"},
+    {file = "transformers-4.49.0.tar.gz", hash = "sha256:7e40e640b5b8dc3f48743f5f5adbdce3660c82baafbd3afdfc04143cdbd2089e"},
 ]
 
 [package.dependencies]
 filelock = "*"
-huggingface-hub = ">=0.24.0,<1.0"
+huggingface-hub = ">=0.26.0,<1.0"
 numpy = ">=1.17"
 packaging = ">=20.0"
 pyyaml = ">=5.1"
@@ -7076,13 +7077,13 @@ tqdm = ">=4.27"
 [package.extras]
 accelerate = ["accelerate (>=0.26.0)"]
 agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=2.0)"]
-all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"]
+all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"]
 audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
 benchmark = ["optimum-benchmark (>=0.3.0)"]
 codecarbon = ["codecarbon (>=2.8.1)"]
 deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"]
 deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
-dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
+dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
 dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "urllib3 (<2.0.0)"]
 dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
 flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"]
@@ -7115,8 +7116,8 @@ tokenizers = ["tokenizers (>=0.21,<0.22)"]
 torch = ["accelerate (>=0.26.0)", "torch (>=2.0)"]
 torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
 torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"]
-torchhub = ["filelock", "huggingface-hub (>=0.24.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "tqdm (>=4.27)"]
-video = ["av (==9.2.0)"]
+torchhub = ["filelock", "huggingface-hub (>=0.26.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "tqdm (>=4.27)"]
+video = ["av"]
 vision = ["Pillow (>=10.0.1,<=15.0)"]
 
 [[package]]
@@ -7841,4 +7842,4 @@ vlm = ["transformers", "transformers"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "2cca8bac31dd535e36045cf2f5f0380852c34f6bafad78834144d6ca56d2d79c"
+content-hash = "63f9271160d39cac74fa3fc959dbb0f91530d76a693c69d81ced006477d04315"
diff --git a/pyproject.toml b/pyproject.toml
index 6b61da8..0c04acf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,7 +46,7 @@ scipy = [
 typer = "^0.12.5"
 python-docx = "^1.1.2"
 python-pptx = "^1.0.2"
-beautifulsoup4 = ">=4.12.3,<4.13.0"
+beautifulsoup4 = "^4.12.3"
 pandas = "^2.1.4"
 marko = "^2.1.2"
 openpyxl = "^3.1.5"
@@ -166,7 +166,6 @@ module = [
     "ocrmac.*",
     "deepsearch_glm.*",
     "lxml.*",
-    "bs4.*",
     "huggingface_hub.*",
     "transformers.*",
 ]
diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt b/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
index 2d4a316..3ae39e8 100644
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
@@ -410,68 +410,65 @@ item-0 at level 0: unspecified: group _root_
       item-396 at level 3: list: group list
         item-397 at level 4: list_item: list of books (useful looking abstracts)
         item-398 at level 4: list_item: Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine
-        item-399 at level 4: list_item: 
-        item-400 at level 4: list_item: Ducks at a Distance, by Rob Hine ... uide to identification of US waterfowl
-      item-401 at level 3: table with [3x2]
-      item-402 at level 3: picture
-      item-403 at level 3: list: group list
-        item-404 at level 4: list_item: Ducks
-        item-405 at level 4: list_item: Game birds
-        item-406 at level 4: list_item: Bird common names
-      item-407 at level 3: list: group list
-        item-408 at level 4: list_item: All accuracy disputes
-        item-409 at level 4: list_item: Accuracy disputes from February 2020
-        item-410 at level 4: list_item: CS1 Finnish-language sources (fi)
-        item-411 at level 4: list_item: CS1 Latvian-language sources (lv)
-        item-412 at level 4: list_item: CS1 Swedish-language sources (sv)
-        item-413 at level 4: list_item: Articles with short description
-        item-414 at level 4: list_item: Short description is different from Wikidata
-        item-415 at level 4: list_item: Wikipedia indefinitely move-protected pages
-        item-416 at level 4: list_item: Wikipedia indefinitely semi-protected pages
-        item-417 at level 4: list_item: Articles with 'species' microformats
-        item-418 at level 4: list_item: Articles containing Old English (ca. 450-1100)-language text
-        item-419 at level 4: list_item: Articles containing Dutch-language text
-        item-420 at level 4: list_item: Articles containing German-language text
-        item-421 at level 4: list_item: Articles containing Norwegian-language text
-        item-422 at level 4: list_item: Articles containing Lithuanian-language text
-        item-423 at level 4: list_item: Articles containing Ancient Greek (to 1453)-language text
-        item-424 at level 4: list_item: All articles with self-published sources
-        item-425 at level 4: list_item: Articles with self-published sources from February 2020
-        item-426 at level 4: list_item: All articles with unsourced statements
-        item-427 at level 4: list_item: Articles with unsourced statements from January 2022
-        item-428 at level 4: list_item: CS1: long volume value
-        item-429 at level 4: list_item: Pages using Sister project links with wikidata mismatch
-        item-430 at level 4: list_item: Pages using Sister project links with hidden wikidata
-        item-431 at level 4: list_item: Webarchive template wayback links
-        item-432 at level 4: list_item: Articles with Project Gutenberg links
-        item-433 at level 4: list_item: Articles containing video clips
-      item-434 at level 3: list: group list
-        item-435 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC).
-        item-436 at level 4: list_item: Text is available under the Crea ... tion, Inc., a non-profit organization.
-      item-437 at level 3: list: group list
-        item-438 at level 4: list_item: Privacy policy
-        item-439 at level 4: list_item: About Wikipedia
-        item-440 at level 4: list_item: Disclaimers
-        item-441 at level 4: list_item: Contact Wikipedia
-        item-442 at level 4: list_item: Code of Conduct
-        item-443 at level 4: list_item: Developers
-        item-444 at level 4: list_item: Statistics
-        item-445 at level 4: list_item: Cookie statement
-        item-446 at level 4: list_item: Mobile view
+        item-399 at level 4: list_item: Ducks at a Distance, by Rob Hine ... uide to identification of US waterfowl
+      item-400 at level 3: table with [3x2]
+      item-401 at level 3: picture
+      item-402 at level 3: list: group list
+        item-403 at level 4: list_item: Ducks
+        item-404 at level 4: list_item: Game birds
+        item-405 at level 4: list_item: Bird common names
+      item-406 at level 3: list: group list
+        item-407 at level 4: list_item: All accuracy disputes
+        item-408 at level 4: list_item: Accuracy disputes from February 2020
+        item-409 at level 4: list_item: CS1 Finnish-language sources (fi)
+        item-410 at level 4: list_item: CS1 Latvian-language sources (lv)
+        item-411 at level 4: list_item: CS1 Swedish-language sources (sv)
+        item-412 at level 4: list_item: Articles with short description
+        item-413 at level 4: list_item: Short description is different from Wikidata
+        item-414 at level 4: list_item: Wikipedia indefinitely move-protected pages
+        item-415 at level 4: list_item: Wikipedia indefinitely semi-protected pages
+        item-416 at level 4: list_item: Articles with 'species' microformats
+        item-417 at level 4: list_item: Articles containing Old English (ca. 450-1100)-language text
+        item-418 at level 4: list_item: Articles containing Dutch-language text
+        item-419 at level 4: list_item: Articles containing German-language text
+        item-420 at level 4: list_item: Articles containing Norwegian-language text
+        item-421 at level 4: list_item: Articles containing Lithuanian-language text
+        item-422 at level 4: list_item: Articles containing Ancient Greek (to 1453)-language text
+        item-423 at level 4: list_item: All articles with self-published sources
+        item-424 at level 4: list_item: Articles with self-published sources from February 2020
+        item-425 at level 4: list_item: All articles with unsourced statements
+        item-426 at level 4: list_item: Articles with unsourced statements from January 2022
+        item-427 at level 4: list_item: CS1: long volume value
+        item-428 at level 4: list_item: Pages using Sister project links with wikidata mismatch
+        item-429 at level 4: list_item: Pages using Sister project links with hidden wikidata
+        item-430 at level 4: list_item: Webarchive template wayback links
+        item-431 at level 4: list_item: Articles with Project Gutenberg links
+        item-432 at level 4: list_item: Articles containing video clips
+      item-433 at level 3: list: group list
+        item-434 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC).
+        item-435 at level 4: list_item: Text is available under the Crea ... tion, Inc., a non-profit organization.
+      item-436 at level 3: list: group list
+        item-437 at level 4: list_item: Privacy policy
+        item-438 at level 4: list_item: About Wikipedia
+        item-439 at level 4: list_item: Disclaimers
+        item-440 at level 4: list_item: Contact Wikipedia
+        item-441 at level 4: list_item: Code of Conduct
+        item-442 at level 4: list_item: Developers
+        item-443 at level 4: list_item: Statistics
+        item-444 at level 4: list_item: Cookie statement
+        item-445 at level 4: list_item: Mobile view
+      item-446 at level 3: list: group list
       item-447 at level 3: list: group list
-        item-448 at level 4: list_item: 
-        item-449 at level 4: list_item: 
-      item-450 at level 3: list: group list
-  item-451 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
-  item-452 at level 1: caption: Male mallard.
-  item-453 at level 1: caption: Wood ducks.
-  item-454 at level 1: caption: Mallard landing in approach
-  item-455 at level 1: caption: Male Mandarin duck
-  item-456 at level 1: caption: Flying steamer ducks in Ushuaia, Argentina
-  item-457 at level 1: caption: Female mallard in Cornwall, England
-  item-458 at level 1: caption: Pecten along the bill
-  item-459 at level 1: caption: Mallard duckling preening
-  item-460 at level 1: caption: A Muscovy duckling
-  item-461 at level 1: caption: Ringed teal
-  item-462 at level 1: caption: Indian Runner ducks, a common breed of domestic ducks
-  item-463 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
\ No newline at end of file
+  item-448 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
+  item-449 at level 1: caption: Male mallard.
+  item-450 at level 1: caption: Wood ducks.
+  item-451 at level 1: caption: Mallard landing in approach
+  item-452 at level 1: caption: Male Mandarin duck
+  item-453 at level 1: caption: Flying steamer ducks in Ushuaia, Argentina
+  item-454 at level 1: caption: Female mallard in Cornwall, England
+  item-455 at level 1: caption: Pecten along the bill
+  item-456 at level 1: caption: Mallard duckling preening
+  item-457 at level 1: caption: A Muscovy duckling
+  item-458 at level 1: caption: Ringed teal
+  item-459 at level 1: caption: Indian Runner ducks, a common breed of domestic ducks
+  item-460 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.json b/tests/data/groundtruth/docling_v2/wiki_duck.html.json
index 196c903..e59c18f 100644
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.json
@@ -1413,9 +1413,6 @@
         },
         {
           "$ref": "#/texts/350"
-        },
-        {
-          "$ref": "#/texts/351"
         }
       ],
       "content_layer": "body",
@@ -1428,14 +1425,14 @@
         "$ref": "#/texts/341"
       },
       "children": [
+        {
+          "$ref": "#/texts/351"
+        },
         {
           "$ref": "#/texts/352"
         },
         {
           "$ref": "#/texts/353"
-        },
-        {
-          "$ref": "#/texts/354"
         }
       ],
       "content_layer": "body",
@@ -1448,6 +1445,9 @@
         "$ref": "#/texts/341"
       },
       "children": [
+        {
+          "$ref": "#/texts/354"
+        },
         {
           "$ref": "#/texts/355"
         },
@@ -1522,9 +1522,6 @@
         },
         {
           "$ref": "#/texts/379"
-        },
-        {
-          "$ref": "#/texts/380"
         }
       ],
       "content_layer": "body",
@@ -1538,10 +1535,10 @@
       },
       "children": [
         {
-          "$ref": "#/texts/381"
+          "$ref": "#/texts/380"
         },
         {
-          "$ref": "#/texts/382"
+          "$ref": "#/texts/381"
         }
       ],
       "content_layer": "body",
@@ -1554,6 +1551,9 @@
         "$ref": "#/texts/341"
       },
       "children": [
+        {
+          "$ref": "#/texts/382"
+        },
         {
           "$ref": "#/texts/383"
         },
@@ -1577,9 +1577,6 @@
         },
         {
           "$ref": "#/texts/390"
-        },
-        {
-          "$ref": "#/texts/391"
         }
       ],
       "content_layer": "body",
@@ -1591,14 +1588,7 @@
       "parent": {
         "$ref": "#/texts/341"
       },
-      "children": [
-        {
-          "$ref": "#/texts/392"
-        },
-        {
-          "$ref": "#/texts/393"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
       "name": "list",
       "label": "list"
@@ -6774,27 +6764,13 @@
       "content_layer": "body",
       "label": "list_item",
       "prov": [],
-      "orig": "",
-      "text": "",
-      "enumerated": false,
-      "marker": "-"
-    },
-    {
-      "self_ref": "#/texts/351",
-      "parent": {
-        "$ref": "#/groups/42"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
       "orig": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl",
       "text": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl",
       "enumerated": false,
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/352",
+      "self_ref": "#/texts/351",
       "parent": {
         "$ref": "#/groups/43"
       },
@@ -6808,7 +6784,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/353",
+      "self_ref": "#/texts/352",
       "parent": {
         "$ref": "#/groups/43"
       },
@@ -6822,7 +6798,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/354",
+      "self_ref": "#/texts/353",
       "parent": {
         "$ref": "#/groups/43"
       },
@@ -6836,7 +6812,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/355",
+      "self_ref": "#/texts/354",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -6850,7 +6826,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/356",
+      "self_ref": "#/texts/355",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -6864,7 +6840,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/357",
+      "self_ref": "#/texts/356",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -6878,7 +6854,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/358",
+      "self_ref": "#/texts/357",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -6892,7 +6868,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/359",
+      "self_ref": "#/texts/358",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -6906,7 +6882,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/360",
+      "self_ref": "#/texts/359",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -6920,7 +6896,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/361",
+      "self_ref": "#/texts/360",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -6934,7 +6910,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/362",
+      "self_ref": "#/texts/361",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -6948,7 +6924,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/363",
+      "self_ref": "#/texts/362",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -6962,7 +6938,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/364",
+      "self_ref": "#/texts/363",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -6976,7 +6952,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/365",
+      "self_ref": "#/texts/364",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -6990,7 +6966,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/366",
+      "self_ref": "#/texts/365",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -7004,7 +6980,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/367",
+      "self_ref": "#/texts/366",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -7018,7 +6994,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/368",
+      "self_ref": "#/texts/367",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -7032,7 +7008,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/369",
+      "self_ref": "#/texts/368",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -7046,7 +7022,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/370",
+      "self_ref": "#/texts/369",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -7060,7 +7036,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/371",
+      "self_ref": "#/texts/370",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -7074,7 +7050,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/372",
+      "self_ref": "#/texts/371",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -7088,7 +7064,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/373",
+      "self_ref": "#/texts/372",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -7102,7 +7078,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/374",
+      "self_ref": "#/texts/373",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -7116,7 +7092,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/375",
+      "self_ref": "#/texts/374",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -7130,7 +7106,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/376",
+      "self_ref": "#/texts/375",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -7144,7 +7120,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/377",
+      "self_ref": "#/texts/376",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -7158,7 +7134,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/378",
+      "self_ref": "#/texts/377",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -7172,7 +7148,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/379",
+      "self_ref": "#/texts/378",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -7186,7 +7162,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/380",
+      "self_ref": "#/texts/379",
       "parent": {
         "$ref": "#/groups/44"
       },
@@ -7200,7 +7176,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/381",
+      "self_ref": "#/texts/380",
       "parent": {
         "$ref": "#/groups/45"
       },
@@ -7214,7 +7190,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/382",
+      "self_ref": "#/texts/381",
       "parent": {
         "$ref": "#/groups/45"
       },
@@ -7228,7 +7204,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/383",
+      "self_ref": "#/texts/382",
       "parent": {
         "$ref": "#/groups/46"
       },
@@ -7242,7 +7218,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/384",
+      "self_ref": "#/texts/383",
       "parent": {
         "$ref": "#/groups/46"
       },
@@ -7256,7 +7232,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/385",
+      "self_ref": "#/texts/384",
       "parent": {
         "$ref": "#/groups/46"
       },
@@ -7270,7 +7246,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/386",
+      "self_ref": "#/texts/385",
       "parent": {
         "$ref": "#/groups/46"
       },
@@ -7284,7 +7260,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/387",
+      "self_ref": "#/texts/386",
       "parent": {
         "$ref": "#/groups/46"
       },
@@ -7298,7 +7274,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/388",
+      "self_ref": "#/texts/387",
       "parent": {
         "$ref": "#/groups/46"
       },
@@ -7312,7 +7288,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/389",
+      "self_ref": "#/texts/388",
       "parent": {
         "$ref": "#/groups/46"
       },
@@ -7326,7 +7302,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/390",
+      "self_ref": "#/texts/389",
       "parent": {
         "$ref": "#/groups/46"
       },
@@ -7340,7 +7316,7 @@
       "marker": "-"
     },
     {
-      "self_ref": "#/texts/391",
+      "self_ref": "#/texts/390",
       "parent": {
         "$ref": "#/groups/46"
       },
@@ -7352,34 +7328,6 @@
       "text": "Mobile view",
       "enumerated": false,
       "marker": "-"
-    },
-    {
-      "self_ref": "#/texts/392",
-      "parent": {
-        "$ref": "#/groups/47"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
-      "orig": "",
-      "text": "",
-      "enumerated": false,
-      "marker": "-"
-    },
-    {
-      "self_ref": "#/texts/393",
-      "parent": {
-        "$ref": "#/groups/47"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
-      "orig": "",
-      "text": "",
-      "enumerated": false,
-      "marker": "-"
     }
   ],
   "pictures": [
diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.md b/tests/data/groundtruth/docling_v2/wiki_duck.html.md
index df4554f..bd3f3c3 100644
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.md
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.md
@@ -473,7 +473,6 @@ The 1992 Disney film The Mighty Ducks, starring Emilio Estevez, chose the duck a
 
 - list of books (useful looking abstracts)
 - Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine
-- 
 - Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl
 
 | Authority control databases    | Authority control databases                  |
@@ -526,7 +525,4 @@ additional terms may apply. By using this site, you agree to the Terms of Use an
 - Developers
 - Statistics
 - Cookie statement
-- Mobile view
-
-- 
--
\ No newline at end of file
+- Mobile view
\ No newline at end of file