From 7450050acea99d81f7ef0cef2725573658099e54 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Date: Tue, 18 Feb 2025 11:30:47 +0100
Subject: [PATCH] refactor: upgrade BeautifulSoup4 with type hints (#999)
* refactor: upgrade BeautifulSoup4 with type hints
Upgrade dependency library BeautifulSoup4 to 4.13.3 (with type hints).
Refactor backends using BeautifulSoup4 to comply with type hints.
Apply style simplifications and improvements for consistency.
Remove variables and functions that are never used.
Remove code duplication between backends for parsing HTML tables.
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
* build: allow beautifulsoup4 version 4.12.3
Allow older version of beautifulsoup4 and ensure compatibility.
Update library dependencies.
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
---------
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
---
docling/backend/html_backend.py | 273 +++++++++---------
docling/backend/xml/jats_backend.py | 74 +----
docling/backend/xml/uspto_backend.py | 75 +++--
poetry.lock | 41 +--
pyproject.toml | 3 +-
.../docling_v2/wiki_duck.html.itxt | 125 ++++----
.../docling_v2/wiki_duck.html.json | 156 ++++------
.../groundtruth/docling_v2/wiki_duck.html.md | 6 +-
8 files changed, 328 insertions(+), 425 deletions(-)
diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 286dfbf..234e5da 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -1,9 +1,9 @@
import logging
from io import BytesIO
from pathlib import Path
-from typing import Optional, Set, Union
+from typing import Optional, Union, cast
-from bs4 import BeautifulSoup, Tag
+from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
@@ -12,6 +12,7 @@ from docling_core.types.doc import (
TableCell,
TableData,
)
+from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
@@ -21,6 +22,7 @@ _log = logging.getLogger(__name__)
class HTMLDocumentBackend(DeclarativeDocumentBackend):
+ @override
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
_log.debug("About to init HTML backend...")
@@ -48,13 +50,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
f"Could not initialize HTML backend for file with hash {self.document_hash}."
) from e
+ @override
def is_valid(self) -> bool:
return self.soup is not None
@classmethod
+ @override
def supports_pagination(cls) -> bool:
return False
+ @override
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
@@ -62,9 +67,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.path_or_stream = None
@classmethod
- def supported_formats(cls) -> Set[InputFormat]:
+ @override
+ def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.HTML}
+ @override
def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff
origin = DocumentOrigin(
@@ -80,98 +87,78 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
assert self.soup is not None
content = self.soup.body or self.soup
# Replace <br> tags with newline characters
- for br in content.find_all("br"):
- br.replace_with("\n")
- doc = self.walk(content, doc)
+ for br in content("br"):
+ br.replace_with(NavigableString("\n"))
+ self.walk(content, doc)
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
)
return doc
- def walk(self, element: Tag, doc: DoclingDocument):
- try:
- # Iterate over elements in the body of the document
- for idx, element in enumerate(element.children):
+ def walk(self, tag: Tag, doc: DoclingDocument) -> None:
+ # Iterate over elements in the body of the document
+ for element in tag.children:
+ if isinstance(element, Tag):
try:
- self.analyse_element(element, idx, doc)
+ self.analyze_tag(cast(Tag, element), doc)
except Exception as exc_child:
-
- _log.error(" -> error treating child: ", exc_child)
- _log.error(" => element: ", element, "\n")
+ _log.error(
+ f"Error processing child from tag{tag.name}: {exc_child}"
+ )
raise exc_child
- except Exception as exc:
- pass
+ return
- return doc
-
- def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
- """
- if element.name!=None:
- _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
- """
-
- if element.name in self.labels:
- self.labels[element.name] += 1
+ def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
+ if tag.name in self.labels:
+ self.labels[tag.name] += 1
else:
- self.labels[element.name] = 1
+ self.labels[tag.name] = 1
- if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
- self.handle_header(element, idx, doc)
- elif element.name in ["p"]:
- self.handle_paragraph(element, idx, doc)
- elif element.name in ["pre"]:
- self.handle_code(element, idx, doc)
- elif element.name in ["ul", "ol"]:
- self.handle_list(element, idx, doc)
- elif element.name in ["li"]:
- self.handle_listitem(element, idx, doc)
- elif element.name == "table":
- self.handle_table(element, idx, doc)
- elif element.name == "figure":
- self.handle_figure(element, idx, doc)
- elif element.name == "img":
- self.handle_image(element, idx, doc)
+ if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
+ self.handle_header(tag, doc)
+ elif tag.name in ["p"]:
+ self.handle_paragraph(tag, doc)
+ elif tag.name in ["pre"]:
+ self.handle_code(tag, doc)
+ elif tag.name in ["ul", "ol"]:
+ self.handle_list(tag, doc)
+ elif tag.name in ["li"]:
+ self.handle_list_item(tag, doc)
+ elif tag.name == "table":
+ self.handle_table(tag, doc)
+ elif tag.name == "figure":
+ self.handle_figure(tag, doc)
+ elif tag.name == "img":
+ self.handle_image(doc)
else:
- self.walk(element, doc)
+ self.walk(tag, doc)
- def get_direct_text(self, item: Tag):
- """Get the direct text of the <li> element (ignoring nested lists)."""
- text = item.find(string=True, recursive=False)
- if isinstance(text, str):
- return text.strip()
+ def get_text(self, item: PageElement) -> str:
+ """Get the text content of a tag."""
+ parts: list[str] = self.extract_text_recursively(item)
- return ""
+ return "".join(parts) + " "
# Function to recursively extract text from all child nodes
- def extract_text_recursively(self, item: Tag):
- result = []
+ def extract_text_recursively(self, item: PageElement) -> list[str]:
+ result: list[str] = []
- if isinstance(item, str):
+ if isinstance(item, NavigableString):
return [item]
- if item.name not in ["ul", "ol"]:
- try:
- # Iterate over the children (and their text and tails)
- for child in item:
- try:
- # Recursively get the child's text content
- result.extend(self.extract_text_recursively(child))
- except:
- pass
- except:
- _log.warn("item has no children")
- pass
+ tag = cast(Tag, item)
+ if tag.name not in ["ul", "ol"]:
+ for child in tag:
+ # Recursively get the child's text content
+ result.extend(self.extract_text_recursively(child))
- return "".join(result) + " "
+ return ["".join(result) + " "]
- def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
+ def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles header tags (h1, h2, etc.)."""
hlevel = int(element.name.replace("h", ""))
- slevel = hlevel - 1
-
- label = DocItemLabel.SECTION_HEADER
text = element.text.strip()
if hlevel == 1:
@@ -197,7 +184,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
elif hlevel < self.level:
# remove the tail
- for key, val in self.parents.items():
+ for key in self.parents.keys():
if key > hlevel:
self.parents[key] = None
self.level = hlevel
@@ -208,27 +195,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
level=hlevel,
)
- def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
+ def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles monospace code snippets (pre)."""
if element.text is None:
return
text = element.text.strip()
- label = DocItemLabel.CODE
- if len(text) == 0:
- return
- doc.add_code(parent=self.parents[self.level], text=text)
+ if text:
+ doc.add_code(parent=self.parents[self.level], text=text)
- def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
+ def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles paragraph tags (p)."""
if element.text is None:
return
text = element.text.strip()
label = DocItemLabel.PARAGRAPH
- if len(text) == 0:
- return
- doc.add_text(parent=self.parents[self.level], label=label, text=text)
+ if text:
+ doc.add_text(parent=self.parents[self.level], label=label, text=text)
- def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
+ def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles list tags (ul, ol) and their list items."""
if element.name == "ul":
@@ -250,18 +234,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.parents[self.level + 1] = None
self.level -= 1
- def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
+ def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles listitem tags (li)."""
- nested_lists = element.find(["ul", "ol"])
+ nested_list = element.find(["ul", "ol"])
parent_list_label = self.parents[self.level].label
index_in_list = len(self.parents[self.level].children) + 1
- if nested_lists:
- name = element.name
+ if nested_list:
# Text in list item can be hidden within hierarchy, hence
# we need to extract it recursively
- text = self.extract_text_recursively(element)
+ text: str = self.get_text(element)
# Flatten text, remove break lines:
text = text.replace("\n", "").replace("\r", "")
text = " ".join(text.split()).strip()
@@ -287,7 +270,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.parents[self.level + 1] = None
self.level -= 1
- elif isinstance(element.text, str):
+ elif element.text.strip():
text = element.text.strip()
marker = ""
@@ -302,59 +285,79 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[self.level],
)
else:
- _log.warn("list-item has no text: ", element)
-
- def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
- """Handles table tags."""
+ _log.warning(f"list-item has no text: {element}")
+ @staticmethod
+ def parse_table_data(element: Tag) -> Optional[TableData]:
nested_tables = element.find("table")
if nested_tables is not None:
- _log.warn("detected nested tables: skipping for now")
- return
+ _log.warning("Skipping nested table.")
+ return None
# Count the number of rows (number of <tr> elements)
- num_rows = len(element.find_all("tr"))
+ num_rows = len(element("tr"))
# Find the number of columns (taking into account colspan)
num_cols = 0
- for row in element.find_all("tr"):
+ for row in element("tr"):
col_count = 0
- for cell in row.find_all(["td", "th"]):
- colspan = int(cell.get("colspan", 1))
+ if not isinstance(row, Tag):
+ continue
+ for cell in row(["td", "th"]):
+ if not isinstance(row, Tag):
+ continue
+ val = cast(Tag, cell).get("colspan", "1")
+ colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
col_count += colspan
num_cols = max(num_cols, col_count)
- grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
+ grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
# Iterate over the rows in the table
- for row_idx, row in enumerate(element.find_all("tr")):
+ for row_idx, row in enumerate(element("tr")):
+ if not isinstance(row, Tag):
+ continue
# For each row, find all the column cells (both <td> and <th>)
- cells = row.find_all(["td", "th"])
+ cells = row(["td", "th"])
# Check if each cell in the row is a header -> means it is a column header
col_header = True
- for j, html_cell in enumerate(cells):
- if html_cell.name == "td":
+ for html_cell in cells:
+ if isinstance(html_cell, Tag) and html_cell.name == "td":
col_header = False
+ # Extract the text content of each cell
col_idx = 0
- # Extract and print the text content of each cell
- for _, html_cell in enumerate(cells):
+ for html_cell in cells:
+ if not isinstance(html_cell, Tag):
+ continue
+ # extract inline formulas
+ for formula in html_cell("inline-formula"):
+ math_parts = formula.text.split("$$")
+ if len(math_parts) == 3:
+ math_formula = f"$${math_parts[1]}$$"
+ formula.replace_with(NavigableString(math_formula))
+
+ # TODO: extract content correctly from table-cells with lists
text = html_cell.text
- try:
- text = self.extract_table_cell_text(html_cell)
- except Exception as exc:
- _log.warn("exception: ", exc)
- exit(-1)
# label = html_cell.name
-
- col_span = int(html_cell.get("colspan", 1))
- row_span = int(html_cell.get("rowspan", 1))
+ col_val = html_cell.get("colspan", "1")
+ col_span = (
+ int(col_val)
+ if isinstance(col_val, str) and col_val.isnumeric()
+ else 1
+ )
+ row_val = html_cell.get("rowspan", "1")
+ row_span = (
+ int(row_val)
+ if isinstance(row_val, str) and row_val.isnumeric()
+ else 1
+ )
while grid[row_idx][col_idx] is not None:
col_idx += 1
@@ -362,7 +365,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
for c in range(col_span):
grid[row_idx + r][col_idx + c] = text
- cell = TableCell(
+ table_cell = TableCell(
text=text,
row_span=row_span,
col_span=col_span,
@@ -373,57 +376,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
col_header=col_header,
row_header=((not col_header) and html_cell.name == "th"),
)
- data.table_cells.append(cell)
+ data.table_cells.append(table_cell)
- doc.add_table(data=data, parent=self.parents[self.level])
+ return data
- def get_list_text(self, list_element: Tag, level=0):
+ def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
+ """Handles table tags."""
+
+ table_data = HTMLDocumentBackend.parse_table_data(element)
+
+ if table_data is not None:
+ doc.add_table(data=table_data, parent=self.parents[self.level])
+
+ def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
"""Recursively extract text from <ul> or <ol> with proper indentation."""
result = []
bullet_char = "*" # Default bullet character for unordered lists
if list_element.name == "ol": # For ordered lists, use numbers
- for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
+ for i, li in enumerate(list_element("li", recursive=False), 1):
+ if not isinstance(li, Tag):
+ continue
# Add numbering for ordered lists
result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
# Handle nested lists
nested_list = li.find(["ul", "ol"])
- if nested_list:
+ if isinstance(nested_list, Tag):
result.extend(self.get_list_text(nested_list, level + 1))
elif list_element.name == "ul": # For unordered lists, use bullet points
- for li in list_element.find_all("li", recursive=False):
+ for li in list_element("li", recursive=False):
+ if not isinstance(li, Tag):
+ continue
# Add bullet points for unordered lists
result.append(
f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
)
# Handle nested lists
nested_list = li.find(["ul", "ol"])
- if nested_list:
+ if isinstance(nested_list, Tag):
result.extend(self.get_list_text(nested_list, level + 1))
return result
- def extract_table_cell_text(self, cell: Tag):
- """Extract text from a table cell, including lists with indents."""
- contains_lists = cell.find(["ul", "ol"])
- if contains_lists is None:
- return cell.text
- else:
- _log.debug(
- "should extract the content correctly for table-cells with lists ..."
- )
- return cell.text
-
- def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
+ def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles image tags (img)."""
# Extract the image URI from the tag
# image_uri = root.xpath('//figure//img/@src')[0]
contains_captions = element.find(["figcaption"])
- if contains_captions is None:
+ if not isinstance(contains_captions, Tag):
doc.add_picture(parent=self.parents[self.level], caption=None)
-
else:
texts = []
for item in contains_captions:
@@ -437,6 +440,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
caption=fig_caption,
)
- def handle_image(self, element: Tag, idx, doc: DoclingDocument):
+ def handle_image(self, doc: DoclingDocument) -> None:
"""Handles image tags (img)."""
doc.add_picture(parent=self.parents[self.level], caption=None)
diff --git a/docling/backend/xml/jats_backend.py b/docling/backend/xml/jats_backend.py
index 1d7091c..2409961 100755
--- a/docling/backend/xml/jats_backend.py
+++ b/docling/backend/xml/jats_backend.py
@@ -4,7 +4,7 @@ from io import BytesIO
from pathlib import Path
from typing import Final, Optional, Union
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
@@ -12,14 +12,13 @@ from docling_core.types.doc import (
GroupItem,
GroupLabel,
NodeItem,
- TableCell,
- TableData,
TextItem,
)
from lxml import etree
from typing_extensions import TypedDict, override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
@@ -540,71 +539,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
) -> None:
soup = BeautifulSoup(table_xml_component["content"], "html.parser")
table_tag = soup.find("table")
-
- nested_tables = table_tag.find("table")
- if nested_tables:
- _log.warning(f"Skipping nested table in {str(self.file)}")
+ if not isinstance(table_tag, Tag):
return
- # Count the number of rows (number of <tr> elements)
- num_rows = len(table_tag.find_all("tr"))
-
- # Find the number of columns (taking into account colspan)
- num_cols = 0
- for row in table_tag.find_all("tr"):
- col_count = 0
- for cell in row.find_all(["td", "th"]):
- colspan = int(cell.get("colspan", 1))
- col_count += colspan
- num_cols = max(num_cols, col_count)
-
- grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
-
- data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
-
- # Iterate over the rows in the table
- for row_idx, row in enumerate(table_tag.find_all("tr")):
- # For each row, find all the column cells (both <td> and <th>)
- cells = row.find_all(["td", "th"])
-
- # Check if each cell in the row is a header -> means it is a column header
- col_header = True
- for j, html_cell in enumerate(cells):
- if html_cell.name == "td":
- col_header = False
-
- # Extract and print the text content of each cell
- col_idx = 0
- for _, html_cell in enumerate(cells):
- # extract inline formulas
- for formula in html_cell.find_all("inline-formula"):
- math_parts = formula.text.split("$$")
- if len(math_parts) == 3:
- math_formula = f"$${math_parts[1]}$$"
- formula.replaceWith(math_formula)
- text = html_cell.text
-
- col_span = int(html_cell.get("colspan", 1))
- row_span = int(html_cell.get("rowspan", 1))
-
- while grid[row_idx][col_idx] is not None:
- col_idx += 1
- for r in range(row_span):
- for c in range(col_span):
- grid[row_idx + r][col_idx + c] = text
-
- cell = TableCell(
- text=text,
- row_span=row_span,
- col_span=col_span,
- start_row_offset_idx=row_idx,
- end_row_offset_idx=row_idx + row_span,
- start_col_offset_idx=col_idx,
- end_col_offset_idx=col_idx + col_span,
- col_header=col_header,
- row_header=((not col_header) and html_cell.name == "th"),
- )
- data.table_cells.append(cell)
+ data = HTMLDocumentBackend.parse_table_data(table_tag)
# TODO: format label vs caption once styling is supported
label = table_xml_component["label"]
@@ -616,7 +554,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
else None
)
- doc.add_table(data=data, parent=parent, caption=table_caption)
+ if data is not None:
+ doc.add_table(data=data, parent=parent, caption=table_caption)
return
@@ -673,7 +612,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
def _walk_linear(
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> str:
- # _log.debug(f"Walking on {node.tag} with {len(list(node))} children")
skip_tags = ["term"]
flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
new_parent: NodeItem = parent
diff --git a/docling/backend/xml/uspto_backend.py b/docling/backend/xml/uspto_backend.py
index 21001ab..cf23e04 100644
--- a/docling/backend/xml/uspto_backend.py
+++ b/docling/backend/xml/uspto_backend.py
@@ -14,7 +14,7 @@ from abc import ABC, abstractmethod
from enum import Enum, unique
from io import BytesIO
from pathlib import Path
-from typing import Any, Final, Optional, Union
+from typing import Final, Optional, Union
from bs4 import BeautifulSoup, Tag
from docling_core.types.doc import (
@@ -1406,6 +1406,10 @@ class XmlTable:
http://oasis-open.org/specs/soextblx.dtd
"""
+ class ColInfo(TypedDict):
+ ncols: int
+ colinfo: list[dict]
+
class MinColInfoType(TypedDict):
offset: list[int]
colwidth: list[int]
@@ -1425,7 +1429,7 @@ class XmlTable:
self.empty_text = ""
self._soup = BeautifulSoup(input, features="xml")
- def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]:
+ def _create_tg_range(self, tgs: list[ColInfo]) -> dict[int, ColInfoType]:
"""Create a unified range along the table groups.
Args:
@@ -1532,19 +1536,26 @@ class XmlTable:
Returns:
A docling table object.
"""
- tgs_align = []
- tg_secs = table.find_all("tgroup")
+ tgs_align: list[XmlTable.ColInfo] = []
+ tg_secs = table("tgroup")
if tg_secs:
for tg_sec in tg_secs:
- ncols = tg_sec.get("cols", None)
- if ncols:
- ncols = int(ncols)
- tg_align = {"ncols": ncols, "colinfo": []}
- cs_secs = tg_sec.find_all("colspec")
+ if not isinstance(tg_sec, Tag):
+ continue
+ col_val = tg_sec.get("cols")
+ ncols = (
+ int(col_val)
+ if isinstance(col_val, str) and col_val.isnumeric()
+ else 1
+ )
+ tg_align: XmlTable.ColInfo = {"ncols": ncols, "colinfo": []}
+ cs_secs = tg_sec("colspec")
if cs_secs:
for cs_sec in cs_secs:
- colname = cs_sec.get("colname", None)
- colwidth = cs_sec.get("colwidth", None)
+ if not isinstance(cs_sec, Tag):
+ continue
+ colname = cs_sec.get("colname")
+ colwidth = cs_sec.get("colwidth")
tg_align["colinfo"].append(
{"colname": colname, "colwidth": colwidth}
)
@@ -1565,16 +1576,23 @@ class XmlTable:
table_data: list[TableCell] = []
i_row_global = 0
is_row_empty: bool = True
- tg_secs = table.find_all("tgroup")
+ tg_secs = table("tgroup")
if tg_secs:
for itg, tg_sec in enumerate(tg_secs):
+ if not isinstance(tg_sec, Tag):
+ continue
tg_range = tgs_range[itg]
- row_secs = tg_sec.find_all(["row", "tr"])
+ row_secs = tg_sec(["row", "tr"])
if row_secs:
for row_sec in row_secs:
- entry_secs = row_sec.find_all(["entry", "td"])
- is_header: bool = row_sec.parent.name in ["thead"]
+ if not isinstance(row_sec, Tag):
+ continue
+ entry_secs = row_sec(["entry", "td"])
+ is_header: bool = (
+ row_sec.parent is not None
+ and row_sec.parent.name == "thead"
+ )
ncols = 0
local_row: list[TableCell] = []
@@ -1582,23 +1600,26 @@ class XmlTable:
if entry_secs:
wrong_nbr_cols = False
for ientry, entry_sec in enumerate(entry_secs):
+ if not isinstance(entry_sec, Tag):
+ continue
text = entry_sec.get_text().strip()
# start-end
- namest = entry_sec.attrs.get("namest", None)
- nameend = entry_sec.attrs.get("nameend", None)
- if isinstance(namest, str) and namest.isnumeric():
- namest = int(namest)
- else:
- namest = ientry + 1
+ namest = entry_sec.get("namest")
+ nameend = entry_sec.get("nameend")
+ start = (
+ int(namest)
+ if isinstance(namest, str) and namest.isnumeric()
+ else ientry + 1
+ )
if isinstance(nameend, str) and nameend.isnumeric():
- nameend = int(nameend)
+ end = int(nameend)
shift = 0
else:
- nameend = ientry + 2
+ end = ientry + 2
shift = 1
- if nameend > len(tg_range["cell_offst"]):
+ if end > len(tg_range["cell_offst"]):
wrong_nbr_cols = True
self.nbr_messages += 1
if self.nbr_messages <= self.max_nbr_messages:
@@ -1608,8 +1629,8 @@ class XmlTable:
break
range_ = [
- tg_range["cell_offst"][namest - 1],
- tg_range["cell_offst"][nameend - 1] - shift,
+ tg_range["cell_offst"][start - 1],
+ tg_range["cell_offst"][end - 1] - shift,
]
# add row and replicate cell if needed
@@ -1668,7 +1689,7 @@ class XmlTable:
A docling table data.
"""
section = self._soup.find("table")
- if section is not None:
+ if isinstance(section, Tag):
table = self._parse_table(section)
if table.num_rows == 0 or table.num_cols == 0:
_log.warning("The parsed USPTO table is empty")
diff --git a/poetry.lock b/poetry.lock
index f1887d7..329e4ae 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
[[package]]
name = "aiohappyeyeballs"
@@ -282,17 +282,18 @@ testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-ch
[[package]]
name = "beautifulsoup4"
-version = "4.12.3"
+version = "4.13.3"
description = "Screen-scraping library"
optional = false
-python-versions = ">=3.6.0"
+python-versions = ">=3.7.0"
files = [
- {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
- {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
+ {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"},
+ {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"},
]
[package.dependencies]
soupsieve = ">1.2"
+typing-extensions = ">=4.0.0"
[package.extras]
cchardet = ["cchardet"]
@@ -866,13 +867,13 @@ files = [
[[package]]
name = "docling-core"
-version = "2.19.0"
+version = "2.19.1"
description = "A python library to define and validate data types in Docling."
optional = false
python-versions = "<4.0,>=3.9"
files = [
- {file = "docling_core-2.19.0-py3-none-any.whl", hash = "sha256:caa1e13d98fa9a00608091c386609c75b3560c7291e842c252f0b6f8d5812dbd"},
- {file = "docling_core-2.19.0.tar.gz", hash = "sha256:ebf3062e31155bb5f0e6132056a2d239a0e6e693a75c5758886909bb9fef461a"},
+ {file = "docling_core-2.19.1-py3-none-any.whl", hash = "sha256:ca7bd4dacd75611c5ea4f205192b71a8f22205e615eff1a16aac7082644d3b2e"},
+ {file = "docling_core-2.19.1.tar.gz", hash = "sha256:e2769b816c669cdf27024dd3b219d3ecaf2161691dd5e8e5e8ce439557ea0928"},
]
[package.dependencies]
@@ -1357,13 +1358,13 @@ colorama = ">=0.4"
[[package]]
name = "griffe-pydantic"
-version = "1.1.0"
+version = "1.1.2"
description = "Griffe extension for Pydantic."
optional = false
python-versions = ">=3.9"
files = [
- {file = "griffe_pydantic-1.1.0-py3-none-any.whl", hash = "sha256:ac9cc2d9b016cf302d8d9f577c9b3ca2793d88060f500d0b2a65f33a4a785cf1"},
- {file = "griffe_pydantic-1.1.0.tar.gz", hash = "sha256:9c5a701cc485dab087857c1ac960b44671acee5008aaae0752f610b2aa82b068"},
+ {file = "griffe_pydantic-1.1.2-py3-none-any.whl", hash = "sha256:8ad53218ca6e9c24ccec83588eb435f562b30355f641fe336e81b1e00ea05f3c"},
+ {file = "griffe_pydantic-1.1.2.tar.gz", hash = "sha256:381eacd8854a85811522b4f6dc9a1ef0fb5931825081379d70ff3a425b0d4ea1"},
]
[package.dependencies]
@@ -7052,18 +7053,18 @@ vision = ["Pillow (>=10.0.1,<=15.0)"]
[[package]]
name = "transformers"
-version = "4.48.3"
+version = "4.49.0"
description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
optional = false
python-versions = ">=3.9.0"
files = [
- {file = "transformers-4.48.3-py3-none-any.whl", hash = "sha256:78697f990f5ef350c23b46bf86d5081ce96b49479ab180b2de7687267de8fd36"},
- {file = "transformers-4.48.3.tar.gz", hash = "sha256:a5e8f1e9a6430aa78215836be70cecd3f872d99eeda300f41ad6cc841724afdb"},
+ {file = "transformers-4.49.0-py3-none-any.whl", hash = "sha256:6b4fded1c5fee04d384b1014495b4235a2b53c87503d7d592423c06128cbbe03"},
+ {file = "transformers-4.49.0.tar.gz", hash = "sha256:7e40e640b5b8dc3f48743f5f5adbdce3660c82baafbd3afdfc04143cdbd2089e"},
]
[package.dependencies]
filelock = "*"
-huggingface-hub = ">=0.24.0,<1.0"
+huggingface-hub = ">=0.26.0,<1.0"
numpy = ">=1.17"
packaging = ">=20.0"
pyyaml = ">=5.1"
@@ -7076,13 +7077,13 @@ tqdm = ">=4.27"
[package.extras]
accelerate = ["accelerate (>=0.26.0)"]
agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=2.0)"]
-all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"]
+all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"]
audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
benchmark = ["optimum-benchmark (>=0.3.0)"]
codecarbon = ["codecarbon (>=2.8.1)"]
deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"]
deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
-dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
+dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "urllib3 (<2.0.0)"]
dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"]
@@ -7115,8 +7116,8 @@ tokenizers = ["tokenizers (>=0.21,<0.22)"]
torch = ["accelerate (>=0.26.0)", "torch (>=2.0)"]
torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"]
-torchhub = ["filelock", "huggingface-hub (>=0.24.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "tqdm (>=4.27)"]
-video = ["av (==9.2.0)"]
+torchhub = ["filelock", "huggingface-hub (>=0.26.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "tqdm (>=4.27)"]
+video = ["av"]
vision = ["Pillow (>=10.0.1,<=15.0)"]
[[package]]
@@ -7841,4 +7842,4 @@ vlm = ["transformers", "transformers"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
-content-hash = "2cca8bac31dd535e36045cf2f5f0380852c34f6bafad78834144d6ca56d2d79c"
+content-hash = "63f9271160d39cac74fa3fc959dbb0f91530d76a693c69d81ced006477d04315"
diff --git a/pyproject.toml b/pyproject.toml
index 6b61da8..0c04acf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,7 +46,7 @@ scipy = [
typer = "^0.12.5"
python-docx = "^1.1.2"
python-pptx = "^1.0.2"
-beautifulsoup4 = ">=4.12.3,<4.13.0"
+beautifulsoup4 = "^4.12.3"
pandas = "^2.1.4"
marko = "^2.1.2"
openpyxl = "^3.1.5"
@@ -166,7 +166,6 @@ module = [
"ocrmac.*",
"deepsearch_glm.*",
"lxml.*",
- "bs4.*",
"huggingface_hub.*",
"transformers.*",
]
diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt b/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
index 2d4a316..3ae39e8 100644
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
@@ -410,68 +410,65 @@ item-0 at level 0: unspecified: group _root_
item-396 at level 3: list: group list
item-397 at level 4: list_item: list of books (useful looking abstracts)
item-398 at level 4: list_item: Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine
- item-399 at level 4: list_item:
- item-400 at level 4: list_item: Ducks at a Distance, by Rob Hine ... uide to identification of US waterfowl
- item-401 at level 3: table with [3x2]
- item-402 at level 3: picture
- item-403 at level 3: list: group list
- item-404 at level 4: list_item: Ducks
- item-405 at level 4: list_item: Game birds
- item-406 at level 4: list_item: Bird common names
- item-407 at level 3: list: group list
- item-408 at level 4: list_item: All accuracy disputes
- item-409 at level 4: list_item: Accuracy disputes from February 2020
- item-410 at level 4: list_item: CS1 Finnish-language sources (fi)
- item-411 at level 4: list_item: CS1 Latvian-language sources (lv)
- item-412 at level 4: list_item: CS1 Swedish-language sources (sv)
- item-413 at level 4: list_item: Articles with short description
- item-414 at level 4: list_item: Short description is different from Wikidata
- item-415 at level 4: list_item: Wikipedia indefinitely move-protected pages
- item-416 at level 4: list_item: Wikipedia indefinitely semi-protected pages
- item-417 at level 4: list_item: Articles with 'species' microformats
- item-418 at level 4: list_item: Articles containing Old English (ca. 450-1100)-language text
- item-419 at level 4: list_item: Articles containing Dutch-language text
- item-420 at level 4: list_item: Articles containing German-language text
- item-421 at level 4: list_item: Articles containing Norwegian-language text
- item-422 at level 4: list_item: Articles containing Lithuanian-language text
- item-423 at level 4: list_item: Articles containing Ancient Greek (to 1453)-language text
- item-424 at level 4: list_item: All articles with self-published sources
- item-425 at level 4: list_item: Articles with self-published sources from February 2020
- item-426 at level 4: list_item: All articles with unsourced statements
- item-427 at level 4: list_item: Articles with unsourced statements from January 2022
- item-428 at level 4: list_item: CS1: long volume value
- item-429 at level 4: list_item: Pages using Sister project links with wikidata mismatch
- item-430 at level 4: list_item: Pages using Sister project links with hidden wikidata
- item-431 at level 4: list_item: Webarchive template wayback links
- item-432 at level 4: list_item: Articles with Project Gutenberg links
- item-433 at level 4: list_item: Articles containing video clips
- item-434 at level 3: list: group list
- item-435 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC).
- item-436 at level 4: list_item: Text is available under the Crea ... tion, Inc., a non-profit organization.
- item-437 at level 3: list: group list
- item-438 at level 4: list_item: Privacy policy
- item-439 at level 4: list_item: About Wikipedia
- item-440 at level 4: list_item: Disclaimers
- item-441 at level 4: list_item: Contact Wikipedia
- item-442 at level 4: list_item: Code of Conduct
- item-443 at level 4: list_item: Developers
- item-444 at level 4: list_item: Statistics
- item-445 at level 4: list_item: Cookie statement
- item-446 at level 4: list_item: Mobile view
+ item-399 at level 4: list_item: Ducks at a Distance, by Rob Hine ... uide to identification of US waterfowl
+ item-400 at level 3: table with [3x2]
+ item-401 at level 3: picture
+ item-402 at level 3: list: group list
+ item-403 at level 4: list_item: Ducks
+ item-404 at level 4: list_item: Game birds
+ item-405 at level 4: list_item: Bird common names
+ item-406 at level 3: list: group list
+ item-407 at level 4: list_item: All accuracy disputes
+ item-408 at level 4: list_item: Accuracy disputes from February 2020
+ item-409 at level 4: list_item: CS1 Finnish-language sources (fi)
+ item-410 at level 4: list_item: CS1 Latvian-language sources (lv)
+ item-411 at level 4: list_item: CS1 Swedish-language sources (sv)
+ item-412 at level 4: list_item: Articles with short description
+ item-413 at level 4: list_item: Short description is different from Wikidata
+ item-414 at level 4: list_item: Wikipedia indefinitely move-protected pages
+ item-415 at level 4: list_item: Wikipedia indefinitely semi-protected pages
+ item-416 at level 4: list_item: Articles with 'species' microformats
+ item-417 at level 4: list_item: Articles containing Old English (ca. 450-1100)-language text
+ item-418 at level 4: list_item: Articles containing Dutch-language text
+ item-419 at level 4: list_item: Articles containing German-language text
+ item-420 at level 4: list_item: Articles containing Norwegian-language text
+ item-421 at level 4: list_item: Articles containing Lithuanian-language text
+ item-422 at level 4: list_item: Articles containing Ancient Greek (to 1453)-language text
+ item-423 at level 4: list_item: All articles with self-published sources
+ item-424 at level 4: list_item: Articles with self-published sources from February 2020
+ item-425 at level 4: list_item: All articles with unsourced statements
+ item-426 at level 4: list_item: Articles with unsourced statements from January 2022
+ item-427 at level 4: list_item: CS1: long volume value
+ item-428 at level 4: list_item: Pages using Sister project links with wikidata mismatch
+ item-429 at level 4: list_item: Pages using Sister project links with hidden wikidata
+ item-430 at level 4: list_item: Webarchive template wayback links
+ item-431 at level 4: list_item: Articles with Project Gutenberg links
+ item-432 at level 4: list_item: Articles containing video clips
+ item-433 at level 3: list: group list
+ item-434 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC).
+ item-435 at level 4: list_item: Text is available under the Crea ... tion, Inc., a non-profit organization.
+ item-436 at level 3: list: group list
+ item-437 at level 4: list_item: Privacy policy
+ item-438 at level 4: list_item: About Wikipedia
+ item-439 at level 4: list_item: Disclaimers
+ item-440 at level 4: list_item: Contact Wikipedia
+ item-441 at level 4: list_item: Code of Conduct
+ item-442 at level 4: list_item: Developers
+ item-443 at level 4: list_item: Statistics
+ item-444 at level 4: list_item: Cookie statement
+ item-445 at level 4: list_item: Mobile view
+ item-446 at level 3: list: group list
item-447 at level 3: list: group list
- item-448 at level 4: list_item:
- item-449 at level 4: list_item:
- item-450 at level 3: list: group list
- item-451 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
- item-452 at level 1: caption: Male mallard.
- item-453 at level 1: caption: Wood ducks.
- item-454 at level 1: caption: Mallard landing in approach
- item-455 at level 1: caption: Male Mandarin duck
- item-456 at level 1: caption: Flying steamer ducks in Ushuaia, Argentina
- item-457 at level 1: caption: Female mallard in Cornwall, England
- item-458 at level 1: caption: Pecten along the bill
- item-459 at level 1: caption: Mallard duckling preening
- item-460 at level 1: caption: A Muscovy duckling
- item-461 at level 1: caption: Ringed teal
- item-462 at level 1: caption: Indian Runner ducks, a common breed of domestic ducks
- item-463 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
\ No newline at end of file
+ item-448 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
+ item-449 at level 1: caption: Male mallard.
+ item-450 at level 1: caption: Wood ducks.
+ item-451 at level 1: caption: Mallard landing in approach
+ item-452 at level 1: caption: Male Mandarin duck
+ item-453 at level 1: caption: Flying steamer ducks in Ushuaia, Argentina
+ item-454 at level 1: caption: Female mallard in Cornwall, England
+ item-455 at level 1: caption: Pecten along the bill
+ item-456 at level 1: caption: Mallard duckling preening
+ item-457 at level 1: caption: A Muscovy duckling
+ item-458 at level 1: caption: Ringed teal
+ item-459 at level 1: caption: Indian Runner ducks, a common breed of domestic ducks
+ item-460 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.json b/tests/data/groundtruth/docling_v2/wiki_duck.html.json
index 196c903..e59c18f 100644
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.json
@@ -1413,9 +1413,6 @@
},
{
"$ref": "#/texts/350"
- },
- {
- "$ref": "#/texts/351"
}
],
"content_layer": "body",
@@ -1428,14 +1425,14 @@
"$ref": "#/texts/341"
},
"children": [
+ {
+ "$ref": "#/texts/351"
+ },
{
"$ref": "#/texts/352"
},
{
"$ref": "#/texts/353"
- },
- {
- "$ref": "#/texts/354"
}
],
"content_layer": "body",
@@ -1448,6 +1445,9 @@
"$ref": "#/texts/341"
},
"children": [
+ {
+ "$ref": "#/texts/354"
+ },
{
"$ref": "#/texts/355"
},
@@ -1522,9 +1522,6 @@
},
{
"$ref": "#/texts/379"
- },
- {
- "$ref": "#/texts/380"
}
],
"content_layer": "body",
@@ -1538,10 +1535,10 @@
},
"children": [
{
- "$ref": "#/texts/381"
+ "$ref": "#/texts/380"
},
{
- "$ref": "#/texts/382"
+ "$ref": "#/texts/381"
}
],
"content_layer": "body",
@@ -1554,6 +1551,9 @@
"$ref": "#/texts/341"
},
"children": [
+ {
+ "$ref": "#/texts/382"
+ },
{
"$ref": "#/texts/383"
},
@@ -1577,9 +1577,6 @@
},
{
"$ref": "#/texts/390"
- },
- {
- "$ref": "#/texts/391"
}
],
"content_layer": "body",
@@ -1591,14 +1588,7 @@
"parent": {
"$ref": "#/texts/341"
},
- "children": [
- {
- "$ref": "#/texts/392"
- },
- {
- "$ref": "#/texts/393"
- }
- ],
+ "children": [],
"content_layer": "body",
"name": "list",
"label": "list"
@@ -6774,27 +6764,13 @@
"content_layer": "body",
"label": "list_item",
"prov": [],
- "orig": "",
- "text": "",
- "enumerated": false,
- "marker": "-"
- },
- {
- "self_ref": "#/texts/351",
- "parent": {
- "$ref": "#/groups/42"
- },
- "children": [],
- "content_layer": "body",
- "label": "list_item",
- "prov": [],
"orig": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl",
"text": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl",
"enumerated": false,
"marker": "-"
},
{
- "self_ref": "#/texts/352",
+ "self_ref": "#/texts/351",
"parent": {
"$ref": "#/groups/43"
},
@@ -6808,7 +6784,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/353",
+ "self_ref": "#/texts/352",
"parent": {
"$ref": "#/groups/43"
},
@@ -6822,7 +6798,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/354",
+ "self_ref": "#/texts/353",
"parent": {
"$ref": "#/groups/43"
},
@@ -6836,7 +6812,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/355",
+ "self_ref": "#/texts/354",
"parent": {
"$ref": "#/groups/44"
},
@@ -6850,7 +6826,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/356",
+ "self_ref": "#/texts/355",
"parent": {
"$ref": "#/groups/44"
},
@@ -6864,7 +6840,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/357",
+ "self_ref": "#/texts/356",
"parent": {
"$ref": "#/groups/44"
},
@@ -6878,7 +6854,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/358",
+ "self_ref": "#/texts/357",
"parent": {
"$ref": "#/groups/44"
},
@@ -6892,7 +6868,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/359",
+ "self_ref": "#/texts/358",
"parent": {
"$ref": "#/groups/44"
},
@@ -6906,7 +6882,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/360",
+ "self_ref": "#/texts/359",
"parent": {
"$ref": "#/groups/44"
},
@@ -6920,7 +6896,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/361",
+ "self_ref": "#/texts/360",
"parent": {
"$ref": "#/groups/44"
},
@@ -6934,7 +6910,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/362",
+ "self_ref": "#/texts/361",
"parent": {
"$ref": "#/groups/44"
},
@@ -6948,7 +6924,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/363",
+ "self_ref": "#/texts/362",
"parent": {
"$ref": "#/groups/44"
},
@@ -6962,7 +6938,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/364",
+ "self_ref": "#/texts/363",
"parent": {
"$ref": "#/groups/44"
},
@@ -6976,7 +6952,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/365",
+ "self_ref": "#/texts/364",
"parent": {
"$ref": "#/groups/44"
},
@@ -6990,7 +6966,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/366",
+ "self_ref": "#/texts/365",
"parent": {
"$ref": "#/groups/44"
},
@@ -7004,7 +6980,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/367",
+ "self_ref": "#/texts/366",
"parent": {
"$ref": "#/groups/44"
},
@@ -7018,7 +6994,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/368",
+ "self_ref": "#/texts/367",
"parent": {
"$ref": "#/groups/44"
},
@@ -7032,7 +7008,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/369",
+ "self_ref": "#/texts/368",
"parent": {
"$ref": "#/groups/44"
},
@@ -7046,7 +7022,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/370",
+ "self_ref": "#/texts/369",
"parent": {
"$ref": "#/groups/44"
},
@@ -7060,7 +7036,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/371",
+ "self_ref": "#/texts/370",
"parent": {
"$ref": "#/groups/44"
},
@@ -7074,7 +7050,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/372",
+ "self_ref": "#/texts/371",
"parent": {
"$ref": "#/groups/44"
},
@@ -7088,7 +7064,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/373",
+ "self_ref": "#/texts/372",
"parent": {
"$ref": "#/groups/44"
},
@@ -7102,7 +7078,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/374",
+ "self_ref": "#/texts/373",
"parent": {
"$ref": "#/groups/44"
},
@@ -7116,7 +7092,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/375",
+ "self_ref": "#/texts/374",
"parent": {
"$ref": "#/groups/44"
},
@@ -7130,7 +7106,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/376",
+ "self_ref": "#/texts/375",
"parent": {
"$ref": "#/groups/44"
},
@@ -7144,7 +7120,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/377",
+ "self_ref": "#/texts/376",
"parent": {
"$ref": "#/groups/44"
},
@@ -7158,7 +7134,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/378",
+ "self_ref": "#/texts/377",
"parent": {
"$ref": "#/groups/44"
},
@@ -7172,7 +7148,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/379",
+ "self_ref": "#/texts/378",
"parent": {
"$ref": "#/groups/44"
},
@@ -7186,7 +7162,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/380",
+ "self_ref": "#/texts/379",
"parent": {
"$ref": "#/groups/44"
},
@@ -7200,7 +7176,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/381",
+ "self_ref": "#/texts/380",
"parent": {
"$ref": "#/groups/45"
},
@@ -7214,7 +7190,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/382",
+ "self_ref": "#/texts/381",
"parent": {
"$ref": "#/groups/45"
},
@@ -7228,7 +7204,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/383",
+ "self_ref": "#/texts/382",
"parent": {
"$ref": "#/groups/46"
},
@@ -7242,7 +7218,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/384",
+ "self_ref": "#/texts/383",
"parent": {
"$ref": "#/groups/46"
},
@@ -7256,7 +7232,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/385",
+ "self_ref": "#/texts/384",
"parent": {
"$ref": "#/groups/46"
},
@@ -7270,7 +7246,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/386",
+ "self_ref": "#/texts/385",
"parent": {
"$ref": "#/groups/46"
},
@@ -7284,7 +7260,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/387",
+ "self_ref": "#/texts/386",
"parent": {
"$ref": "#/groups/46"
},
@@ -7298,7 +7274,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/388",
+ "self_ref": "#/texts/387",
"parent": {
"$ref": "#/groups/46"
},
@@ -7312,7 +7288,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/389",
+ "self_ref": "#/texts/388",
"parent": {
"$ref": "#/groups/46"
},
@@ -7326,7 +7302,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/390",
+ "self_ref": "#/texts/389",
"parent": {
"$ref": "#/groups/46"
},
@@ -7340,7 +7316,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/391",
+ "self_ref": "#/texts/390",
"parent": {
"$ref": "#/groups/46"
},
@@ -7352,34 +7328,6 @@
"text": "Mobile view",
"enumerated": false,
"marker": "-"
- },
- {
- "self_ref": "#/texts/392",
- "parent": {
- "$ref": "#/groups/47"
- },
- "children": [],
- "content_layer": "body",
- "label": "list_item",
- "prov": [],
- "orig": "",
- "text": "",
- "enumerated": false,
- "marker": "-"
- },
- {
- "self_ref": "#/texts/393",
- "parent": {
- "$ref": "#/groups/47"
- },
- "children": [],
- "content_layer": "body",
- "label": "list_item",
- "prov": [],
- "orig": "",
- "text": "",
- "enumerated": false,
- "marker": "-"
}
],
"pictures": [
diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.md b/tests/data/groundtruth/docling_v2/wiki_duck.html.md
index df4554f..bd3f3c3 100644
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.md
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.md
@@ -473,7 +473,6 @@ The 1992 Disney film The Mighty Ducks, starring Emilio Estevez, chose the duck a
- list of books (useful looking abstracts)
- Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine
--
- Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl
| Authority control databases | Authority control databases |
@@ -526,7 +525,4 @@ additional terms may apply. By using this site, you agree to the Terms of Use an
- Developers
- Statistics
- Cookie statement
-- Mobile view
-
--
--
\ No newline at end of file
+- Mobile view
\ No newline at end of file
| |