diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 286dfbf..234e5da 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -1,9 +1,9 @@
import logging
from io import BytesIO
from pathlib import Path
-from typing import Optional, Set, Union
+from typing import Optional, Union, cast
-from bs4 import BeautifulSoup, Tag
+from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
@@ -12,6 +12,7 @@ from docling_core.types.doc import (
TableCell,
TableData,
)
+from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
@@ -21,6 +22,7 @@ _log = logging.getLogger(__name__)
class HTMLDocumentBackend(DeclarativeDocumentBackend):
+ @override
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
_log.debug("About to init HTML backend...")
@@ -48,13 +50,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
f"Could not initialize HTML backend for file with hash {self.document_hash}."
) from e
+ @override
def is_valid(self) -> bool:
return self.soup is not None
@classmethod
+ @override
def supports_pagination(cls) -> bool:
return False
+ @override
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
@@ -62,9 +67,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.path_or_stream = None
@classmethod
- def supported_formats(cls) -> Set[InputFormat]:
+ @override
+ def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.HTML}
+ @override
def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff
origin = DocumentOrigin(
@@ -80,98 +87,78 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
assert self.soup is not None
content = self.soup.body or self.soup
# Replace
tags with newline characters
- for br in content.find_all("br"):
- br.replace_with("\n")
- doc = self.walk(content, doc)
+ for br in content("br"):
+ br.replace_with(NavigableString("\n"))
+ self.walk(content, doc)
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
)
return doc
- def walk(self, element: Tag, doc: DoclingDocument):
- try:
- # Iterate over elements in the body of the document
- for idx, element in enumerate(element.children):
+ def walk(self, tag: Tag, doc: DoclingDocument) -> None:
+ # Iterate over elements in the body of the document
+ for element in tag.children:
+ if isinstance(element, Tag):
try:
- self.analyse_element(element, idx, doc)
+ self.analyze_tag(cast(Tag, element), doc)
except Exception as exc_child:
-
- _log.error(" -> error treating child: ", exc_child)
- _log.error(" => element: ", element, "\n")
+ _log.error(
+                        f"Error processing child from tag {tag.name}: {exc_child}"
+ )
raise exc_child
- except Exception as exc:
- pass
+ return
- return doc
-
- def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
- """
- if element.name!=None:
- _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
- """
-
- if element.name in self.labels:
- self.labels[element.name] += 1
+ def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
+ if tag.name in self.labels:
+ self.labels[tag.name] += 1
else:
- self.labels[element.name] = 1
+ self.labels[tag.name] = 1
- if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
- self.handle_header(element, idx, doc)
- elif element.name in ["p"]:
- self.handle_paragraph(element, idx, doc)
- elif element.name in ["pre"]:
- self.handle_code(element, idx, doc)
- elif element.name in ["ul", "ol"]:
- self.handle_list(element, idx, doc)
- elif element.name in ["li"]:
- self.handle_listitem(element, idx, doc)
- elif element.name == "table":
- self.handle_table(element, idx, doc)
- elif element.name == "figure":
- self.handle_figure(element, idx, doc)
- elif element.name == "img":
- self.handle_image(element, idx, doc)
+ if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
+ self.handle_header(tag, doc)
+ elif tag.name in ["p"]:
+ self.handle_paragraph(tag, doc)
+ elif tag.name in ["pre"]:
+ self.handle_code(tag, doc)
+ elif tag.name in ["ul", "ol"]:
+ self.handle_list(tag, doc)
+ elif tag.name in ["li"]:
+ self.handle_list_item(tag, doc)
+ elif tag.name == "table":
+ self.handle_table(tag, doc)
+ elif tag.name == "figure":
+ self.handle_figure(tag, doc)
+ elif tag.name == "img":
+ self.handle_image(doc)
else:
- self.walk(element, doc)
+ self.walk(tag, doc)
- def get_direct_text(self, item: Tag):
- """Get the direct text of the
element (ignoring nested lists)."""
- text = item.find(string=True, recursive=False)
- if isinstance(text, str):
- return text.strip()
+ def get_text(self, item: PageElement) -> str:
+ """Get the text content of a tag."""
+ parts: list[str] = self.extract_text_recursively(item)
- return ""
+ return "".join(parts) + " "
# Function to recursively extract text from all child nodes
- def extract_text_recursively(self, item: Tag):
- result = []
+ def extract_text_recursively(self, item: PageElement) -> list[str]:
+ result: list[str] = []
- if isinstance(item, str):
+ if isinstance(item, NavigableString):
return [item]
- if item.name not in ["ul", "ol"]:
- try:
- # Iterate over the children (and their text and tails)
- for child in item:
- try:
- # Recursively get the child's text content
- result.extend(self.extract_text_recursively(child))
- except:
- pass
- except:
- _log.warn("item has no children")
- pass
+ tag = cast(Tag, item)
+ if tag.name not in ["ul", "ol"]:
+ for child in tag:
+ # Recursively get the child's text content
+ result.extend(self.extract_text_recursively(child))
- return "".join(result) + " "
+ return ["".join(result) + " "]
- def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
+ def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles header tags (h1, h2, etc.)."""
hlevel = int(element.name.replace("h", ""))
- slevel = hlevel - 1
-
- label = DocItemLabel.SECTION_HEADER
text = element.text.strip()
if hlevel == 1:
@@ -197,7 +184,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
elif hlevel < self.level:
# remove the tail
- for key, val in self.parents.items():
+ for key in self.parents.keys():
if key > hlevel:
self.parents[key] = None
self.level = hlevel
@@ -208,27 +195,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
level=hlevel,
)
- def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
+ def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles monospace code snippets (pre)."""
if element.text is None:
return
text = element.text.strip()
- label = DocItemLabel.CODE
- if len(text) == 0:
- return
- doc.add_code(parent=self.parents[self.level], text=text)
+ if text:
+ doc.add_code(parent=self.parents[self.level], text=text)
- def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
+ def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles paragraph tags (p)."""
if element.text is None:
return
text = element.text.strip()
label = DocItemLabel.PARAGRAPH
- if len(text) == 0:
- return
- doc.add_text(parent=self.parents[self.level], label=label, text=text)
+ if text:
+ doc.add_text(parent=self.parents[self.level], label=label, text=text)
- def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
+ def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles list tags (ul, ol) and their list items."""
if element.name == "ul":
@@ -250,18 +234,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.parents[self.level + 1] = None
self.level -= 1
- def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
+ def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles listitem tags (li)."""
- nested_lists = element.find(["ul", "ol"])
+ nested_list = element.find(["ul", "ol"])
parent_list_label = self.parents[self.level].label
index_in_list = len(self.parents[self.level].children) + 1
- if nested_lists:
- name = element.name
+ if nested_list:
# Text in list item can be hidden within hierarchy, hence
# we need to extract it recursively
- text = self.extract_text_recursively(element)
+ text: str = self.get_text(element)
# Flatten text, remove break lines:
text = text.replace("\n", "").replace("\r", "")
text = " ".join(text.split()).strip()
@@ -287,7 +270,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.parents[self.level + 1] = None
self.level -= 1
- elif isinstance(element.text, str):
+ elif element.text.strip():
text = element.text.strip()
marker = ""
@@ -302,59 +285,79 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[self.level],
)
else:
- _log.warn("list-item has no text: ", element)
-
- def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
- """Handles table tags."""
+ _log.warning(f"list-item has no text: {element}")
+ @staticmethod
+ def parse_table_data(element: Tag) -> Optional[TableData]:
nested_tables = element.find("table")
if nested_tables is not None:
- _log.warn("detected nested tables: skipping for now")
- return
+ _log.warning("Skipping nested table.")
+ return None
# Count the number of rows (number of elements)
- num_rows = len(element.find_all("tr"))
+ num_rows = len(element("tr"))
# Find the number of columns (taking into account colspan)
num_cols = 0
- for row in element.find_all("tr"):
+ for row in element("tr"):
col_count = 0
- for cell in row.find_all(["td", "th"]):
- colspan = int(cell.get("colspan", 1))
+ if not isinstance(row, Tag):
+ continue
+ for cell in row(["td", "th"]):
+                if not isinstance(cell, Tag):
+ continue
+ val = cast(Tag, cell).get("colspan", "1")
+ colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
col_count += colspan
num_cols = max(num_cols, col_count)
- grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
+ grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
# Iterate over the rows in the table
- for row_idx, row in enumerate(element.find_all("tr")):
+ for row_idx, row in enumerate(element("tr")):
+ if not isinstance(row, Tag):
+ continue
# For each row, find all the column cells (both and | )
- cells = row.find_all(["td", "th"])
+ cells = row(["td", "th"])
# Check if each cell in the row is a header -> means it is a column header
col_header = True
- for j, html_cell in enumerate(cells):
- if html_cell.name == "td":
+ for html_cell in cells:
+ if isinstance(html_cell, Tag) and html_cell.name == "td":
col_header = False
+ # Extract the text content of each cell
col_idx = 0
- # Extract and print the text content of each cell
- for _, html_cell in enumerate(cells):
+ for html_cell in cells:
+ if not isinstance(html_cell, Tag):
+ continue
+ # extract inline formulas
+ for formula in html_cell("inline-formula"):
+ math_parts = formula.text.split("$$")
+ if len(math_parts) == 3:
+ math_formula = f"$${math_parts[1]}$$"
+ formula.replace_with(NavigableString(math_formula))
+
+ # TODO: extract content correctly from table-cells with lists
text = html_cell.text
- try:
- text = self.extract_table_cell_text(html_cell)
- except Exception as exc:
- _log.warn("exception: ", exc)
- exit(-1)
# label = html_cell.name
-
- col_span = int(html_cell.get("colspan", 1))
- row_span = int(html_cell.get("rowspan", 1))
+ col_val = html_cell.get("colspan", "1")
+ col_span = (
+ int(col_val)
+ if isinstance(col_val, str) and col_val.isnumeric()
+ else 1
+ )
+ row_val = html_cell.get("rowspan", "1")
+ row_span = (
+ int(row_val)
+ if isinstance(row_val, str) and row_val.isnumeric()
+ else 1
+ )
while grid[row_idx][col_idx] is not None:
col_idx += 1
@@ -362,7 +365,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
for c in range(col_span):
grid[row_idx + r][col_idx + c] = text
- cell = TableCell(
+ table_cell = TableCell(
text=text,
row_span=row_span,
col_span=col_span,
@@ -373,57 +376,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
col_header=col_header,
row_header=((not col_header) and html_cell.name == "th"),
)
- data.table_cells.append(cell)
+ data.table_cells.append(table_cell)
- doc.add_table(data=data, parent=self.parents[self.level])
+ return data
- def get_list_text(self, list_element: Tag, level=0):
+ def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
+ """Handles table tags."""
+
+ table_data = HTMLDocumentBackend.parse_table_data(element)
+
+ if table_data is not None:
+ doc.add_table(data=table_data, parent=self.parents[self.level])
+
+ def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
"""Recursively extract text from or with proper indentation."""
result = []
bullet_char = "*" # Default bullet character for unordered lists
if list_element.name == "ol": # For ordered lists, use numbers
- for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
+ for i, li in enumerate(list_element("li", recursive=False), 1):
+ if not isinstance(li, Tag):
+ continue
# Add numbering for ordered lists
result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
# Handle nested lists
nested_list = li.find(["ul", "ol"])
- if nested_list:
+ if isinstance(nested_list, Tag):
result.extend(self.get_list_text(nested_list, level + 1))
elif list_element.name == "ul": # For unordered lists, use bullet points
- for li in list_element.find_all("li", recursive=False):
+ for li in list_element("li", recursive=False):
+ if not isinstance(li, Tag):
+ continue
# Add bullet points for unordered lists
result.append(
f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
)
# Handle nested lists
nested_list = li.find(["ul", "ol"])
- if nested_list:
+ if isinstance(nested_list, Tag):
result.extend(self.get_list_text(nested_list, level + 1))
return result
- def extract_table_cell_text(self, cell: Tag):
- """Extract text from a table cell, including lists with indents."""
- contains_lists = cell.find(["ul", "ol"])
- if contains_lists is None:
- return cell.text
- else:
- _log.debug(
- "should extract the content correctly for table-cells with lists ..."
- )
- return cell.text
-
- def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
+ def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles image tags (img)."""
# Extract the image URI from the tag
# image_uri = root.xpath('//figure//img/@src')[0]
contains_captions = element.find(["figcaption"])
- if contains_captions is None:
+ if not isinstance(contains_captions, Tag):
doc.add_picture(parent=self.parents[self.level], caption=None)
-
else:
texts = []
for item in contains_captions:
@@ -437,6 +440,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
caption=fig_caption,
)
- def handle_image(self, element: Tag, idx, doc: DoclingDocument):
+ def handle_image(self, doc: DoclingDocument) -> None:
"""Handles image tags (img)."""
doc.add_picture(parent=self.parents[self.level], caption=None)
diff --git a/docling/backend/xml/jats_backend.py b/docling/backend/xml/jats_backend.py
index 1d7091c..2409961 100755
--- a/docling/backend/xml/jats_backend.py
+++ b/docling/backend/xml/jats_backend.py
@@ -4,7 +4,7 @@ from io import BytesIO
from pathlib import Path
from typing import Final, Optional, Union
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
@@ -12,14 +12,13 @@ from docling_core.types.doc import (
GroupItem,
GroupLabel,
NodeItem,
- TableCell,
- TableData,
TextItem,
)
from lxml import etree
from typing_extensions import TypedDict, override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
@@ -540,71 +539,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
) -> None:
soup = BeautifulSoup(table_xml_component["content"], "html.parser")
table_tag = soup.find("table")
-
- nested_tables = table_tag.find("table")
- if nested_tables:
- _log.warning(f"Skipping nested table in {str(self.file)}")
+ if not isinstance(table_tag, Tag):
return
- # Count the number of rows (number of elements)
- num_rows = len(table_tag.find_all("tr"))
-
- # Find the number of columns (taking into account colspan)
- num_cols = 0
- for row in table_tag.find_all("tr"):
- col_count = 0
- for cell in row.find_all(["td", "th"]):
- colspan = int(cell.get("colspan", 1))
- col_count += colspan
- num_cols = max(num_cols, col_count)
-
- grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
-
- data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
-
- # Iterate over the rows in the table
- for row_idx, row in enumerate(table_tag.find_all("tr")):
- # For each row, find all the column cells (both and | )
- cells = row.find_all(["td", "th"])
-
- # Check if each cell in the row is a header -> means it is a column header
- col_header = True
- for j, html_cell in enumerate(cells):
- if html_cell.name == "td":
- col_header = False
-
- # Extract and print the text content of each cell
- col_idx = 0
- for _, html_cell in enumerate(cells):
- # extract inline formulas
- for formula in html_cell.find_all("inline-formula"):
- math_parts = formula.text.split("$$")
- if len(math_parts) == 3:
- math_formula = f"$${math_parts[1]}$$"
- formula.replaceWith(math_formula)
- text = html_cell.text
-
- col_span = int(html_cell.get("colspan", 1))
- row_span = int(html_cell.get("rowspan", 1))
-
- while grid[row_idx][col_idx] is not None:
- col_idx += 1
- for r in range(row_span):
- for c in range(col_span):
- grid[row_idx + r][col_idx + c] = text
-
- cell = TableCell(
- text=text,
- row_span=row_span,
- col_span=col_span,
- start_row_offset_idx=row_idx,
- end_row_offset_idx=row_idx + row_span,
- start_col_offset_idx=col_idx,
- end_col_offset_idx=col_idx + col_span,
- col_header=col_header,
- row_header=((not col_header) and html_cell.name == "th"),
- )
- data.table_cells.append(cell)
+ data = HTMLDocumentBackend.parse_table_data(table_tag)
# TODO: format label vs caption once styling is supported
label = table_xml_component["label"]
@@ -616,7 +554,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
else None
)
- doc.add_table(data=data, parent=parent, caption=table_caption)
+ if data is not None:
+ doc.add_table(data=data, parent=parent, caption=table_caption)
return
@@ -673,7 +612,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
def _walk_linear(
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> str:
- # _log.debug(f"Walking on {node.tag} with {len(list(node))} children")
skip_tags = ["term"]
flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
new_parent: NodeItem = parent
diff --git a/docling/backend/xml/uspto_backend.py b/docling/backend/xml/uspto_backend.py
index 21001ab..cf23e04 100644
--- a/docling/backend/xml/uspto_backend.py
+++ b/docling/backend/xml/uspto_backend.py
@@ -14,7 +14,7 @@ from abc import ABC, abstractmethod
from enum import Enum, unique
from io import BytesIO
from pathlib import Path
-from typing import Any, Final, Optional, Union
+from typing import Final, Optional, Union
from bs4 import BeautifulSoup, Tag
from docling_core.types.doc import (
@@ -1406,6 +1406,10 @@ class XmlTable:
http://oasis-open.org/specs/soextblx.dtd
"""
+ class ColInfo(TypedDict):
+ ncols: int
+ colinfo: list[dict]
+
class MinColInfoType(TypedDict):
offset: list[int]
colwidth: list[int]
@@ -1425,7 +1429,7 @@ class XmlTable:
self.empty_text = ""
self._soup = BeautifulSoup(input, features="xml")
- def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]:
+ def _create_tg_range(self, tgs: list[ColInfo]) -> dict[int, ColInfoType]:
"""Create a unified range along the table groups.
Args:
@@ -1532,19 +1536,26 @@ class XmlTable:
Returns:
A docling table object.
"""
- tgs_align = []
- tg_secs = table.find_all("tgroup")
+ tgs_align: list[XmlTable.ColInfo] = []
+ tg_secs = table("tgroup")
if tg_secs:
for tg_sec in tg_secs:
- ncols = tg_sec.get("cols", None)
- if ncols:
- ncols = int(ncols)
- tg_align = {"ncols": ncols, "colinfo": []}
- cs_secs = tg_sec.find_all("colspec")
+ if not isinstance(tg_sec, Tag):
+ continue
+ col_val = tg_sec.get("cols")
+ ncols = (
+ int(col_val)
+ if isinstance(col_val, str) and col_val.isnumeric()
+ else 1
+ )
+ tg_align: XmlTable.ColInfo = {"ncols": ncols, "colinfo": []}
+ cs_secs = tg_sec("colspec")
if cs_secs:
for cs_sec in cs_secs:
- colname = cs_sec.get("colname", None)
- colwidth = cs_sec.get("colwidth", None)
+ if not isinstance(cs_sec, Tag):
+ continue
+ colname = cs_sec.get("colname")
+ colwidth = cs_sec.get("colwidth")
tg_align["colinfo"].append(
{"colname": colname, "colwidth": colwidth}
)
@@ -1565,16 +1576,23 @@ class XmlTable:
table_data: list[TableCell] = []
i_row_global = 0
is_row_empty: bool = True
- tg_secs = table.find_all("tgroup")
+ tg_secs = table("tgroup")
if tg_secs:
for itg, tg_sec in enumerate(tg_secs):
+ if not isinstance(tg_sec, Tag):
+ continue
tg_range = tgs_range[itg]
- row_secs = tg_sec.find_all(["row", "tr"])
+ row_secs = tg_sec(["row", "tr"])
if row_secs:
for row_sec in row_secs:
- entry_secs = row_sec.find_all(["entry", "td"])
- is_header: bool = row_sec.parent.name in ["thead"]
+ if not isinstance(row_sec, Tag):
+ continue
+ entry_secs = row_sec(["entry", "td"])
+ is_header: bool = (
+ row_sec.parent is not None
+ and row_sec.parent.name == "thead"
+ )
ncols = 0
local_row: list[TableCell] = []
@@ -1582,23 +1600,26 @@ class XmlTable:
if entry_secs:
wrong_nbr_cols = False
for ientry, entry_sec in enumerate(entry_secs):
+ if not isinstance(entry_sec, Tag):
+ continue
text = entry_sec.get_text().strip()
# start-end
- namest = entry_sec.attrs.get("namest", None)
- nameend = entry_sec.attrs.get("nameend", None)
- if isinstance(namest, str) and namest.isnumeric():
- namest = int(namest)
- else:
- namest = ientry + 1
+ namest = entry_sec.get("namest")
+ nameend = entry_sec.get("nameend")
+ start = (
+ int(namest)
+ if isinstance(namest, str) and namest.isnumeric()
+ else ientry + 1
+ )
if isinstance(nameend, str) and nameend.isnumeric():
- nameend = int(nameend)
+ end = int(nameend)
shift = 0
else:
- nameend = ientry + 2
+ end = ientry + 2
shift = 1
- if nameend > len(tg_range["cell_offst"]):
+ if end > len(tg_range["cell_offst"]):
wrong_nbr_cols = True
self.nbr_messages += 1
if self.nbr_messages <= self.max_nbr_messages:
@@ -1608,8 +1629,8 @@ class XmlTable:
break
range_ = [
- tg_range["cell_offst"][namest - 1],
- tg_range["cell_offst"][nameend - 1] - shift,
+ tg_range["cell_offst"][start - 1],
+ tg_range["cell_offst"][end - 1] - shift,
]
# add row and replicate cell if needed
@@ -1668,7 +1689,7 @@ class XmlTable:
A docling table data.
"""
section = self._soup.find("table")
- if section is not None:
+ if isinstance(section, Tag):
table = self._parse_table(section)
if table.num_rows == 0 or table.num_cols == 0:
_log.warning("The parsed USPTO table is empty")
diff --git a/poetry.lock b/poetry.lock
index f1887d7..329e4ae 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
[[package]]
name = "aiohappyeyeballs"
@@ -282,17 +282,18 @@ testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-ch
[[package]]
name = "beautifulsoup4"
-version = "4.12.3"
+version = "4.13.3"
description = "Screen-scraping library"
optional = false
-python-versions = ">=3.6.0"
+python-versions = ">=3.7.0"
files = [
- {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
- {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
+ {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"},
+ {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"},
]
[package.dependencies]
soupsieve = ">1.2"
+typing-extensions = ">=4.0.0"
[package.extras]
cchardet = ["cchardet"]
@@ -866,13 +867,13 @@ files = [
[[package]]
name = "docling-core"
-version = "2.19.0"
+version = "2.19.1"
description = "A python library to define and validate data types in Docling."
optional = false
python-versions = "<4.0,>=3.9"
files = [
- {file = "docling_core-2.19.0-py3-none-any.whl", hash = "sha256:caa1e13d98fa9a00608091c386609c75b3560c7291e842c252f0b6f8d5812dbd"},
- {file = "docling_core-2.19.0.tar.gz", hash = "sha256:ebf3062e31155bb5f0e6132056a2d239a0e6e693a75c5758886909bb9fef461a"},
+ {file = "docling_core-2.19.1-py3-none-any.whl", hash = "sha256:ca7bd4dacd75611c5ea4f205192b71a8f22205e615eff1a16aac7082644d3b2e"},
+ {file = "docling_core-2.19.1.tar.gz", hash = "sha256:e2769b816c669cdf27024dd3b219d3ecaf2161691dd5e8e5e8ce439557ea0928"},
]
[package.dependencies]
@@ -1357,13 +1358,13 @@ colorama = ">=0.4"
[[package]]
name = "griffe-pydantic"
-version = "1.1.0"
+version = "1.1.2"
description = "Griffe extension for Pydantic."
optional = false
python-versions = ">=3.9"
files = [
- {file = "griffe_pydantic-1.1.0-py3-none-any.whl", hash = "sha256:ac9cc2d9b016cf302d8d9f577c9b3ca2793d88060f500d0b2a65f33a4a785cf1"},
- {file = "griffe_pydantic-1.1.0.tar.gz", hash = "sha256:9c5a701cc485dab087857c1ac960b44671acee5008aaae0752f610b2aa82b068"},
+ {file = "griffe_pydantic-1.1.2-py3-none-any.whl", hash = "sha256:8ad53218ca6e9c24ccec83588eb435f562b30355f641fe336e81b1e00ea05f3c"},
+ {file = "griffe_pydantic-1.1.2.tar.gz", hash = "sha256:381eacd8854a85811522b4f6dc9a1ef0fb5931825081379d70ff3a425b0d4ea1"},
]
[package.dependencies]
@@ -7052,18 +7053,18 @@ vision = ["Pillow (>=10.0.1,<=15.0)"]
[[package]]
name = "transformers"
-version = "4.48.3"
+version = "4.49.0"
description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
optional = false
python-versions = ">=3.9.0"
files = [
- {file = "transformers-4.48.3-py3-none-any.whl", hash = "sha256:78697f990f5ef350c23b46bf86d5081ce96b49479ab180b2de7687267de8fd36"},
- {file = "transformers-4.48.3.tar.gz", hash = "sha256:a5e8f1e9a6430aa78215836be70cecd3f872d99eeda300f41ad6cc841724afdb"},
+ {file = "transformers-4.49.0-py3-none-any.whl", hash = "sha256:6b4fded1c5fee04d384b1014495b4235a2b53c87503d7d592423c06128cbbe03"},
+ {file = "transformers-4.49.0.tar.gz", hash = "sha256:7e40e640b5b8dc3f48743f5f5adbdce3660c82baafbd3afdfc04143cdbd2089e"},
]
[package.dependencies]
filelock = "*"
-huggingface-hub = ">=0.24.0,<1.0"
+huggingface-hub = ">=0.26.0,<1.0"
numpy = ">=1.17"
packaging = ">=20.0"
pyyaml = ">=5.1"
@@ -7076,13 +7077,13 @@ tqdm = ">=4.27"
[package.extras]
accelerate = ["accelerate (>=0.26.0)"]
agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=2.0)"]
-all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"]
+all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"]
audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
benchmark = ["optimum-benchmark (>=0.3.0)"]
codecarbon = ["codecarbon (>=2.8.1)"]
deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"]
deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
-dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
+dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "urllib3 (<2.0.0)"]
dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"]
@@ -7115,8 +7116,8 @@ tokenizers = ["tokenizers (>=0.21,<0.22)"]
torch = ["accelerate (>=0.26.0)", "torch (>=2.0)"]
torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"]
-torchhub = ["filelock", "huggingface-hub (>=0.24.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "tqdm (>=4.27)"]
-video = ["av (==9.2.0)"]
+torchhub = ["filelock", "huggingface-hub (>=0.26.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "tqdm (>=4.27)"]
+video = ["av"]
vision = ["Pillow (>=10.0.1,<=15.0)"]
[[package]]
@@ -7841,4 +7842,4 @@ vlm = ["transformers", "transformers"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
-content-hash = "2cca8bac31dd535e36045cf2f5f0380852c34f6bafad78834144d6ca56d2d79c"
+content-hash = "63f9271160d39cac74fa3fc959dbb0f91530d76a693c69d81ced006477d04315"
diff --git a/pyproject.toml b/pyproject.toml
index 6b61da8..0c04acf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,7 +46,7 @@ scipy = [
typer = "^0.12.5"
python-docx = "^1.1.2"
python-pptx = "^1.0.2"
-beautifulsoup4 = ">=4.12.3,<4.13.0"
+beautifulsoup4 = "^4.12.3"
pandas = "^2.1.4"
marko = "^2.1.2"
openpyxl = "^3.1.5"
@@ -166,7 +166,6 @@ module = [
"ocrmac.*",
"deepsearch_glm.*",
"lxml.*",
- "bs4.*",
"huggingface_hub.*",
"transformers.*",
]
diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt b/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
index 2d4a316..3ae39e8 100644
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
@@ -410,68 +410,65 @@ item-0 at level 0: unspecified: group _root_
item-396 at level 3: list: group list
item-397 at level 4: list_item: list of books (useful looking abstracts)
item-398 at level 4: list_item: Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine
- item-399 at level 4: list_item:
- item-400 at level 4: list_item: Ducks at a Distance, by Rob Hine ... uide to identification of US waterfowl
- item-401 at level 3: table with [3x2]
- item-402 at level 3: picture
- item-403 at level 3: list: group list
- item-404 at level 4: list_item: Ducks
- item-405 at level 4: list_item: Game birds
- item-406 at level 4: list_item: Bird common names
- item-407 at level 3: list: group list
- item-408 at level 4: list_item: All accuracy disputes
- item-409 at level 4: list_item: Accuracy disputes from February 2020
- item-410 at level 4: list_item: CS1 Finnish-language sources (fi)
- item-411 at level 4: list_item: CS1 Latvian-language sources (lv)
- item-412 at level 4: list_item: CS1 Swedish-language sources (sv)
- item-413 at level 4: list_item: Articles with short description
- item-414 at level 4: list_item: Short description is different from Wikidata
- item-415 at level 4: list_item: Wikipedia indefinitely move-protected pages
- item-416 at level 4: list_item: Wikipedia indefinitely semi-protected pages
- item-417 at level 4: list_item: Articles with 'species' microformats
- item-418 at level 4: list_item: Articles containing Old English (ca. 450-1100)-language text
- item-419 at level 4: list_item: Articles containing Dutch-language text
- item-420 at level 4: list_item: Articles containing German-language text
- item-421 at level 4: list_item: Articles containing Norwegian-language text
- item-422 at level 4: list_item: Articles containing Lithuanian-language text
- item-423 at level 4: list_item: Articles containing Ancient Greek (to 1453)-language text
- item-424 at level 4: list_item: All articles with self-published sources
- item-425 at level 4: list_item: Articles with self-published sources from February 2020
- item-426 at level 4: list_item: All articles with unsourced statements
- item-427 at level 4: list_item: Articles with unsourced statements from January 2022
- item-428 at level 4: list_item: CS1: long volume value
- item-429 at level 4: list_item: Pages using Sister project links with wikidata mismatch
- item-430 at level 4: list_item: Pages using Sister project links with hidden wikidata
- item-431 at level 4: list_item: Webarchive template wayback links
- item-432 at level 4: list_item: Articles with Project Gutenberg links
- item-433 at level 4: list_item: Articles containing video clips
- item-434 at level 3: list: group list
- item-435 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC).
- item-436 at level 4: list_item: Text is available under the Crea ... tion, Inc., a non-profit organization.
- item-437 at level 3: list: group list
- item-438 at level 4: list_item: Privacy policy
- item-439 at level 4: list_item: About Wikipedia
- item-440 at level 4: list_item: Disclaimers
- item-441 at level 4: list_item: Contact Wikipedia
- item-442 at level 4: list_item: Code of Conduct
- item-443 at level 4: list_item: Developers
- item-444 at level 4: list_item: Statistics
- item-445 at level 4: list_item: Cookie statement
- item-446 at level 4: list_item: Mobile view
+ item-399 at level 4: list_item: Ducks at a Distance, by Rob Hine ... uide to identification of US waterfowl
+ item-400 at level 3: table with [3x2]
+ item-401 at level 3: picture
+ item-402 at level 3: list: group list
+ item-403 at level 4: list_item: Ducks
+ item-404 at level 4: list_item: Game birds
+ item-405 at level 4: list_item: Bird common names
+ item-406 at level 3: list: group list
+ item-407 at level 4: list_item: All accuracy disputes
+ item-408 at level 4: list_item: Accuracy disputes from February 2020
+ item-409 at level 4: list_item: CS1 Finnish-language sources (fi)
+ item-410 at level 4: list_item: CS1 Latvian-language sources (lv)
+ item-411 at level 4: list_item: CS1 Swedish-language sources (sv)
+ item-412 at level 4: list_item: Articles with short description
+ item-413 at level 4: list_item: Short description is different from Wikidata
+ item-414 at level 4: list_item: Wikipedia indefinitely move-protected pages
+ item-415 at level 4: list_item: Wikipedia indefinitely semi-protected pages
+ item-416 at level 4: list_item: Articles with 'species' microformats
+ item-417 at level 4: list_item: Articles containing Old English (ca. 450-1100)-language text
+ item-418 at level 4: list_item: Articles containing Dutch-language text
+ item-419 at level 4: list_item: Articles containing German-language text
+ item-420 at level 4: list_item: Articles containing Norwegian-language text
+ item-421 at level 4: list_item: Articles containing Lithuanian-language text
+ item-422 at level 4: list_item: Articles containing Ancient Greek (to 1453)-language text
+ item-423 at level 4: list_item: All articles with self-published sources
+ item-424 at level 4: list_item: Articles with self-published sources from February 2020
+ item-425 at level 4: list_item: All articles with unsourced statements
+ item-426 at level 4: list_item: Articles with unsourced statements from January 2022
+ item-427 at level 4: list_item: CS1: long volume value
+ item-428 at level 4: list_item: Pages using Sister project links with wikidata mismatch
+ item-429 at level 4: list_item: Pages using Sister project links with hidden wikidata
+ item-430 at level 4: list_item: Webarchive template wayback links
+ item-431 at level 4: list_item: Articles with Project Gutenberg links
+ item-432 at level 4: list_item: Articles containing video clips
+ item-433 at level 3: list: group list
+ item-434 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC).
+ item-435 at level 4: list_item: Text is available under the Crea ... tion, Inc., a non-profit organization.
+ item-436 at level 3: list: group list
+ item-437 at level 4: list_item: Privacy policy
+ item-438 at level 4: list_item: About Wikipedia
+ item-439 at level 4: list_item: Disclaimers
+ item-440 at level 4: list_item: Contact Wikipedia
+ item-441 at level 4: list_item: Code of Conduct
+ item-442 at level 4: list_item: Developers
+ item-443 at level 4: list_item: Statistics
+ item-444 at level 4: list_item: Cookie statement
+ item-445 at level 4: list_item: Mobile view
+ item-446 at level 3: list: group list
item-447 at level 3: list: group list
- item-448 at level 4: list_item:
- item-449 at level 4: list_item:
- item-450 at level 3: list: group list
- item-451 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
- item-452 at level 1: caption: Male mallard.
- item-453 at level 1: caption: Wood ducks.
- item-454 at level 1: caption: Mallard landing in approach
- item-455 at level 1: caption: Male Mandarin duck
- item-456 at level 1: caption: Flying steamer ducks in Ushuaia, Argentina
- item-457 at level 1: caption: Female mallard in Cornwall, England
- item-458 at level 1: caption: Pecten along the bill
- item-459 at level 1: caption: Mallard duckling preening
- item-460 at level 1: caption: A Muscovy duckling
- item-461 at level 1: caption: Ringed teal
- item-462 at level 1: caption: Indian Runner ducks, a common breed of domestic ducks
- item-463 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
\ No newline at end of file
+ item-448 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
+ item-449 at level 1: caption: Male mallard.
+ item-450 at level 1: caption: Wood ducks.
+ item-451 at level 1: caption: Mallard landing in approach
+ item-452 at level 1: caption: Male Mandarin duck
+ item-453 at level 1: caption: Flying steamer ducks in Ushuaia, Argentina
+ item-454 at level 1: caption: Female mallard in Cornwall, England
+ item-455 at level 1: caption: Pecten along the bill
+ item-456 at level 1: caption: Mallard duckling preening
+ item-457 at level 1: caption: A Muscovy duckling
+ item-458 at level 1: caption: Ringed teal
+ item-459 at level 1: caption: Indian Runner ducks, a common breed of domestic ducks
+ item-460 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.json b/tests/data/groundtruth/docling_v2/wiki_duck.html.json
index 196c903..e59c18f 100644
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.json
@@ -1413,9 +1413,6 @@
},
{
"$ref": "#/texts/350"
- },
- {
- "$ref": "#/texts/351"
}
],
"content_layer": "body",
@@ -1428,14 +1425,14 @@
"$ref": "#/texts/341"
},
"children": [
+ {
+ "$ref": "#/texts/351"
+ },
{
"$ref": "#/texts/352"
},
{
"$ref": "#/texts/353"
- },
- {
- "$ref": "#/texts/354"
}
],
"content_layer": "body",
@@ -1448,6 +1445,9 @@
"$ref": "#/texts/341"
},
"children": [
+ {
+ "$ref": "#/texts/354"
+ },
{
"$ref": "#/texts/355"
},
@@ -1522,9 +1522,6 @@
},
{
"$ref": "#/texts/379"
- },
- {
- "$ref": "#/texts/380"
}
],
"content_layer": "body",
@@ -1538,10 +1535,10 @@
},
"children": [
{
- "$ref": "#/texts/381"
+ "$ref": "#/texts/380"
},
{
- "$ref": "#/texts/382"
+ "$ref": "#/texts/381"
}
],
"content_layer": "body",
@@ -1554,6 +1551,9 @@
"$ref": "#/texts/341"
},
"children": [
+ {
+ "$ref": "#/texts/382"
+ },
{
"$ref": "#/texts/383"
},
@@ -1577,9 +1577,6 @@
},
{
"$ref": "#/texts/390"
- },
- {
- "$ref": "#/texts/391"
}
],
"content_layer": "body",
@@ -1591,14 +1588,7 @@
"parent": {
"$ref": "#/texts/341"
},
- "children": [
- {
- "$ref": "#/texts/392"
- },
- {
- "$ref": "#/texts/393"
- }
- ],
+ "children": [],
"content_layer": "body",
"name": "list",
"label": "list"
@@ -6774,27 +6764,13 @@
"content_layer": "body",
"label": "list_item",
"prov": [],
- "orig": "",
- "text": "",
- "enumerated": false,
- "marker": "-"
- },
- {
- "self_ref": "#/texts/351",
- "parent": {
- "$ref": "#/groups/42"
- },
- "children": [],
- "content_layer": "body",
- "label": "list_item",
- "prov": [],
"orig": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl",
"text": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl",
"enumerated": false,
"marker": "-"
},
{
- "self_ref": "#/texts/352",
+ "self_ref": "#/texts/351",
"parent": {
"$ref": "#/groups/43"
},
@@ -6808,7 +6784,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/353",
+ "self_ref": "#/texts/352",
"parent": {
"$ref": "#/groups/43"
},
@@ -6822,7 +6798,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/354",
+ "self_ref": "#/texts/353",
"parent": {
"$ref": "#/groups/43"
},
@@ -6836,7 +6812,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/355",
+ "self_ref": "#/texts/354",
"parent": {
"$ref": "#/groups/44"
},
@@ -6850,7 +6826,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/356",
+ "self_ref": "#/texts/355",
"parent": {
"$ref": "#/groups/44"
},
@@ -6864,7 +6840,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/357",
+ "self_ref": "#/texts/356",
"parent": {
"$ref": "#/groups/44"
},
@@ -6878,7 +6854,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/358",
+ "self_ref": "#/texts/357",
"parent": {
"$ref": "#/groups/44"
},
@@ -6892,7 +6868,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/359",
+ "self_ref": "#/texts/358",
"parent": {
"$ref": "#/groups/44"
},
@@ -6906,7 +6882,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/360",
+ "self_ref": "#/texts/359",
"parent": {
"$ref": "#/groups/44"
},
@@ -6920,7 +6896,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/361",
+ "self_ref": "#/texts/360",
"parent": {
"$ref": "#/groups/44"
},
@@ -6934,7 +6910,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/362",
+ "self_ref": "#/texts/361",
"parent": {
"$ref": "#/groups/44"
},
@@ -6948,7 +6924,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/363",
+ "self_ref": "#/texts/362",
"parent": {
"$ref": "#/groups/44"
},
@@ -6962,7 +6938,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/364",
+ "self_ref": "#/texts/363",
"parent": {
"$ref": "#/groups/44"
},
@@ -6976,7 +6952,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/365",
+ "self_ref": "#/texts/364",
"parent": {
"$ref": "#/groups/44"
},
@@ -6990,7 +6966,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/366",
+ "self_ref": "#/texts/365",
"parent": {
"$ref": "#/groups/44"
},
@@ -7004,7 +6980,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/367",
+ "self_ref": "#/texts/366",
"parent": {
"$ref": "#/groups/44"
},
@@ -7018,7 +6994,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/368",
+ "self_ref": "#/texts/367",
"parent": {
"$ref": "#/groups/44"
},
@@ -7032,7 +7008,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/369",
+ "self_ref": "#/texts/368",
"parent": {
"$ref": "#/groups/44"
},
@@ -7046,7 +7022,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/370",
+ "self_ref": "#/texts/369",
"parent": {
"$ref": "#/groups/44"
},
@@ -7060,7 +7036,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/371",
+ "self_ref": "#/texts/370",
"parent": {
"$ref": "#/groups/44"
},
@@ -7074,7 +7050,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/372",
+ "self_ref": "#/texts/371",
"parent": {
"$ref": "#/groups/44"
},
@@ -7088,7 +7064,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/373",
+ "self_ref": "#/texts/372",
"parent": {
"$ref": "#/groups/44"
},
@@ -7102,7 +7078,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/374",
+ "self_ref": "#/texts/373",
"parent": {
"$ref": "#/groups/44"
},
@@ -7116,7 +7092,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/375",
+ "self_ref": "#/texts/374",
"parent": {
"$ref": "#/groups/44"
},
@@ -7130,7 +7106,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/376",
+ "self_ref": "#/texts/375",
"parent": {
"$ref": "#/groups/44"
},
@@ -7144,7 +7120,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/377",
+ "self_ref": "#/texts/376",
"parent": {
"$ref": "#/groups/44"
},
@@ -7158,7 +7134,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/378",
+ "self_ref": "#/texts/377",
"parent": {
"$ref": "#/groups/44"
},
@@ -7172,7 +7148,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/379",
+ "self_ref": "#/texts/378",
"parent": {
"$ref": "#/groups/44"
},
@@ -7186,7 +7162,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/380",
+ "self_ref": "#/texts/379",
"parent": {
"$ref": "#/groups/44"
},
@@ -7200,7 +7176,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/381",
+ "self_ref": "#/texts/380",
"parent": {
"$ref": "#/groups/45"
},
@@ -7214,7 +7190,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/382",
+ "self_ref": "#/texts/381",
"parent": {
"$ref": "#/groups/45"
},
@@ -7228,7 +7204,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/383",
+ "self_ref": "#/texts/382",
"parent": {
"$ref": "#/groups/46"
},
@@ -7242,7 +7218,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/384",
+ "self_ref": "#/texts/383",
"parent": {
"$ref": "#/groups/46"
},
@@ -7256,7 +7232,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/385",
+ "self_ref": "#/texts/384",
"parent": {
"$ref": "#/groups/46"
},
@@ -7270,7 +7246,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/386",
+ "self_ref": "#/texts/385",
"parent": {
"$ref": "#/groups/46"
},
@@ -7284,7 +7260,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/387",
+ "self_ref": "#/texts/386",
"parent": {
"$ref": "#/groups/46"
},
@@ -7298,7 +7274,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/388",
+ "self_ref": "#/texts/387",
"parent": {
"$ref": "#/groups/46"
},
@@ -7312,7 +7288,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/389",
+ "self_ref": "#/texts/388",
"parent": {
"$ref": "#/groups/46"
},
@@ -7326,7 +7302,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/390",
+ "self_ref": "#/texts/389",
"parent": {
"$ref": "#/groups/46"
},
@@ -7340,7 +7316,7 @@
"marker": "-"
},
{
- "self_ref": "#/texts/391",
+ "self_ref": "#/texts/390",
"parent": {
"$ref": "#/groups/46"
},
@@ -7352,34 +7328,6 @@
"text": "Mobile view",
"enumerated": false,
"marker": "-"
- },
- {
- "self_ref": "#/texts/392",
- "parent": {
- "$ref": "#/groups/47"
- },
- "children": [],
- "content_layer": "body",
- "label": "list_item",
- "prov": [],
- "orig": "",
- "text": "",
- "enumerated": false,
- "marker": "-"
- },
- {
- "self_ref": "#/texts/393",
- "parent": {
- "$ref": "#/groups/47"
- },
- "children": [],
- "content_layer": "body",
- "label": "list_item",
- "prov": [],
- "orig": "",
- "text": "",
- "enumerated": false,
- "marker": "-"
}
],
"pictures": [
diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.md b/tests/data/groundtruth/docling_v2/wiki_duck.html.md
index df4554f..bd3f3c3 100644
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.md
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.md
@@ -473,7 +473,6 @@ The 1992 Disney film The Mighty Ducks, starring Emilio Estevez, chose the duck a
- list of books (useful looking abstracts)
- Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine
--
- Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl
| Authority control databases | Authority control databases |
@@ -526,7 +525,4 @@ additional terms may apply. By using this site, you agree to the Terms of Use an
- Developers
- Statistics
- Cookie statement
-- Mobile view
-
--
--
\ No newline at end of file
+- Mobile view
\ No newline at end of file
| |