refactor: upgrade BeautifulSoup4 with type hints (#999)

* refactor: upgrade BeautifulSoup4 with type hints

Upgrade dependency library BeautifulSoup4 to 4.13.3 (with type hints).
Refactor backends using BeautifulSoup4 to comply with type hints.
Apply style simplifications and improvements for consistency.
Remove variables and functions that are never used.
Remove code duplication between backends for parsing HTML tables.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* build: allow beautifulsoup4 version 4.12.3

Allow an older version of beautifulsoup4 and ensure compatibility.
Update library dependencies.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-02-18 11:30:47 +01:00 committed by GitHub
parent 75db61127c
commit 7450050ace
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 328 additions and 425 deletions

View File

@ -1,9 +1,9 @@
import logging import logging
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Optional, Set, Union from typing import Optional, Union, cast
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
@ -12,6 +12,7 @@ from docling_core.types.doc import (
TableCell, TableCell,
TableData, TableData,
) )
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
@ -21,6 +22,7 @@ _log = logging.getLogger(__name__)
class HTMLDocumentBackend(DeclarativeDocumentBackend): class HTMLDocumentBackend(DeclarativeDocumentBackend):
@override
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream) super().__init__(in_doc, path_or_stream)
_log.debug("About to init HTML backend...") _log.debug("About to init HTML backend...")
@ -48,13 +50,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
f"Could not initialize HTML backend for file with hash {self.document_hash}." f"Could not initialize HTML backend for file with hash {self.document_hash}."
) from e ) from e
@override
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.soup is not None return self.soup is not None
@classmethod @classmethod
@override
def supports_pagination(cls) -> bool: def supports_pagination(cls) -> bool:
return False return False
@override
def unload(self): def unload(self):
if isinstance(self.path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close() self.path_or_stream.close()
@ -62,9 +67,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.path_or_stream = None self.path_or_stream = None
@classmethod @classmethod
def supported_formats(cls) -> Set[InputFormat]: @override
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.HTML} return {InputFormat.HTML}
@override
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff # access self.path_or_stream to load stuff
origin = DocumentOrigin( origin = DocumentOrigin(
@ -80,98 +87,78 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
assert self.soup is not None assert self.soup is not None
content = self.soup.body or self.soup content = self.soup.body or self.soup
# Replace <br> tags with newline characters # Replace <br> tags with newline characters
for br in content.find_all("br"): for br in content("br"):
br.replace_with("\n") br.replace_with(NavigableString("\n"))
doc = self.walk(content, doc) self.walk(content, doc)
else: else:
raise RuntimeError( raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init." f"Cannot convert doc with {self.document_hash} because the backend failed to init."
) )
return doc return doc
def walk(self, element: Tag, doc: DoclingDocument): def walk(self, tag: Tag, doc: DoclingDocument) -> None:
try: # Iterate over elements in the body of the document
# Iterate over elements in the body of the document for element in tag.children:
for idx, element in enumerate(element.children): if isinstance(element, Tag):
try: try:
self.analyse_element(element, idx, doc) self.analyze_tag(cast(Tag, element), doc)
except Exception as exc_child: except Exception as exc_child:
_log.error(
_log.error(" -> error treating child: ", exc_child) f"Error processing child from tag{tag.name}: {exc_child}"
_log.error(" => element: ", element, "\n") )
raise exc_child raise exc_child
except Exception as exc: return
pass
return doc def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
if tag.name in self.labels:
def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument): self.labels[tag.name] += 1
"""
if element.name!=None:
_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
"""
if element.name in self.labels:
self.labels[element.name] += 1
else: else:
self.labels[element.name] = 1 self.labels[tag.name] = 1
if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
self.handle_header(element, idx, doc) self.handle_header(tag, doc)
elif element.name in ["p"]: elif tag.name in ["p"]:
self.handle_paragraph(element, idx, doc) self.handle_paragraph(tag, doc)
elif element.name in ["pre"]: elif tag.name in ["pre"]:
self.handle_code(element, idx, doc) self.handle_code(tag, doc)
elif element.name in ["ul", "ol"]: elif tag.name in ["ul", "ol"]:
self.handle_list(element, idx, doc) self.handle_list(tag, doc)
elif element.name in ["li"]: elif tag.name in ["li"]:
self.handle_listitem(element, idx, doc) self.handle_list_item(tag, doc)
elif element.name == "table": elif tag.name == "table":
self.handle_table(element, idx, doc) self.handle_table(tag, doc)
elif element.name == "figure": elif tag.name == "figure":
self.handle_figure(element, idx, doc) self.handle_figure(tag, doc)
elif element.name == "img": elif tag.name == "img":
self.handle_image(element, idx, doc) self.handle_image(doc)
else: else:
self.walk(element, doc) self.walk(tag, doc)
def get_direct_text(self, item: Tag): def get_text(self, item: PageElement) -> str:
"""Get the direct text of the <li> element (ignoring nested lists).""" """Get the text content of a tag."""
text = item.find(string=True, recursive=False) parts: list[str] = self.extract_text_recursively(item)
if isinstance(text, str):
return text.strip()
return "" return "".join(parts) + " "
# Function to recursively extract text from all child nodes # Function to recursively extract text from all child nodes
def extract_text_recursively(self, item: Tag): def extract_text_recursively(self, item: PageElement) -> list[str]:
result = [] result: list[str] = []
if isinstance(item, str): if isinstance(item, NavigableString):
return [item] return [item]
if item.name not in ["ul", "ol"]: tag = cast(Tag, item)
try: if tag.name not in ["ul", "ol"]:
# Iterate over the children (and their text and tails) for child in tag:
for child in item: # Recursively get the child's text content
try: result.extend(self.extract_text_recursively(child))
# Recursively get the child's text content
result.extend(self.extract_text_recursively(child))
except:
pass
except:
_log.warn("item has no children")
pass
return "".join(result) + " " return ["".join(result) + " "]
def handle_header(self, element: Tag, idx: int, doc: DoclingDocument): def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles header tags (h1, h2, etc.).""" """Handles header tags (h1, h2, etc.)."""
hlevel = int(element.name.replace("h", "")) hlevel = int(element.name.replace("h", ""))
slevel = hlevel - 1
label = DocItemLabel.SECTION_HEADER
text = element.text.strip() text = element.text.strip()
if hlevel == 1: if hlevel == 1:
@ -197,7 +184,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
elif hlevel < self.level: elif hlevel < self.level:
# remove the tail # remove the tail
for key, val in self.parents.items(): for key in self.parents.keys():
if key > hlevel: if key > hlevel:
self.parents[key] = None self.parents[key] = None
self.level = hlevel self.level = hlevel
@ -208,27 +195,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
level=hlevel, level=hlevel,
) )
def handle_code(self, element: Tag, idx: int, doc: DoclingDocument): def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles monospace code snippets (pre).""" """Handles monospace code snippets (pre)."""
if element.text is None: if element.text is None:
return return
text = element.text.strip() text = element.text.strip()
label = DocItemLabel.CODE if text:
if len(text) == 0: doc.add_code(parent=self.parents[self.level], text=text)
return
doc.add_code(parent=self.parents[self.level], text=text)
def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument): def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles paragraph tags (p).""" """Handles paragraph tags (p)."""
if element.text is None: if element.text is None:
return return
text = element.text.strip() text = element.text.strip()
label = DocItemLabel.PARAGRAPH label = DocItemLabel.PARAGRAPH
if len(text) == 0: if text:
return doc.add_text(parent=self.parents[self.level], label=label, text=text)
doc.add_text(parent=self.parents[self.level], label=label, text=text)
def handle_list(self, element: Tag, idx: int, doc: DoclingDocument): def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles list tags (ul, ol) and their list items.""" """Handles list tags (ul, ol) and their list items."""
if element.name == "ul": if element.name == "ul":
@ -250,18 +234,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.parents[self.level + 1] = None self.parents[self.level + 1] = None
self.level -= 1 self.level -= 1
def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument): def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles listitem tags (li).""" """Handles listitem tags (li)."""
nested_lists = element.find(["ul", "ol"]) nested_list = element.find(["ul", "ol"])
parent_list_label = self.parents[self.level].label parent_list_label = self.parents[self.level].label
index_in_list = len(self.parents[self.level].children) + 1 index_in_list = len(self.parents[self.level].children) + 1
if nested_lists: if nested_list:
name = element.name
# Text in list item can be hidden within hierarchy, hence # Text in list item can be hidden within hierarchy, hence
# we need to extract it recursively # we need to extract it recursively
text = self.extract_text_recursively(element) text: str = self.get_text(element)
# Flatten text, remove break lines: # Flatten text, remove break lines:
text = text.replace("\n", "").replace("\r", "") text = text.replace("\n", "").replace("\r", "")
text = " ".join(text.split()).strip() text = " ".join(text.split()).strip()
@ -287,7 +270,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.parents[self.level + 1] = None self.parents[self.level + 1] = None
self.level -= 1 self.level -= 1
elif isinstance(element.text, str): elif element.text.strip():
text = element.text.strip() text = element.text.strip()
marker = "" marker = ""
@ -302,59 +285,79 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[self.level], parent=self.parents[self.level],
) )
else: else:
_log.warn("list-item has no text: ", element) _log.warning(f"list-item has no text: {element}")
def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
"""Handles table tags."""
@staticmethod
def parse_table_data(element: Tag) -> Optional[TableData]:
nested_tables = element.find("table") nested_tables = element.find("table")
if nested_tables is not None: if nested_tables is not None:
_log.warn("detected nested tables: skipping for now") _log.warning("Skipping nested table.")
return return None
# Count the number of rows (number of <tr> elements) # Count the number of rows (number of <tr> elements)
num_rows = len(element.find_all("tr")) num_rows = len(element("tr"))
# Find the number of columns (taking into account colspan) # Find the number of columns (taking into account colspan)
num_cols = 0 num_cols = 0
for row in element.find_all("tr"): for row in element("tr"):
col_count = 0 col_count = 0
for cell in row.find_all(["td", "th"]): if not isinstance(row, Tag):
colspan = int(cell.get("colspan", 1)) continue
for cell in row(["td", "th"]):
if not isinstance(row, Tag):
continue
val = cast(Tag, cell).get("colspan", "1")
colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
col_count += colspan col_count += colspan
num_cols = max(num_cols, col_count) num_cols = max(num_cols, col_count)
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)] grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[]) data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
# Iterate over the rows in the table # Iterate over the rows in the table
for row_idx, row in enumerate(element.find_all("tr")): for row_idx, row in enumerate(element("tr")):
if not isinstance(row, Tag):
continue
# For each row, find all the column cells (both <td> and <th>) # For each row, find all the column cells (both <td> and <th>)
cells = row.find_all(["td", "th"]) cells = row(["td", "th"])
# Check if each cell in the row is a header -> means it is a column header # Check if each cell in the row is a header -> means it is a column header
col_header = True col_header = True
for j, html_cell in enumerate(cells): for html_cell in cells:
if html_cell.name == "td": if isinstance(html_cell, Tag) and html_cell.name == "td":
col_header = False col_header = False
# Extract the text content of each cell
col_idx = 0 col_idx = 0
# Extract and print the text content of each cell for html_cell in cells:
for _, html_cell in enumerate(cells): if not isinstance(html_cell, Tag):
continue
# extract inline formulas
for formula in html_cell("inline-formula"):
math_parts = formula.text.split("$$")
if len(math_parts) == 3:
math_formula = f"$${math_parts[1]}$$"
formula.replace_with(NavigableString(math_formula))
# TODO: extract content correctly from table-cells with lists
text = html_cell.text text = html_cell.text
try:
text = self.extract_table_cell_text(html_cell)
except Exception as exc:
_log.warn("exception: ", exc)
exit(-1)
# label = html_cell.name # label = html_cell.name
col_val = html_cell.get("colspan", "1")
col_span = int(html_cell.get("colspan", 1)) col_span = (
row_span = int(html_cell.get("rowspan", 1)) int(col_val)
if isinstance(col_val, str) and col_val.isnumeric()
else 1
)
row_val = html_cell.get("rowspan", "1")
row_span = (
int(row_val)
if isinstance(row_val, str) and row_val.isnumeric()
else 1
)
while grid[row_idx][col_idx] is not None: while grid[row_idx][col_idx] is not None:
col_idx += 1 col_idx += 1
@ -362,7 +365,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
for c in range(col_span): for c in range(col_span):
grid[row_idx + r][col_idx + c] = text grid[row_idx + r][col_idx + c] = text
cell = TableCell( table_cell = TableCell(
text=text, text=text,
row_span=row_span, row_span=row_span,
col_span=col_span, col_span=col_span,
@ -373,57 +376,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
col_header=col_header, col_header=col_header,
row_header=((not col_header) and html_cell.name == "th"), row_header=((not col_header) and html_cell.name == "th"),
) )
data.table_cells.append(cell) data.table_cells.append(table_cell)
doc.add_table(data=data, parent=self.parents[self.level]) return data
def get_list_text(self, list_element: Tag, level=0): def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles table tags."""
table_data = HTMLDocumentBackend.parse_table_data(element)
if table_data is not None:
doc.add_table(data=table_data, parent=self.parents[self.level])
def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
"""Recursively extract text from <ul> or <ol> with proper indentation.""" """Recursively extract text from <ul> or <ol> with proper indentation."""
result = [] result = []
bullet_char = "*" # Default bullet character for unordered lists bullet_char = "*" # Default bullet character for unordered lists
if list_element.name == "ol": # For ordered lists, use numbers if list_element.name == "ol": # For ordered lists, use numbers
for i, li in enumerate(list_element.find_all("li", recursive=False), 1): for i, li in enumerate(list_element("li", recursive=False), 1):
if not isinstance(li, Tag):
continue
# Add numbering for ordered lists # Add numbering for ordered lists
result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}") result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
# Handle nested lists # Handle nested lists
nested_list = li.find(["ul", "ol"]) nested_list = li.find(["ul", "ol"])
if nested_list: if isinstance(nested_list, Tag):
result.extend(self.get_list_text(nested_list, level + 1)) result.extend(self.get_list_text(nested_list, level + 1))
elif list_element.name == "ul": # For unordered lists, use bullet points elif list_element.name == "ul": # For unordered lists, use bullet points
for li in list_element.find_all("li", recursive=False): for li in list_element("li", recursive=False):
if not isinstance(li, Tag):
continue
# Add bullet points for unordered lists # Add bullet points for unordered lists
result.append( result.append(
f"{' ' * level}{bullet_char} {li.get_text(strip=True)}" f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
) )
# Handle nested lists # Handle nested lists
nested_list = li.find(["ul", "ol"]) nested_list = li.find(["ul", "ol"])
if nested_list: if isinstance(nested_list, Tag):
result.extend(self.get_list_text(nested_list, level + 1)) result.extend(self.get_list_text(nested_list, level + 1))
return result return result
def extract_table_cell_text(self, cell: Tag): def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
"""Extract text from a table cell, including lists with indents."""
contains_lists = cell.find(["ul", "ol"])
if contains_lists is None:
return cell.text
else:
_log.debug(
"should extract the content correctly for table-cells with lists ..."
)
return cell.text
def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
"""Handles image tags (img).""" """Handles image tags (img)."""
# Extract the image URI from the <img> tag # Extract the image URI from the <img> tag
# image_uri = root.xpath('//figure//img/@src')[0] # image_uri = root.xpath('//figure//img/@src')[0]
contains_captions = element.find(["figcaption"]) contains_captions = element.find(["figcaption"])
if contains_captions is None: if not isinstance(contains_captions, Tag):
doc.add_picture(parent=self.parents[self.level], caption=None) doc.add_picture(parent=self.parents[self.level], caption=None)
else: else:
texts = [] texts = []
for item in contains_captions: for item in contains_captions:
@ -437,6 +440,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
caption=fig_caption, caption=fig_caption,
) )
def handle_image(self, element: Tag, idx, doc: DoclingDocument): def handle_image(self, doc: DoclingDocument) -> None:
"""Handles image tags (img).""" """Handles image tags (img)."""
doc.add_picture(parent=self.parents[self.level], caption=None) doc.add_picture(parent=self.parents[self.level], caption=None)

View File

@ -4,7 +4,7 @@ from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Final, Optional, Union from typing import Final, Optional, Union
from bs4 import BeautifulSoup from bs4 import BeautifulSoup, Tag
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
@ -12,14 +12,13 @@ from docling_core.types.doc import (
GroupItem, GroupItem,
GroupLabel, GroupLabel,
NodeItem, NodeItem,
TableCell,
TableData,
TextItem, TextItem,
) )
from lxml import etree from lxml import etree
from typing_extensions import TypedDict, override from typing_extensions import TypedDict, override
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
@ -540,71 +539,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
) -> None: ) -> None:
soup = BeautifulSoup(table_xml_component["content"], "html.parser") soup = BeautifulSoup(table_xml_component["content"], "html.parser")
table_tag = soup.find("table") table_tag = soup.find("table")
if not isinstance(table_tag, Tag):
nested_tables = table_tag.find("table")
if nested_tables:
_log.warning(f"Skipping nested table in {str(self.file)}")
return return
# Count the number of rows (number of <tr> elements) data = HTMLDocumentBackend.parse_table_data(table_tag)
num_rows = len(table_tag.find_all("tr"))
# Find the number of columns (taking into account colspan)
num_cols = 0
for row in table_tag.find_all("tr"):
col_count = 0
for cell in row.find_all(["td", "th"]):
colspan = int(cell.get("colspan", 1))
col_count += colspan
num_cols = max(num_cols, col_count)
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
# Iterate over the rows in the table
for row_idx, row in enumerate(table_tag.find_all("tr")):
# For each row, find all the column cells (both <td> and <th>)
cells = row.find_all(["td", "th"])
# Check if each cell in the row is a header -> means it is a column header
col_header = True
for j, html_cell in enumerate(cells):
if html_cell.name == "td":
col_header = False
# Extract and print the text content of each cell
col_idx = 0
for _, html_cell in enumerate(cells):
# extract inline formulas
for formula in html_cell.find_all("inline-formula"):
math_parts = formula.text.split("$$")
if len(math_parts) == 3:
math_formula = f"$${math_parts[1]}$$"
formula.replaceWith(math_formula)
text = html_cell.text
col_span = int(html_cell.get("colspan", 1))
row_span = int(html_cell.get("rowspan", 1))
while grid[row_idx][col_idx] is not None:
col_idx += 1
for r in range(row_span):
for c in range(col_span):
grid[row_idx + r][col_idx + c] = text
cell = TableCell(
text=text,
row_span=row_span,
col_span=col_span,
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + row_span,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span,
col_header=col_header,
row_header=((not col_header) and html_cell.name == "th"),
)
data.table_cells.append(cell)
# TODO: format label vs caption once styling is supported # TODO: format label vs caption once styling is supported
label = table_xml_component["label"] label = table_xml_component["label"]
@ -616,7 +554,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
else None else None
) )
doc.add_table(data=data, parent=parent, caption=table_caption) if data is not None:
doc.add_table(data=data, parent=parent, caption=table_caption)
return return
@ -673,7 +612,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
def _walk_linear( def _walk_linear(
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> str: ) -> str:
# _log.debug(f"Walking on {node.tag} with {len(list(node))} children")
skip_tags = ["term"] skip_tags = ["term"]
flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"] flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
new_parent: NodeItem = parent new_parent: NodeItem = parent

View File

@ -14,7 +14,7 @@ from abc import ABC, abstractmethod
from enum import Enum, unique from enum import Enum, unique
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Any, Final, Optional, Union from typing import Final, Optional, Union
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from docling_core.types.doc import ( from docling_core.types.doc import (
@ -1406,6 +1406,10 @@ class XmlTable:
http://oasis-open.org/specs/soextblx.dtd http://oasis-open.org/specs/soextblx.dtd
""" """
class ColInfo(TypedDict):
ncols: int
colinfo: list[dict]
class MinColInfoType(TypedDict): class MinColInfoType(TypedDict):
offset: list[int] offset: list[int]
colwidth: list[int] colwidth: list[int]
@ -1425,7 +1429,7 @@ class XmlTable:
self.empty_text = "" self.empty_text = ""
self._soup = BeautifulSoup(input, features="xml") self._soup = BeautifulSoup(input, features="xml")
def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]: def _create_tg_range(self, tgs: list[ColInfo]) -> dict[int, ColInfoType]:
"""Create a unified range along the table groups. """Create a unified range along the table groups.
Args: Args:
@ -1532,19 +1536,26 @@ class XmlTable:
Returns: Returns:
A docling table object. A docling table object.
""" """
tgs_align = [] tgs_align: list[XmlTable.ColInfo] = []
tg_secs = table.find_all("tgroup") tg_secs = table("tgroup")
if tg_secs: if tg_secs:
for tg_sec in tg_secs: for tg_sec in tg_secs:
ncols = tg_sec.get("cols", None) if not isinstance(tg_sec, Tag):
if ncols: continue
ncols = int(ncols) col_val = tg_sec.get("cols")
tg_align = {"ncols": ncols, "colinfo": []} ncols = (
cs_secs = tg_sec.find_all("colspec") int(col_val)
if isinstance(col_val, str) and col_val.isnumeric()
else 1
)
tg_align: XmlTable.ColInfo = {"ncols": ncols, "colinfo": []}
cs_secs = tg_sec("colspec")
if cs_secs: if cs_secs:
for cs_sec in cs_secs: for cs_sec in cs_secs:
colname = cs_sec.get("colname", None) if not isinstance(cs_sec, Tag):
colwidth = cs_sec.get("colwidth", None) continue
colname = cs_sec.get("colname")
colwidth = cs_sec.get("colwidth")
tg_align["colinfo"].append( tg_align["colinfo"].append(
{"colname": colname, "colwidth": colwidth} {"colname": colname, "colwidth": colwidth}
) )
@ -1565,16 +1576,23 @@ class XmlTable:
table_data: list[TableCell] = [] table_data: list[TableCell] = []
i_row_global = 0 i_row_global = 0
is_row_empty: bool = True is_row_empty: bool = True
tg_secs = table.find_all("tgroup") tg_secs = table("tgroup")
if tg_secs: if tg_secs:
for itg, tg_sec in enumerate(tg_secs): for itg, tg_sec in enumerate(tg_secs):
if not isinstance(tg_sec, Tag):
continue
tg_range = tgs_range[itg] tg_range = tgs_range[itg]
row_secs = tg_sec.find_all(["row", "tr"]) row_secs = tg_sec(["row", "tr"])
if row_secs: if row_secs:
for row_sec in row_secs: for row_sec in row_secs:
entry_secs = row_sec.find_all(["entry", "td"]) if not isinstance(row_sec, Tag):
is_header: bool = row_sec.parent.name in ["thead"] continue
entry_secs = row_sec(["entry", "td"])
is_header: bool = (
row_sec.parent is not None
and row_sec.parent.name == "thead"
)
ncols = 0 ncols = 0
local_row: list[TableCell] = [] local_row: list[TableCell] = []
@ -1582,23 +1600,26 @@ class XmlTable:
if entry_secs: if entry_secs:
wrong_nbr_cols = False wrong_nbr_cols = False
for ientry, entry_sec in enumerate(entry_secs): for ientry, entry_sec in enumerate(entry_secs):
if not isinstance(entry_sec, Tag):
continue
text = entry_sec.get_text().strip() text = entry_sec.get_text().strip()
# start-end # start-end
namest = entry_sec.attrs.get("namest", None) namest = entry_sec.get("namest")
nameend = entry_sec.attrs.get("nameend", None) nameend = entry_sec.get("nameend")
if isinstance(namest, str) and namest.isnumeric(): start = (
namest = int(namest) int(namest)
else: if isinstance(namest, str) and namest.isnumeric()
namest = ientry + 1 else ientry + 1
)
if isinstance(nameend, str) and nameend.isnumeric(): if isinstance(nameend, str) and nameend.isnumeric():
nameend = int(nameend) end = int(nameend)
shift = 0 shift = 0
else: else:
nameend = ientry + 2 end = ientry + 2
shift = 1 shift = 1
if nameend > len(tg_range["cell_offst"]): if end > len(tg_range["cell_offst"]):
wrong_nbr_cols = True wrong_nbr_cols = True
self.nbr_messages += 1 self.nbr_messages += 1
if self.nbr_messages <= self.max_nbr_messages: if self.nbr_messages <= self.max_nbr_messages:
@ -1608,8 +1629,8 @@ class XmlTable:
break break
range_ = [ range_ = [
tg_range["cell_offst"][namest - 1], tg_range["cell_offst"][start - 1],
tg_range["cell_offst"][nameend - 1] - shift, tg_range["cell_offst"][end - 1] - shift,
] ]
# add row and replicate cell if needed # add row and replicate cell if needed
@ -1668,7 +1689,7 @@ class XmlTable:
A docling table data. A docling table data.
""" """
section = self._soup.find("table") section = self._soup.find("table")
if section is not None: if isinstance(section, Tag):
table = self._parse_table(section) table = self._parse_table(section)
if table.num_rows == 0 or table.num_cols == 0: if table.num_rows == 0 or table.num_cols == 0:
_log.warning("The parsed USPTO table is empty") _log.warning("The parsed USPTO table is empty")

41
poetry.lock generated
View File

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. # This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
[[package]] [[package]]
name = "aiohappyeyeballs" name = "aiohappyeyeballs"
@ -282,17 +282,18 @@ testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-ch
[[package]] [[package]]
name = "beautifulsoup4" name = "beautifulsoup4"
version = "4.12.3" version = "4.13.3"
description = "Screen-scraping library" description = "Screen-scraping library"
optional = false optional = false
python-versions = ">=3.6.0" python-versions = ">=3.7.0"
files = [ files = [
{file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"},
{file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"},
] ]
[package.dependencies] [package.dependencies]
soupsieve = ">1.2" soupsieve = ">1.2"
typing-extensions = ">=4.0.0"
[package.extras] [package.extras]
cchardet = ["cchardet"] cchardet = ["cchardet"]
@ -866,13 +867,13 @@ files = [
[[package]] [[package]]
name = "docling-core" name = "docling-core"
version = "2.19.0" version = "2.19.1"
description = "A python library to define and validate data types in Docling." description = "A python library to define and validate data types in Docling."
optional = false optional = false
python-versions = "<4.0,>=3.9" python-versions = "<4.0,>=3.9"
files = [ files = [
{file = "docling_core-2.19.0-py3-none-any.whl", hash = "sha256:caa1e13d98fa9a00608091c386609c75b3560c7291e842c252f0b6f8d5812dbd"}, {file = "docling_core-2.19.1-py3-none-any.whl", hash = "sha256:ca7bd4dacd75611c5ea4f205192b71a8f22205e615eff1a16aac7082644d3b2e"},
{file = "docling_core-2.19.0.tar.gz", hash = "sha256:ebf3062e31155bb5f0e6132056a2d239a0e6e693a75c5758886909bb9fef461a"}, {file = "docling_core-2.19.1.tar.gz", hash = "sha256:e2769b816c669cdf27024dd3b219d3ecaf2161691dd5e8e5e8ce439557ea0928"},
] ]
[package.dependencies] [package.dependencies]
@ -1357,13 +1358,13 @@ colorama = ">=0.4"
[[package]] [[package]]
name = "griffe-pydantic" name = "griffe-pydantic"
version = "1.1.0" version = "1.1.2"
description = "Griffe extension for Pydantic." description = "Griffe extension for Pydantic."
optional = false optional = false
python-versions = ">=3.9" python-versions = ">=3.9"
files = [ files = [
{file = "griffe_pydantic-1.1.0-py3-none-any.whl", hash = "sha256:ac9cc2d9b016cf302d8d9f577c9b3ca2793d88060f500d0b2a65f33a4a785cf1"}, {file = "griffe_pydantic-1.1.2-py3-none-any.whl", hash = "sha256:8ad53218ca6e9c24ccec83588eb435f562b30355f641fe336e81b1e00ea05f3c"},
{file = "griffe_pydantic-1.1.0.tar.gz", hash = "sha256:9c5a701cc485dab087857c1ac960b44671acee5008aaae0752f610b2aa82b068"}, {file = "griffe_pydantic-1.1.2.tar.gz", hash = "sha256:381eacd8854a85811522b4f6dc9a1ef0fb5931825081379d70ff3a425b0d4ea1"},
] ]
[package.dependencies] [package.dependencies]
@ -7052,18 +7053,18 @@ vision = ["Pillow (>=10.0.1,<=15.0)"]
[[package]] [[package]]
name = "transformers" name = "transformers"
version = "4.48.3" version = "4.49.0"
description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
optional = false optional = false
python-versions = ">=3.9.0" python-versions = ">=3.9.0"
files = [ files = [
{file = "transformers-4.48.3-py3-none-any.whl", hash = "sha256:78697f990f5ef350c23b46bf86d5081ce96b49479ab180b2de7687267de8fd36"}, {file = "transformers-4.49.0-py3-none-any.whl", hash = "sha256:6b4fded1c5fee04d384b1014495b4235a2b53c87503d7d592423c06128cbbe03"},
{file = "transformers-4.48.3.tar.gz", hash = "sha256:a5e8f1e9a6430aa78215836be70cecd3f872d99eeda300f41ad6cc841724afdb"}, {file = "transformers-4.49.0.tar.gz", hash = "sha256:7e40e640b5b8dc3f48743f5f5adbdce3660c82baafbd3afdfc04143cdbd2089e"},
] ]
[package.dependencies] [package.dependencies]
filelock = "*" filelock = "*"
huggingface-hub = ">=0.24.0,<1.0" huggingface-hub = ">=0.26.0,<1.0"
numpy = ">=1.17" numpy = ">=1.17"
packaging = ">=20.0" packaging = ">=20.0"
pyyaml = ">=5.1" pyyaml = ">=5.1"
@ -7076,13 +7077,13 @@ tqdm = ">=4.27"
[package.extras] [package.extras]
accelerate = ["accelerate (>=0.26.0)"] accelerate = ["accelerate (>=0.26.0)"]
agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=2.0)"] agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=2.0)"]
all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"] all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"]
audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
benchmark = ["optimum-benchmark (>=0.3.0)"] benchmark = ["optimum-benchmark (>=0.3.0)"]
codecarbon = ["codecarbon (>=2.8.1)"] codecarbon = ["codecarbon (>=2.8.1)"]
deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"] deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"]
deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] 
(>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "urllib3 (<2.0.0)"] dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "urllib3 (<2.0.0)"]
dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", 
"timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"] flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"]
@ -7115,8 +7116,8 @@ tokenizers = ["tokenizers (>=0.21,<0.22)"]
torch = ["accelerate (>=0.26.0)", "torch (>=2.0)"] torch = ["accelerate (>=0.26.0)", "torch (>=2.0)"]
torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"]
torchhub = ["filelock", "huggingface-hub (>=0.24.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "tqdm (>=4.27)"] torchhub = ["filelock", "huggingface-hub (>=0.26.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "tqdm (>=4.27)"]
video = ["av (==9.2.0)"] video = ["av"]
vision = ["Pillow (>=10.0.1,<=15.0)"] vision = ["Pillow (>=10.0.1,<=15.0)"]
[[package]] [[package]]
@ -7841,4 +7842,4 @@ vlm = ["transformers", "transformers"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.9" python-versions = "^3.9"
content-hash = "2cca8bac31dd535e36045cf2f5f0380852c34f6bafad78834144d6ca56d2d79c" content-hash = "63f9271160d39cac74fa3fc959dbb0f91530d76a693c69d81ced006477d04315"

View File

@ -46,7 +46,7 @@ scipy = [
typer = "^0.12.5" typer = "^0.12.5"
python-docx = "^1.1.2" python-docx = "^1.1.2"
python-pptx = "^1.0.2" python-pptx = "^1.0.2"
beautifulsoup4 = ">=4.12.3,<4.13.0" beautifulsoup4 = "^4.12.3"
pandas = "^2.1.4" pandas = "^2.1.4"
marko = "^2.1.2" marko = "^2.1.2"
openpyxl = "^3.1.5" openpyxl = "^3.1.5"
@ -166,7 +166,6 @@ module = [
"ocrmac.*", "ocrmac.*",
"deepsearch_glm.*", "deepsearch_glm.*",
"lxml.*", "lxml.*",
"bs4.*",
"huggingface_hub.*", "huggingface_hub.*",
"transformers.*", "transformers.*",
] ]

View File

@ -410,68 +410,65 @@ item-0 at level 0: unspecified: group _root_
item-396 at level 3: list: group list item-396 at level 3: list: group list
item-397 at level 4: list_item: list of books (useful looking abstracts) item-397 at level 4: list_item: list of books (useful looking abstracts)
item-398 at level 4: list_item: Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine item-398 at level 4: list_item: Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine
item-399 at level 4: list_item: item-399 at level 4: list_item: Ducks at a Distance, by Rob Hine ... uide to identification of US waterfowl
item-400 at level 4: list_item: Ducks at a Distance, by Rob Hine ... uide to identification of US waterfowl item-400 at level 3: table with [3x2]
item-401 at level 3: table with [3x2] item-401 at level 3: picture
item-402 at level 3: picture item-402 at level 3: list: group list
item-403 at level 3: list: group list item-403 at level 4: list_item: Ducks
item-404 at level 4: list_item: Ducks item-404 at level 4: list_item: Game birds
item-405 at level 4: list_item: Game birds item-405 at level 4: list_item: Bird common names
item-406 at level 4: list_item: Bird common names item-406 at level 3: list: group list
item-407 at level 3: list: group list item-407 at level 4: list_item: All accuracy disputes
item-408 at level 4: list_item: All accuracy disputes item-408 at level 4: list_item: Accuracy disputes from February 2020
item-409 at level 4: list_item: Accuracy disputes from February 2020 item-409 at level 4: list_item: CS1 Finnish-language sources (fi)
item-410 at level 4: list_item: CS1 Finnish-language sources (fi) item-410 at level 4: list_item: CS1 Latvian-language sources (lv)
item-411 at level 4: list_item: CS1 Latvian-language sources (lv) item-411 at level 4: list_item: CS1 Swedish-language sources (sv)
item-412 at level 4: list_item: CS1 Swedish-language sources (sv) item-412 at level 4: list_item: Articles with short description
item-413 at level 4: list_item: Articles with short description item-413 at level 4: list_item: Short description is different from Wikidata
item-414 at level 4: list_item: Short description is different from Wikidata item-414 at level 4: list_item: Wikipedia indefinitely move-protected pages
item-415 at level 4: list_item: Wikipedia indefinitely move-protected pages item-415 at level 4: list_item: Wikipedia indefinitely semi-protected pages
item-416 at level 4: list_item: Wikipedia indefinitely semi-protected pages item-416 at level 4: list_item: Articles with 'species' microformats
item-417 at level 4: list_item: Articles with 'species' microformats item-417 at level 4: list_item: Articles containing Old English (ca. 450-1100)-language text
item-418 at level 4: list_item: Articles containing Old English (ca. 450-1100)-language text item-418 at level 4: list_item: Articles containing Dutch-language text
item-419 at level 4: list_item: Articles containing Dutch-language text item-419 at level 4: list_item: Articles containing German-language text
item-420 at level 4: list_item: Articles containing German-language text item-420 at level 4: list_item: Articles containing Norwegian-language text
item-421 at level 4: list_item: Articles containing Norwegian-language text item-421 at level 4: list_item: Articles containing Lithuanian-language text
item-422 at level 4: list_item: Articles containing Lithuanian-language text item-422 at level 4: list_item: Articles containing Ancient Greek (to 1453)-language text
item-423 at level 4: list_item: Articles containing Ancient Greek (to 1453)-language text item-423 at level 4: list_item: All articles with self-published sources
item-424 at level 4: list_item: All articles with self-published sources item-424 at level 4: list_item: Articles with self-published sources from February 2020
item-425 at level 4: list_item: Articles with self-published sources from February 2020 item-425 at level 4: list_item: All articles with unsourced statements
item-426 at level 4: list_item: All articles with unsourced statements item-426 at level 4: list_item: Articles with unsourced statements from January 2022
item-427 at level 4: list_item: Articles with unsourced statements from January 2022 item-427 at level 4: list_item: CS1: long volume value
item-428 at level 4: list_item: CS1: long volume value item-428 at level 4: list_item: Pages using Sister project links with wikidata mismatch
item-429 at level 4: list_item: Pages using Sister project links with wikidata mismatch item-429 at level 4: list_item: Pages using Sister project links with hidden wikidata
item-430 at level 4: list_item: Pages using Sister project links with hidden wikidata item-430 at level 4: list_item: Webarchive template wayback links
item-431 at level 4: list_item: Webarchive template wayback links item-431 at level 4: list_item: Articles with Project Gutenberg links
item-432 at level 4: list_item: Articles with Project Gutenberg links item-432 at level 4: list_item: Articles containing video clips
item-433 at level 4: list_item: Articles containing video clips item-433 at level 3: list: group list
item-434 at level 3: list: group list item-434 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC).
item-435 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC). item-435 at level 4: list_item: Text is available under the Crea ... tion, Inc., a non-profit organization.
item-436 at level 4: list_item: Text is available under the Crea ... tion, Inc., a non-profit organization. item-436 at level 3: list: group list
item-437 at level 3: list: group list item-437 at level 4: list_item: Privacy policy
item-438 at level 4: list_item: Privacy policy item-438 at level 4: list_item: About Wikipedia
item-439 at level 4: list_item: About Wikipedia item-439 at level 4: list_item: Disclaimers
item-440 at level 4: list_item: Disclaimers item-440 at level 4: list_item: Contact Wikipedia
item-441 at level 4: list_item: Contact Wikipedia item-441 at level 4: list_item: Code of Conduct
item-442 at level 4: list_item: Code of Conduct item-442 at level 4: list_item: Developers
item-443 at level 4: list_item: Developers item-443 at level 4: list_item: Statistics
item-444 at level 4: list_item: Statistics item-444 at level 4: list_item: Cookie statement
item-445 at level 4: list_item: Cookie statement item-445 at level 4: list_item: Mobile view
item-446 at level 4: list_item: Mobile view item-446 at level 3: list: group list
item-447 at level 3: list: group list item-447 at level 3: list: group list
item-448 at level 4: list_item: item-448 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
item-449 at level 4: list_item: item-449 at level 1: caption: Male mallard.
item-450 at level 3: list: group list item-450 at level 1: caption: Wood ducks.
item-451 at level 1: caption: Pacific black duck displaying the characteristic upending "duck" item-451 at level 1: caption: Mallard landing in approach
item-452 at level 1: caption: Male mallard. item-452 at level 1: caption: Male Mandarin duck
item-453 at level 1: caption: Wood ducks. item-453 at level 1: caption: Flying steamer ducks in Ushuaia, Argentina
item-454 at level 1: caption: Mallard landing in approach item-454 at level 1: caption: Female mallard in Cornwall, England
item-455 at level 1: caption: Male Mandarin duck item-455 at level 1: caption: Pecten along the bill
item-456 at level 1: caption: Flying steamer ducks in Ushuaia, Argentina item-456 at level 1: caption: Mallard duckling preening
item-457 at level 1: caption: Female mallard in Cornwall, England item-457 at level 1: caption: A Muscovy duckling
item-458 at level 1: caption: Pecten along the bill item-458 at level 1: caption: Ringed teal
item-459 at level 1: caption: Mallard duckling preening item-459 at level 1: caption: Indian Runner ducks, a common breed of domestic ducks
item-460 at level 1: caption: A Muscovy duckling item-460 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
item-461 at level 1: caption: Ringed teal
item-462 at level 1: caption: Indian Runner ducks, a common breed of domestic ducks
item-463 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka[49]

View File

@ -1413,9 +1413,6 @@
}, },
{ {
"$ref": "#/texts/350" "$ref": "#/texts/350"
},
{
"$ref": "#/texts/351"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -1428,14 +1425,14 @@
"$ref": "#/texts/341" "$ref": "#/texts/341"
}, },
"children": [ "children": [
{
"$ref": "#/texts/351"
},
{ {
"$ref": "#/texts/352" "$ref": "#/texts/352"
}, },
{ {
"$ref": "#/texts/353" "$ref": "#/texts/353"
},
{
"$ref": "#/texts/354"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -1448,6 +1445,9 @@
"$ref": "#/texts/341" "$ref": "#/texts/341"
}, },
"children": [ "children": [
{
"$ref": "#/texts/354"
},
{ {
"$ref": "#/texts/355" "$ref": "#/texts/355"
}, },
@ -1522,9 +1522,6 @@
}, },
{ {
"$ref": "#/texts/379" "$ref": "#/texts/379"
},
{
"$ref": "#/texts/380"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -1538,10 +1535,10 @@
}, },
"children": [ "children": [
{ {
"$ref": "#/texts/381" "$ref": "#/texts/380"
}, },
{ {
"$ref": "#/texts/382" "$ref": "#/texts/381"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -1554,6 +1551,9 @@
"$ref": "#/texts/341" "$ref": "#/texts/341"
}, },
"children": [ "children": [
{
"$ref": "#/texts/382"
},
{ {
"$ref": "#/texts/383" "$ref": "#/texts/383"
}, },
@ -1577,9 +1577,6 @@
}, },
{ {
"$ref": "#/texts/390" "$ref": "#/texts/390"
},
{
"$ref": "#/texts/391"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -1591,14 +1588,7 @@
"parent": { "parent": {
"$ref": "#/texts/341" "$ref": "#/texts/341"
}, },
"children": [ "children": [],
{
"$ref": "#/texts/392"
},
{
"$ref": "#/texts/393"
}
],
"content_layer": "body", "content_layer": "body",
"name": "list", "name": "list",
"label": "list" "label": "list"
@ -6774,27 +6764,13 @@
"content_layer": "body", "content_layer": "body",
"label": "list_item", "label": "list_item",
"prov": [], "prov": [],
"orig": "",
"text": "",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/351",
"parent": {
"$ref": "#/groups/42"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl", "orig": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl",
"text": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl", "text": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl",
"enumerated": false, "enumerated": false,
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/352", "self_ref": "#/texts/351",
"parent": { "parent": {
"$ref": "#/groups/43" "$ref": "#/groups/43"
}, },
@ -6808,7 +6784,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/353", "self_ref": "#/texts/352",
"parent": { "parent": {
"$ref": "#/groups/43" "$ref": "#/groups/43"
}, },
@ -6822,7 +6798,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/354", "self_ref": "#/texts/353",
"parent": { "parent": {
"$ref": "#/groups/43" "$ref": "#/groups/43"
}, },
@ -6836,7 +6812,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/355", "self_ref": "#/texts/354",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -6850,7 +6826,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/356", "self_ref": "#/texts/355",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -6864,7 +6840,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/357", "self_ref": "#/texts/356",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -6878,7 +6854,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/358", "self_ref": "#/texts/357",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -6892,7 +6868,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/359", "self_ref": "#/texts/358",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -6906,7 +6882,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/360", "self_ref": "#/texts/359",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -6920,7 +6896,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/361", "self_ref": "#/texts/360",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -6934,7 +6910,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/362", "self_ref": "#/texts/361",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -6948,7 +6924,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/363", "self_ref": "#/texts/362",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -6962,7 +6938,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/364", "self_ref": "#/texts/363",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -6976,7 +6952,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/365", "self_ref": "#/texts/364",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -6990,7 +6966,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/366", "self_ref": "#/texts/365",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -7004,7 +6980,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/367", "self_ref": "#/texts/366",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -7018,7 +6994,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/368", "self_ref": "#/texts/367",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -7032,7 +7008,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/369", "self_ref": "#/texts/368",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -7046,7 +7022,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/370", "self_ref": "#/texts/369",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -7060,7 +7036,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/371", "self_ref": "#/texts/370",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -7074,7 +7050,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/372", "self_ref": "#/texts/371",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -7088,7 +7064,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/373", "self_ref": "#/texts/372",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -7102,7 +7078,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/374", "self_ref": "#/texts/373",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -7116,7 +7092,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/375", "self_ref": "#/texts/374",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -7130,7 +7106,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/376", "self_ref": "#/texts/375",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -7144,7 +7120,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/377", "self_ref": "#/texts/376",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -7158,7 +7134,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/378", "self_ref": "#/texts/377",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -7172,7 +7148,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/379", "self_ref": "#/texts/378",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -7186,7 +7162,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/380", "self_ref": "#/texts/379",
"parent": { "parent": {
"$ref": "#/groups/44" "$ref": "#/groups/44"
}, },
@ -7200,7 +7176,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/381", "self_ref": "#/texts/380",
"parent": { "parent": {
"$ref": "#/groups/45" "$ref": "#/groups/45"
}, },
@ -7214,7 +7190,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/382", "self_ref": "#/texts/381",
"parent": { "parent": {
"$ref": "#/groups/45" "$ref": "#/groups/45"
}, },
@ -7228,7 +7204,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/383", "self_ref": "#/texts/382",
"parent": { "parent": {
"$ref": "#/groups/46" "$ref": "#/groups/46"
}, },
@ -7242,7 +7218,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/384", "self_ref": "#/texts/383",
"parent": { "parent": {
"$ref": "#/groups/46" "$ref": "#/groups/46"
}, },
@ -7256,7 +7232,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/385", "self_ref": "#/texts/384",
"parent": { "parent": {
"$ref": "#/groups/46" "$ref": "#/groups/46"
}, },
@ -7270,7 +7246,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/386", "self_ref": "#/texts/385",
"parent": { "parent": {
"$ref": "#/groups/46" "$ref": "#/groups/46"
}, },
@ -7284,7 +7260,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/387", "self_ref": "#/texts/386",
"parent": { "parent": {
"$ref": "#/groups/46" "$ref": "#/groups/46"
}, },
@ -7298,7 +7274,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/388", "self_ref": "#/texts/387",
"parent": { "parent": {
"$ref": "#/groups/46" "$ref": "#/groups/46"
}, },
@ -7312,7 +7288,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/389", "self_ref": "#/texts/388",
"parent": { "parent": {
"$ref": "#/groups/46" "$ref": "#/groups/46"
}, },
@ -7326,7 +7302,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/390", "self_ref": "#/texts/389",
"parent": { "parent": {
"$ref": "#/groups/46" "$ref": "#/groups/46"
}, },
@ -7340,7 +7316,7 @@
"marker": "-" "marker": "-"
}, },
{ {
"self_ref": "#/texts/391", "self_ref": "#/texts/390",
"parent": { "parent": {
"$ref": "#/groups/46" "$ref": "#/groups/46"
}, },
@ -7352,34 +7328,6 @@
"text": "Mobile view", "text": "Mobile view",
"enumerated": false, "enumerated": false,
"marker": "-" "marker": "-"
},
{
"self_ref": "#/texts/392",
"parent": {
"$ref": "#/groups/47"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "",
"text": "",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/393",
"parent": {
"$ref": "#/groups/47"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "",
"text": "",
"enumerated": false,
"marker": "-"
} }
], ],
"pictures": [ "pictures": [

View File

@ -473,7 +473,6 @@ The 1992 Disney film The Mighty Ducks, starring Emilio Estevez, chose the duck a
- list of books (useful looking abstracts) - list of books (useful looking abstracts)
- Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine - Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine
-
- Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl - Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl
| Authority control databases | Authority control databases | | Authority control databases | Authority control databases |
@ -526,7 +525,4 @@ additional terms may apply. By using this site, you agree to the Terms of Use an
- Developers - Developers
- Statistics - Statistics
- Cookie statement - Cookie statement
- Mobile view - Mobile view
-
-