
feat(html): Parse text in div elements as TextItem Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
485 lines
17 KiB
Python
485 lines
17 KiB
Python
import logging
|
|
from io import BytesIO
|
|
from pathlib import Path
|
|
from typing import Final, Optional, Union, cast
|
|
|
|
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
|
from bs4.element import PreformattedString
|
|
from docling_core.types.doc import (
|
|
DocItem,
|
|
DocItemLabel,
|
|
DoclingDocument,
|
|
DocumentOrigin,
|
|
GroupItem,
|
|
GroupLabel,
|
|
TableCell,
|
|
TableData,
|
|
)
|
|
from typing_extensions import override
|
|
|
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
|
from docling.datamodel.base_models import InputFormat
|
|
from docling.datamodel.document import InputDocument
|
|
|
|
_log = logging.getLogger(__name__)

# Tag names that generate NodeItem elements: the six heading levels followed
# by the block-level tags handled individually by the backend.
TAGS_FOR_NODE_ITEMS: Final = [
    *(f"h{depth}" for depth in range(1, 7)),
    "p",
    "pre",
    "ul",
    "ol",
    "li",
    "table",
    "figure",
    "img",
]
|
|
|
|
|
|
class HTMLDocumentBackend(DeclarativeDocumentBackend):
    """Backend that converts an HTML input into a ``DoclingDocument``.

    The HTML is parsed with BeautifulSoup's ``html.parser`` in ``__init__``.
    ``convert`` then walks the parsed tree and emits document items for
    headings, paragraphs, code blocks, lists, tables and pictures.

    The position in the output hierarchy is tracked with two attributes:
    ``self.level`` (the current depth) and ``self.parents`` (the item that
    acts as parent at each depth, or ``None``).
    """

    @override
    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Parse the HTML found in ``path_or_stream``.

        Args:
            in_doc: The input document descriptor, forwarded to the base class.
            path_or_stream: Either an in-memory byte stream or a file path
                holding the HTML content.

        Raises:
            RuntimeError: If reading or parsing the HTML raises any exception.
        """
        super().__init__(in_doc, path_or_stream)
        self.soup: Optional[Tag] = None
        # HTML file:
        self.path_or_stream = path_or_stream
        # Initialise the parents for the hierarchy
        self.max_levels = 10
        self.level = 0
        self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {
            depth: None for depth in range(self.max_levels)
        }

        try:
            if isinstance(self.path_or_stream, BytesIO):
                text_stream = self.path_or_stream.getvalue()
                self.soup = BeautifulSoup(text_stream, "html.parser")
            elif isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, "rb") as f:
                    html_content = f.read()
                    self.soup = BeautifulSoup(html_content, "html.parser")
        except Exception as e:
            raise RuntimeError(
                f"Could not initialize HTML backend for file with hash {self.document_hash}."
            ) from e

    @override
    def is_valid(self) -> bool:
        """Return True when the HTML was parsed successfully."""
        return self.soup is not None

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """Return False: this backend does not support pagination."""
        return False

    @override
    def unload(self):
        """Close the input stream (if it is a ``BytesIO``) and drop the reference."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the set of input formats handled by this backend (HTML only)."""
        return {InputFormat.HTML}

    @override
    def convert(self) -> DoclingDocument:
        """Convert the parsed HTML into a ``DoclingDocument``.

        Returns:
            The document built from the ``<body>`` of the HTML, or from the
            whole parsed tree when there is no ``<body>``.

        Raises:
            RuntimeError: If the backend failed to parse the HTML at init time.
        """
        # access self.path_or_stream to load stuff
        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="text/html",
            binary_hash=self.document_hash,
        )

        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
        _log.debug("Trying to convert HTML...")

        if not self.is_valid():
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )

        assert self.soup is not None
        content = self.soup.body or self.soup
        # Replace <br> tags with newline characters
        # TODO: remove style to avoid losing text from tags like i, b, span, ...
        for br in content("br"):
            br.replace_with(NavigableString("\n"))
        self.walk(content, doc)
        return doc

    def walk(self, tag: Tag, doc: DoclingDocument) -> None:
        """Process the direct children of ``tag`` in document order.

        Child tags are dispatched through ``analyze_tag``. Plain text nodes
        (excluding preformatted strings) are accumulated; the accumulated text
        is flushed when the text node is the last child, or when any following
        sibling tag is listed in ``TAGS_FOR_NODE_ITEMS``. On a flush, non-empty
        text is added as a paragraph only if ``tag`` is a ``<div>``; otherwise
        it is discarded.

        Args:
            tag: The tag whose children are visited.
            doc: The document receiving the generated items.

        Raises:
            Exception: Any exception raised while processing a child tag is
                logged and re-raised unchanged.
        """
        text: str = ""
        for element in tag.children:
            if isinstance(element, Tag):
                try:
                    self.analyze_tag(cast(Tag, element), doc)
                except Exception as exc_child:
                    _log.error(
                        f"Error processing child from tag {tag.name}: {exc_child}"
                    )
                    raise
            elif isinstance(element, NavigableString) and not isinstance(
                element, PreformattedString
            ):
                # Floating text outside paragraphs or analyzed tags
                text += element
                siblings: list[Tag] = [
                    item for item in element.next_siblings if isinstance(item, Tag)
                ]
                if element.next_sibling is None or any(
                    item.name in TAGS_FOR_NODE_ITEMS for item in siblings
                ):
                    text = text.strip()
                    if text and tag.name in ["div"]:
                        doc.add_text(
                            parent=self.parents[self.level],
                            label=DocItemLabel.PARAGRAPH,
                            text=text,
                        )
                    # The buffer is cleared on every flush, even when the
                    # text was not emitted.
                    text = ""

        return

    def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
        """Dispatch ``tag`` to its handler, or descend into it if unhandled."""
        if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            self.handle_header(tag, doc)
        elif tag.name in ["p"]:
            self.handle_paragraph(tag, doc)
        elif tag.name in ["pre"]:
            self.handle_code(tag, doc)
        elif tag.name in ["ul", "ol"]:
            self.handle_list(tag, doc)
        elif tag.name in ["li"]:
            self.handle_list_item(tag, doc)
        elif tag.name == "table":
            self.handle_table(tag, doc)
        elif tag.name == "figure":
            self.handle_figure(tag, doc)
        elif tag.name == "img":
            self.handle_image(doc)
        else:
            # Any other tag is treated as a container: visit its children.
            self.walk(tag, doc)

    def get_text(self, item: PageElement) -> str:
        """Get the text content of a tag.

        Text inside nested ``<ul>``/``<ol>`` elements is left out (see
        ``extract_text_recursively``). A single space is appended to the result.
        """
        parts: list[str] = self.extract_text_recursively(item)

        return "".join(parts) + " "

    # Function to recursively extract text from all child nodes
    def extract_text_recursively(self, item: PageElement) -> list[str]:
        """Collect the text below ``item``, skipping the content of lists.

        Returns:
            A one-element list. For a string node it holds the node itself;
            for a tag it holds the concatenated text of its children followed
            by a space. The children of ``<ul>``/``<ol>`` tags are not visited.
        """
        result: list[str] = []

        if isinstance(item, NavigableString):
            return [item]

        tag = cast(Tag, item)
        if tag.name not in ["ul", "ol"]:
            for child in tag:
                # Recursively get the child's text content
                result.extend(self.extract_text_recursively(child))

        return ["".join(result) + " "]

    def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
        """Handles header tags (h1, h2, etc.).

        An ``<h1>`` resets the whole hierarchy and is added as a title at
        level 1. Any other heading becomes a heading item at its own level;
        when it is deeper than the current level, section groups are created
        for the intermediate levels, and when it is shallower, the parents
        below it are cleared.
        """
        hlevel = int(element.name.replace("h", ""))
        text = element.text.strip()

        if hlevel == 1:
            for key in self.parents.keys():
                self.parents[key] = None

            self.level = 1
            self.parents[self.level] = doc.add_text(
                parent=self.parents[0], label=DocItemLabel.TITLE, text=text
            )
            return

        if hlevel > self.level:
            # add invisible group
            for i in range(self.level + 1, hlevel):
                self.parents[i] = doc.add_group(
                    name=f"header-{i}",
                    label=GroupLabel.SECTION,
                    parent=self.parents[i - 1],
                )
            self.level = hlevel

        elif hlevel < self.level:
            # remove the tail
            for key in self.parents.keys():
                if key > hlevel:
                    self.parents[key] = None
            self.level = hlevel

        self.parents[hlevel] = doc.add_heading(
            parent=self.parents[hlevel - 1],
            text=text,
            level=hlevel,
        )

    def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
        """Handles monospace code snippets (pre).

        The stripped text is added as a code item; empty snippets are ignored.
        """
        if element.text is None:
            return
        text = element.text.strip()
        if text:
            doc.add_code(parent=self.parents[self.level], text=text)

    def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
        """Handles paragraph tags (p).

        The stripped text is added as a paragraph; empty paragraphs are ignored.
        """
        if element.text is None:
            return
        text = element.text.strip()
        label = DocItemLabel.PARAGRAPH
        if text:
            doc.add_text(parent=self.parents[self.level], label=label, text=text)

    def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
        """Handles list tags (ul, ol) and their list items.

        A list group is created one level below the current one, the list's
        children are walked at that deeper level, and the level is restored
        afterwards.
        """
        if element.name == "ul":
            # create a list group
            self.parents[self.level + 1] = doc.add_group(
                parent=self.parents[self.level], name="list", label=GroupLabel.LIST
            )
        elif element.name == "ol":
            # create a list group
            self.parents[self.level + 1] = doc.add_group(
                parent=self.parents[self.level],
                name="ordered list",
                label=GroupLabel.ORDERED_LIST,
            )
        self.level += 1

        self.walk(element, doc)

        self.parents[self.level + 1] = None
        self.level -= 1

    def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
        """Handles listitem tags (li).

        Items without a parent in the document are skipped with a warning.
        An item containing a nested list is added with its own text (nested
        list text excluded) and then walked one level deeper so the nested
        list hangs below it. If such an item has no text of its own, it is
        walked at the current level so the nested list is still converted.
        """
        nested_list = element.find(["ul", "ol"])

        parent = self.parents[self.level]
        if parent is None:
            _log.warning(f"list-item has no parent in DoclingDocument: {element}")
            return
        parent_label: str = parent.label
        index_in_list = len(parent.children) + 1
        enumerated = parent_label == GroupLabel.ORDERED_LIST

        if nested_list:
            # Text in list item can be hidden within hierarchy, hence
            # we need to extract it recursively
            text: str = self.get_text(element)
            # Flatten text, remove break lines:
            text = text.replace("\n", "").replace("\r", "")
            text = " ".join(text.split()).strip()

            # NOTE(review): this marker has no trailing "." whereas plain list
            # items below use "<index>." -- confirm whether that is intended.
            marker = str(index_in_list) if enumerated else ""

            if len(text) > 0:
                # create a list-item
                self.parents[self.level + 1] = doc.add_list_item(
                    text=text,
                    enumerated=enumerated,
                    marker=marker,
                    parent=parent,
                )
                self.level += 1

                self.walk(element, doc)

                self.parents[self.level + 1] = None
                self.level -= 1
            else:
                # No text of its own: still descend, otherwise the nested
                # list and everything inside it would be lost.
                self.walk(element, doc)

        elif element.text.strip():
            text = element.text.strip()

            marker = f"{str(index_in_list)}." if enumerated else ""
            doc.add_list_item(
                text=text,
                enumerated=enumerated,
                marker=marker,
                parent=parent,
            )
        else:
            _log.warning(f"list-item has no text: {element}")

    @staticmethod
    def _parse_span(cell: Tag, attribute: str) -> int:
        """Return a ``colspan``/``rowspan`` attribute of ``cell`` as an int >= 1.

        Falls back to 1 when the attribute is missing, is not a plain decimal
        string, or is smaller than 1.
        """
        raw = cell.get(attribute, "1")
        # ``isdecimal`` only accepts characters that ``int`` can convert;
        # ``isnumeric`` also accepts e.g. "²", on which ``int`` raises.
        if isinstance(raw, str) and raw.isdecimal():
            return max(int(raw), 1)
        return 1

    @staticmethod
    def parse_table_data(element: Tag) -> Optional[TableData]:
        """Build the ``TableData`` description of a ``<table>`` tag.

        Cells are placed row by row into the first free column, taking into
        account the columns already covered by cells spanning down from
        previous rows. The number of columns is the right-most column reached
        by any cell. A ``rowspan`` reaching past the last ``<tr>`` is clipped
        to the remaining rows.

        A row made only of ``<th>`` cells marks its cells as column headers;
        a ``<th>`` in any other row is marked as a row header.

        Args:
            element: The ``<table>`` tag to parse.

        Returns:
            The parsed table, or ``None`` if the table contains a nested table
            (which is skipped with a warning).
        """
        nested_tables = element.find("table")
        if nested_tables is not None:
            _log.warning("Skipping nested table.")
            return None

        rows = [row for row in element("tr") if isinstance(row, Tag)]
        num_rows = len(rows)
        num_cols = 0

        # Grid positions (row, column) already covered by a placed cell.
        occupied: set[tuple[int, int]] = set()
        table_cells: list[TableCell] = []

        # Iterate over the rows in the table
        for row_idx, row in enumerate(rows):
            # For each row, find all the column cells (both <td> and <th>)
            cells = [cell for cell in row(["td", "th"]) if isinstance(cell, Tag)]

            # Check if each cell in the row is a header -> means it is a column header
            col_header = not any(cell.name == "td" for cell in cells)

            # Extract the text content of each cell
            col_idx = 0
            for html_cell in cells:
                # extract inline formulas
                for formula in html_cell("inline-formula"):
                    math_parts = formula.text.split("$$")
                    if len(math_parts) == 3:
                        math_formula = f"$${math_parts[1]}$$"
                        formula.replace_with(NavigableString(math_formula))

                # TODO: extract content correctly from table-cells with lists
                text = html_cell.text

                col_span = HTMLDocumentBackend._parse_span(html_cell, "colspan")
                # A cell cannot extend below the last row of the table.
                row_span = min(
                    HTMLDocumentBackend._parse_span(html_cell, "rowspan"),
                    num_rows - row_idx,
                )

                # Skip the columns taken by previous cells of this row or by
                # cells spanning down from the rows above.
                while (row_idx, col_idx) in occupied:
                    col_idx += 1
                for r in range(row_span):
                    for c in range(col_span):
                        occupied.add((row_idx + r, col_idx + c))
                num_cols = max(num_cols, col_idx + col_span)

                table_cells.append(
                    TableCell(
                        text=text,
                        row_span=row_span,
                        col_span=col_span,
                        start_row_offset_idx=row_idx,
                        end_row_offset_idx=row_idx + row_span,
                        start_col_offset_idx=col_idx,
                        end_col_offset_idx=col_idx + col_span,
                        col_header=col_header,
                        row_header=((not col_header) and html_cell.name == "th"),
                    )
                )

        return TableData(num_rows=num_rows, num_cols=num_cols, table_cells=table_cells)

    def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
        """Handles table tags.

        Tables that ``parse_table_data`` rejects (it returns ``None``) are not
        added to the document.
        """
        table_data = HTMLDocumentBackend.parse_table_data(element)

        if table_data is not None:
            doc.add_table(data=table_data, parent=self.parents[self.level])

    def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
        """Recursively extract text from <ul> or <ol> with proper indentation.

        Args:
            list_element: The list tag; any tag other than ``<ul>``/``<ol>``
                yields an empty result.
            level: Nesting depth, used as the number of leading spaces.

        Returns:
            One line per direct ``<li>`` child, prefixed with ``"<n>."`` for
            ordered lists or ``"*"`` for unordered ones, each followed by the
            lines of the first nested list found inside that item.
        """
        result: list[str] = []
        if list_element.name not in ("ol", "ul"):
            return result

        ordered = list_element.name == "ol"
        bullet_char = "*"  # Default bullet character for unordered lists

        for i, li in enumerate(list_element("li", recursive=False), 1):
            if not isinstance(li, Tag):
                continue
            # Numbering for ordered lists, bullet points for unordered lists
            prefix = f"{i}." if ordered else bullet_char
            result.append(f"{' ' * level}{prefix} {li.get_text(strip=True)}")
            # Handle nested lists
            nested_list = li.find(["ul", "ol"])
            if isinstance(nested_list, Tag):
                result.extend(self.get_list_text(nested_list, level + 1))

        return result

    def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
        """Handles figure tags (figure).

        A picture is added to the document. If the figure contains a
        ``<figcaption>``, its text is added as a caption item and attached to
        the picture.
        """
        # Extract the image URI from the <img> tag
        # image_uri = root.xpath('//figure//img/@src')[0]

        contains_captions = element.find(["figcaption"])
        if not isinstance(contains_captions, Tag):
            doc.add_picture(parent=self.parents[self.level], caption=None)
            return

        texts = [item.text for item in contains_captions]

        fig_caption = doc.add_text(
            label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
        )
        doc.add_picture(
            parent=self.parents[self.level],
            caption=fig_caption,
        )

    def handle_image(self, doc: DoclingDocument) -> None:
        """Handles image tags (img) by adding a picture without caption."""
        doc.add_picture(parent=self.parents[self.level], caption=None)
|