fix: use new add_code in html backend and add more typing hints (#850)
fix add_code in html backend and add more typing hints Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
4df085aa6c
commit
2a1f8afe7e
@ -1,9 +1,9 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
from typing import Optional, Set, Union
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
@ -24,7 +24,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
_log.debug("About to init HTML backend...")
|
||||
self.soup = None
|
||||
self.soup: Optional[Tag] = None
|
||||
# HTML file:
|
||||
self.path_or_stream = path_or_stream
|
||||
# Initialise the parents for the hierarchy
|
||||
@ -89,7 +89,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
return doc
|
||||
|
||||
def walk(self, element, doc):
|
||||
def walk(self, element: Tag, doc: DoclingDocument):
|
||||
try:
|
||||
# Iterate over elements in the body of the document
|
||||
for idx, element in enumerate(element.children):
|
||||
@ -106,7 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
return doc
|
||||
|
||||
def analyse_element(self, element, idx, doc):
|
||||
def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
|
||||
"""
|
||||
if element.name!=None:
|
||||
_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
|
||||
@ -136,7 +136,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
self.walk(element, doc)
|
||||
|
||||
def get_direct_text(self, item):
|
||||
def get_direct_text(self, item: Tag):
|
||||
"""Get the direct text of the <li> element (ignoring nested lists)."""
|
||||
text = item.find(string=True, recursive=False)
|
||||
if isinstance(text, str):
|
||||
@ -145,7 +145,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
return ""
|
||||
|
||||
# Function to recursively extract text from all child nodes
|
||||
def extract_text_recursively(self, item):
|
||||
def extract_text_recursively(self, item: Tag):
|
||||
result = []
|
||||
|
||||
if isinstance(item, str):
|
||||
@ -166,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
return "".join(result) + " "
|
||||
|
||||
def handle_header(self, element, idx, doc):
|
||||
def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
|
||||
"""Handles header tags (h1, h2, etc.)."""
|
||||
hlevel = int(element.name.replace("h", ""))
|
||||
slevel = hlevel - 1
|
||||
@ -208,7 +208,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
level=hlevel,
|
||||
)
|
||||
|
||||
def handle_code(self, element, idx, doc):
|
||||
def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
|
||||
"""Handles monospace code snippets (pre)."""
|
||||
if element.text is None:
|
||||
return
|
||||
@ -216,9 +216,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
label = DocItemLabel.CODE
|
||||
if len(text) == 0:
|
||||
return
|
||||
doc.add_code(parent=self.parents[self.level], label=label, text=text)
|
||||
doc.add_code(parent=self.parents[self.level], text=text)
|
||||
|
||||
def handle_paragraph(self, element, idx, doc):
|
||||
def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
|
||||
"""Handles paragraph tags (p)."""
|
||||
if element.text is None:
|
||||
return
|
||||
@ -228,7 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
return
|
||||
doc.add_text(parent=self.parents[self.level], label=label, text=text)
|
||||
|
||||
def handle_list(self, element, idx, doc):
|
||||
def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
|
||||
"""Handles list tags (ul, ol) and their list items."""
|
||||
|
||||
if element.name == "ul":
|
||||
@ -250,7 +250,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.parents[self.level + 1] = None
|
||||
self.level -= 1
|
||||
|
||||
def handle_listitem(self, element, idx, doc):
|
||||
def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
|
||||
"""Handles listitem tags (li)."""
|
||||
nested_lists = element.find(["ul", "ol"])
|
||||
|
||||
@ -304,7 +304,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
_log.warn("list-item has no text: ", element)
|
||||
|
||||
def handle_table(self, element, idx, doc):
|
||||
def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
|
||||
"""Handles table tags."""
|
||||
|
||||
nested_tables = element.find("table")
|
||||
@ -377,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
doc.add_table(data=data, parent=self.parents[self.level])
|
||||
|
||||
def get_list_text(self, list_element, level=0):
|
||||
def get_list_text(self, list_element: Tag, level=0):
|
||||
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
||||
result = []
|
||||
bullet_char = "*" # Default bullet character for unordered lists
|
||||
@ -403,7 +403,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
return result
|
||||
|
||||
def extract_table_cell_text(self, cell):
|
||||
def extract_table_cell_text(self, cell: Tag):
|
||||
"""Extract text from a table cell, including lists with indents."""
|
||||
contains_lists = cell.find(["ul", "ol"])
|
||||
if contains_lists is None:
|
||||
@ -414,7 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
return cell.text
|
||||
|
||||
def handle_figure(self, element, idx, doc):
|
||||
def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
|
||||
"""Handles image tags (img)."""
|
||||
|
||||
# Extract the image URI from the <img> tag
|
||||
@ -437,6 +437,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
caption=fig_caption,
|
||||
)
|
||||
|
||||
def handle_image(self, element, idx, doc):
|
||||
def handle_image(self, element: Tag, idx, doc: DoclingDocument):
|
||||
"""Handles image tags (img)."""
|
||||
doc.add_picture(parent=self.parents[self.level], caption=None)
|
||||
|
Loading…
Reference in New Issue
Block a user