fix: HTML backend, fixes for Lists and nested texts (#180)

* Fixes for HTML backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* removed prints

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* cleaning up

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak 2024-10-25 20:14:04 +02:00 committed by GitHub
parent 88c1673057
commit 7d19418b77
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -136,7 +136,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def get_direct_text(self, item):
"""Get the direct text of the <li> element (ignoring nested lists)."""
text = item.find(string=True, recursive=False)
if isinstance(text, str):
return text.strip()
@ -149,21 +148,20 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if isinstance(item, str):
return [item]
result.append(self.get_direct_text(item))
if item.name not in ["ul", "ol"]:
try:
# Iterate over the children (and their text and tails)
for child in item:
try:
# Recursively get the child's text content
result.extend(self.extract_text_recursively(child))
except:
pass
except:
_log.warn("item has no children")
pass
try:
# Iterate over the children (and their text and tails)
for child in item:
try:
# Recursively get the child's text content
result.extend(self.extract_text_recursively(child))
except:
pass
except:
_log.warn("item has no children")
pass
return " ".join(result)
return "".join(result) + " "
def handle_header(self, element, idx, doc):
"""Handles header tags (h1, h2, etc.)."""
@ -255,7 +253,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if nested_lists:
name = element.name
text = self.get_direct_text(element)
# Text in list item can be hidden within hierarchy, hence
# we need to extract it recursively
text = self.extract_text_recursively(element)
# Flatten text, remove line breaks:
text = text.replace("\n", "").replace("\r", "")
text = " ".join(text.split()).strip()
marker = ""
enumerated = False
@ -263,14 +266,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
marker = str(index_in_list)
enumerated = True
# create a list-item
self.parents[self.level + 1] = doc.add_list_item(
text=text,
enumerated=enumerated,
marker=marker,
parent=self.parents[self.level],
)
self.level += 1
if len(text) > 0:
# create a list-item
self.parents[self.level + 1] = doc.add_list_item(
text=text,
enumerated=enumerated,
marker=marker,
parent=self.parents[self.level],
)
self.level += 1
self.walk(element, doc)