From 7d19418b779408c345473af684de6b7f60872b6e Mon Sep 17 00:00:00 2001 From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com> Date: Fri, 25 Oct 2024 20:14:04 +0200 Subject: [PATCH] fix: HTML backend, fixes for Lists and nested texts (#180) * Fixes for HTML backend Signed-off-by: Maksym Lysak * removed prints Signed-off-by: Maksym Lysak * cleaning up Signed-off-by: Maksym Lysak --------- Signed-off-by: Maksym Lysak Co-authored-by: Maksym Lysak --- docling/backend/html_backend.py | 52 ++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 7bae346..b802605 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -136,7 +136,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def get_direct_text(self, item): """Get the direct text of the
<li> element (ignoring nested lists).""" text = item.find(string=True, recursive=False) - if isinstance(text, str): return text.strip() @@ -149,21 +148,20 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if isinstance(item, str): return [item] - result.append(self.get_direct_text(item)) + if item.name not in ["ul", "ol"]: + try: + # Iterate over the children (and their text and tails) + for child in item: + try: + # Recursively get the child's text content + result.extend(self.extract_text_recursively(child)) + except: + pass + except: + _log.warn("item has no children") + pass - try: - # Iterate over the children (and their text and tails) - for child in item: - try: - # Recursively get the child's text content - result.extend(self.extract_text_recursively(child)) - except: - pass - except: - _log.warn("item has no children") - pass - - return " ".join(result) + return "".join(result) + " " def handle_header(self, element, idx, doc): """Handles header tags (h1, h2, etc.).""" @@ -255,7 +253,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if nested_lists: name = element.name - text = self.get_direct_text(element) + # Text in list item can be hidden within hierarchy, hence + # we need to extract it recursively + text = self.extract_text_recursively(element) + # Flatten text, remove break lines: + text = text.replace("\n", "").replace("\r", "") + text = " ".join(text.split()).strip() marker = "" enumerated = False @@ -263,14 +266,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): marker = str(index_in_list) enumerated = True - # create a list-item - self.parents[self.level + 1] = doc.add_list_item( - text=text, - enumerated=enumerated, - marker=marker, - parent=self.parents[self.level], - ) - self.level += 1 + if len(text) > 0: + # create a list-item + self.parents[self.level + 1] = doc.add_list_item( + text=text, + enumerated=enumerated, + marker=marker, + parent=self.parents[self.level], + ) + self.level += 1 self.walk(element, doc)