From 2a1f8afe7e8d9d508aebcfd3998ee1625c938933 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Fri, 31 Jan 2025 09:54:17 +0100 Subject: [PATCH] fix: use new add_code in html backend and add more typing hints (#850) fix add_code in html backend and add more typing hints Signed-off-by: Michele Dolfi --- docling/backend/html_backend.py | 36 ++++++++++++++++----------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 3de333d..286dfbf 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1,9 +1,9 @@ import logging from io import BytesIO from pathlib import Path -from typing import Set, Union +from typing import Optional, Set, Union -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from docling_core.types.doc import ( DocItemLabel, DoclingDocument, @@ -24,7 +24,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) _log.debug("About to init HTML backend...") - self.soup = None + self.soup: Optional[Tag] = None # HTML file: self.path_or_stream = path_or_stream # Initialise the parents for the hierarchy @@ -89,7 +89,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): ) return doc - def walk(self, element, doc): + def walk(self, element: Tag, doc: DoclingDocument): try: # Iterate over elements in the body of the document for idx, element in enumerate(element.children): @@ -106,7 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return doc - def analyse_element(self, element, idx, doc): + def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument): """ if element.name!=None: _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})") @@ -136,7 +136,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): else: 
self.walk(element, doc) - def get_direct_text(self, item): + def get_direct_text(self, item: Tag): """Get the direct text of the <li> element (ignoring nested lists).""" text = item.find(string=True, recursive=False) if isinstance(text, str): @@ -145,7 +145,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return "" # Function to recursively extract text from all child nodes - def extract_text_recursively(self, item): + def extract_text_recursively(self, item: Tag): result = [] if isinstance(item, str): @@ -166,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return "".join(result) + " " - def handle_header(self, element, idx, doc): + def handle_header(self, element: Tag, idx: int, doc: DoclingDocument): """Handles header tags (h1, h2, etc.).""" hlevel = int(element.name.replace("h", "")) slevel = hlevel - 1 @@ -208,7 +208,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): level=hlevel, ) - def handle_code(self, element, idx, doc): + def handle_code(self, element: Tag, idx: int, doc: DoclingDocument): """Handles monospace code snippets (pre).""" if element.text is None: return @@ -216,9 +216,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): label = DocItemLabel.CODE if len(text) == 0: return - doc.add_code(parent=self.parents[self.level], label=label, text=text) + doc.add_code(parent=self.parents[self.level], text=text) - def handle_paragraph(self, element, idx, doc): + def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument): """Handles paragraph tags (p).""" if element.text is None: return @@ -228,7 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return doc.add_text(parent=self.parents[self.level], label=label, text=text) - def handle_list(self, element, idx, doc): + def handle_list(self, element: Tag, idx: int, doc: DoclingDocument): """Handles list tags (ul, ol) and their list items.""" if element.name == "ul": @@ -250,7 +250,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.parents[self.level + 1] = None self.level -= 1 - def handle_listitem(self, element, idx, doc): + def handle_listitem(self, 
element: Tag, idx: int, doc: DoclingDocument): """Handles listitem tags (li).""" nested_lists = element.find(["ul", "ol"]) @@ -304,7 +304,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): else: _log.warn("list-item has no text: ", element) - def handle_table(self, element, idx, doc): + def handle_table(self, element: Tag, idx: int, doc: DoclingDocument): """Handles table tags.""" nested_tables = element.find("table") @@ -377,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): doc.add_table(data=data, parent=self.parents[self.level]) - def get_list_text(self, list_element, level=0): + def get_list_text(self, list_element: Tag, level=0): """Recursively extract text from