diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index f232069..d14b422 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -15,6 +15,7 @@ from docling_core.types.doc import ( TableCell, TableData, ) +from docling_core.types.doc.document import ContentLayer from typing_extensions import override from docling.backend.abstract_backend import DeclarativeDocumentBackend @@ -66,7 +67,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.soup = BeautifulSoup(html_content, "html.parser") except Exception as e: raise RuntimeError( - f"Could not initialize HTML backend for file with hash {self.document_hash}." + "Could not initialize HTML backend for file with " + f"hash {self.document_hash}." ) from e @override @@ -109,14 +111,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): # TODO: remove style to avoid losing text from tags like i, b, span, ... for br in content("br"): br.replace_with(NavigableString("\n")) + + headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"]) + self.content_layer = ( + ContentLayer.BODY if headers is None else ContentLayer.FURNITURE + ) self.walk(content, doc) else: raise RuntimeError( - f"Cannot convert doc with {self.document_hash} because the backend failed to init." + f"Cannot convert doc with {self.document_hash} because the backend " + "failed to init." ) return doc def walk(self, tag: Tag, doc: DoclingDocument) -> None: + # Iterate over elements in the body of the document text: str = "" for element in tag.children: @@ -143,8 +152,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if text and tag.name in ["div"]: doc.add_text( parent=self.parents[self.level], - label=DocItemLabel.PARAGRAPH, + label=DocItemLabel.TEXT, text=text, + content_layer=self.content_layer, ) text = "" @@ -166,7 +176,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): elif tag.name == "figure": self.handle_figure(tag, doc) elif tag.name == "img": - self.handle_image(doc) + self.handle_image(tag, doc) else: self.walk(tag, doc) @@ -197,12 +207,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): text = element.text.strip() if hlevel == 1: + self.content_layer = ContentLayer.BODY + for key in self.parents.keys(): self.parents[key] = None self.level = 1 self.parents[self.level] = doc.add_text( - parent=self.parents[0], label=DocItemLabel.TITLE, text=text + parent=self.parents[0], + label=DocItemLabel.TITLE, + text=text, + content_layer=self.content_layer, ) else: if hlevel > self.level: @@ -213,6 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): name=f"header-{i}", label=GroupLabel.SECTION, parent=self.parents[i - 1], + content_layer=self.content_layer, ) self.level = hlevel @@ -228,6 +244,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[hlevel - 1], text=text, level=hlevel, + content_layer=self.content_layer, ) def handle_code(self, element: Tag, doc: DoclingDocument) -> None: @@ -236,16 +253,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return text = element.text.strip() if text: - doc.add_code(parent=self.parents[self.level], text=text) + doc.add_code( + parent=self.parents[self.level], + text=text, + content_layer=self.content_layer, + ) def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None: """Handles paragraph tags (p).""" if element.text is None: return text = element.text.strip() - label = DocItemLabel.PARAGRAPH if text: - doc.add_text(parent=self.parents[self.level], label=label, text=text) + doc.add_text( + parent=self.parents[self.level], + label=DocItemLabel.TEXT, + text=text, + content_layer=self.content_layer, + ) def handle_list(self, element: Tag, doc: DoclingDocument) -> None: """Handles list tags (ul, ol) and their list items.""" @@ -253,7 +278,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if element.name == "ul": # create a list group self.parents[self.level + 1] = doc.add_group( - parent=self.parents[self.level], name="list", label=GroupLabel.LIST + parent=self.parents[self.level], + name="list", + label=GroupLabel.LIST, + content_layer=self.content_layer, ) elif element.name == "ol": start_attr = element.get("start") @@ -267,6 +295,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[self.level], name="ordered list" + (f" start {start}" if start != 1 else ""), label=GroupLabel.ORDERED_LIST, + content_layer=self.content_layer, ) self.level += 1 @@ -315,6 +344,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): enumerated=enumerated, marker=marker, parent=parent, + content_layer=self.content_layer, ) self.level += 1 @@ -336,6 +366,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): enumerated=enumerated, marker=marker, parent=parent, + content_layer=self.content_layer, ) else: _log.debug(f"list-item has no text: {element}") @@ -439,7 +470,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): table_data = HTMLDocumentBackend.parse_table_data(element) if table_data is not None: - doc.add_table(data=table_data, parent=self.parents[self.level]) + doc.add_table( + data=table_data, + parent=self.parents[self.level], + content_layer=self.content_layer, + ) def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]: """Recursively extract text from