From 53bf2d179097c78968083bd7bbc1f1fddc897272 Mon Sep 17 00:00:00 2001 From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:00:11 +0100 Subject: [PATCH] Added handling of code blocks in html with <pre>
tag (#302) Signed-off-by: Maksym Lysak Co-authored-by: Maksym Lysak --- docling/backend/html_backend.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 7d14c2e..9cd1e29 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -120,6 +120,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.handle_header(element, idx, doc) elif element.name in ["p"]: self.handle_paragraph(element, idx, doc) + elif element.name in ["pre"]: + self.handle_code(element, idx, doc) elif element.name in ["ul", "ol"]: self.handle_list(element, idx, doc) elif element.name in ["li"]: @@ -205,6 +207,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): level=hlevel, ) + def handle_code(self, element, idx, doc): + """Handles monospace code snippets (pre).""" + if element.text is None: + return + text = element.text.strip() + label = DocItemLabel.CODE + if len(text) == 0: + return + doc.add_text(parent=self.parents[self.level], label=label, text=text) + def handle_paragraph(self, element, idx, doc): """Handles paragraph tags (p).""" if element.text is None: