From d727b04ad080df0b3811902059e0fe0539f7037e Mon Sep 17 00:00:00 2001 From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com> Date: Fri, 31 Jan 2025 14:52:24 +0100 Subject: [PATCH] feat(docx): Support of SDTs in docx backend (#853) Support of table of content containers in docx backend Signed-off-by: Maksym Lysak Co-authored-by: Maksym Lysak --- docling/backend/msword_backend.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 32e69b9..02f8c86 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -137,6 +137,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): namespaces = { "a": "http://schemas.openxmlformats.org/drawingml/2006/main", "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", } xpath_expr = XPath(".//a:blip", namespaces=namespaces) drawing_blip = xpath_expr(element) @@ -150,6 +151,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): elif drawing_blip: self.handle_pictures(element, docx_obj, drawing_blip, doc) + # Check for the sdt containers, like table of contents + elif tag_name in ["sdt"]: + sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) + if sdt_content is not None: + # Iterate paragraphs, runs, or text inside <w:sdtContent>. + paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces) + for p in paragraphs: + self.handle_text_elements(p, docx_obj, doc) # Check for Text elif tag_name in ["p"]: # "tcPr", "sectPr"