diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 44a0f2c..ec071ef 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -251,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self._handle_tables(element, docx_obj, doc) except Exception: _log.debug("could not parse a table, broken docx table") - + # Check for Image elif drawing_blip: self._handle_pictures(docx_obj, drawing_blip, doc) + # Check for Text after the Image + if ( + tag_name in ["p"] + and element.find(".//w:t", namespaces=namespaces) is not None + ): + self._handle_text_elements(element, docx_obj, doc) # Check for the sdt containers, like table of contents elif tag_name in ["sdt"]: sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) diff --git a/tests/data/docx/word_image_anchors.docx b/tests/data/docx/word_image_anchors.docx new file mode 100644 index 0000000..c0b030c Binary files /dev/null and b/tests/data/docx/word_image_anchors.docx differ diff --git a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt new file mode 100644 index 0000000..ebc5ceb --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt @@ -0,0 +1,16 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: paragraph: Transcript + item-2 at level 1: paragraph: February 20, 2025, 8:32PM + item-3 at level 1: picture + item-4 at level 1: inline: group group + item-5 at level 2: paragraph: This is test 1 + item-6 at level 2: paragraph: 0:08 +Correct, he is not. + item-7 at level 1: paragraph: + item-8 at level 1: picture + item-9 at level 1: inline: group group + item-10 at level 2: paragraph: This is test 2 + item-11 at level 2: paragraph: 0:16 +Yeah, exactly. + item-12 at level 1: paragraph: + item-13 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json new file mode 100644 index 0000000..b5433eb --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json @@ -0,0 +1,286 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.3.0", + "name": "word_image_anchors", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 2428692234257307633, + "filename": "word_image_anchors.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/pictures/0" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/pictures/1" + }, + { + "$ref": "#/groups/1" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Transcript", + "text": "Transcript", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "February 20, 2025, 8:32PM", + "text": "February 20, 2025, 8:32PM", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "This is test 1", + "text": "This is test 1", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "0:08\nCorrect, he is not.", + "text": "0:08\nCorrect, he is not.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "This is test 2", + "text": "This is test 2", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "0:16\nYeah, exactly.", + "text": "0:16\nYeah, exactly.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [ + { + "self_ref": "#/pictures/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "picture", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "image": { + "mimetype": "image/png", + "dpi": 72, + "size": { + "width": 100.0, + "height": 100.0 + }, + "uri": "" + }, + "annotations": [] + }, + { + "self_ref": "#/pictures/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "picture", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "image": { + "mimetype": "image/png", + "dpi": 72, + "size": { + "width": 100.0, + "height": 100.0 + }, + "uri": "" + }, + "annotations": [] + } + ], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.md b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.md new file mode 100644 index 0000000..a3ce2fc --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.md @@ -0,0 +1,13 @@ +**Transcript** + +February 20, 2025, 8:32PM + + + +**This is test 1** 0:08 +Correct, he is not. + + + +**This is test 2** 0:16 +Yeah, exactly. \ No newline at end of file diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index f37b487..61ddd2a 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -9,6 +9,7 @@ from docling.datamodel.document import ( DoclingDocument, InputDocument, SectionHeaderItem, + TextItem, ) from docling.document_converter import DocumentConverter @@ -131,3 +132,42 @@ def test_e2e_docx_conversions(): @pytest.mark.xfail(strict=False) def test_textbox_conversion(): _test_e2e_docx_conversions_impl(docx_paths=[flaky_path]) + + +def test_text_after_image_anchors(): + """ + Test to analyse whether text gets parsed after image anchors. + """ + + in_path = Path("tests/data/docx/word_image_anchors.docx") + in_doc = InputDocument( + path_or_stream=in_path, + format=InputFormat.DOCX, + backend=MsWordDocumentBackend, + ) + backend = MsWordDocumentBackend( + in_doc=in_doc, + path_or_stream=in_path, + ) + doc = backend.convert() + + found_text_after_anchor_1 = found_text_after_anchor_2 = ( + found_text_after_anchor_3 + ) = found_text_after_anchor_4 = False + for item, _ in doc.iterate_items(): + if isinstance(item, TextItem): + if item.text == "This is test 1": + found_text_after_anchor_1 = True + elif item.text == "0:08\nCorrect, he is not.": + found_text_after_anchor_2 = True + elif item.text == "This is test 2": + found_text_after_anchor_3 = True + elif item.text == "0:16\nYeah, exactly.": + found_text_after_anchor_4 = True + + assert ( + found_text_after_anchor_1 + and found_text_after_anchor_2 + and found_text_after_anchor_3 + and found_text_after_anchor_4 + )