diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 231d622..a752e8d 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -16,6 +16,7 @@ from docling_core.types.doc import ( TableCell, TableData, ) +from docling_core.types.doc.document import ContentLayer from PIL import Image, UnidentifiedImageError from pptx import Presentation from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER @@ -421,4 +422,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB for shape in slide.shapes: handle_shapes(shape, parent_slide, slide_ind, doc, slide_size) + # Handle notes slide + if slide.has_notes_slide: + notes_slide = slide.notes_slide + notes_text = notes_slide.notes_text_frame.text.strip() + if notes_text: + bbox = BoundingBox(l=0, t=0, r=0, b=0) + prov = ProvenanceItem( + page_no=slide_ind + 1, charspan=[0, len(notes_text)], bbox=bbox + ) + doc.add_text( + label=DocItemLabel.TEXT, + parent=parent_slide, + text=notes_text, + prov=prov, + content_layer=ContentLayer.FURNITURE, + ) + return doc diff --git a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json index b24c46e..fb44156 100644 --- a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json +++ b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json @@ -4,7 +4,7 @@ "name": "powerpoint_sample", "origin": { "mimetype": "application/vnd.ms-powerpoint", - "binary_hash": 1640759611026400292, + "binary_hash": 15572290240354948364, "filename": "powerpoint_sample.pptx" }, "furniture": { @@ -75,6 +75,9 @@ }, { "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" } ], "content_layer": "body", @@ -94,19 +97,22 @@ "$ref": "#/groups/4" }, { - "$ref": "#/texts/15" + "$ref": "#/texts/16" }, { "$ref": "#/groups/5" }, { - "$ref": "#/texts/18" + "$ref": "#/texts/19" }, { "$ref": "#/groups/6" }, { "$ref": "#/groups/7" + }, + { + "$ref": "#/texts/26" } ], "content_layer": "body", @@ -119,14 +125,14 @@ "$ref": "#/groups/2" }, "children": [ - { - "$ref": "#/texts/8" - }, { "$ref": "#/texts/9" }, { "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" } ], "content_layer": "body", @@ -139,9 +145,6 @@ "$ref": "#/groups/2" }, "children": [ - { - "$ref": "#/texts/11" - }, { "$ref": "#/texts/12" }, @@ -150,6 +153,9 @@ }, { "$ref": "#/texts/14" + }, + { + "$ref": "#/texts/15" } ], "content_layer": "body", @@ -163,10 +169,10 @@ }, "children": [ { - "$ref": "#/texts/16" + "$ref": "#/texts/17" }, { - "$ref": "#/texts/17" + "$ref": "#/texts/18" } ], "content_layer": "body", @@ -179,14 +185,14 @@ "$ref": "#/groups/2" }, "children": [ - { - "$ref": "#/texts/19" - }, { "$ref": "#/texts/20" }, { "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" } ], "content_layer": "body", @@ -199,14 +205,14 @@ "$ref": "#/groups/2" }, "children": [ - { - "$ref": "#/texts/22" - }, { "$ref": "#/texts/23" }, { "$ref": "#/texts/24" + }, + { + "$ref": "#/texts/25" } ], "content_layer": "body", @@ -433,6 +439,33 @@ }, { "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 31 + ] + } + ], + "orig": "Some notes on the second slide.", + "text": "Some notes on the second slide." + }, + { + "self_ref": "#/texts/9", "parent": { "$ref": "#/groups/3" }, @@ -461,7 +494,7 @@ "marker": "1." }, { - "self_ref": "#/texts/9", + "self_ref": "#/texts/10", "parent": { "$ref": "#/groups/3" }, @@ -490,7 +523,7 @@ "marker": "2." }, { - "self_ref": "#/texts/10", + "self_ref": "#/texts/11", "parent": { "$ref": "#/groups/3" }, @@ -519,7 +552,7 @@ "marker": "3." }, { - "self_ref": "#/texts/11", + "self_ref": "#/texts/12", "parent": { "$ref": "#/groups/4" }, @@ -548,7 +581,7 @@ "marker": "-" }, { - "self_ref": "#/texts/12", + "self_ref": "#/texts/13", "parent": { "$ref": "#/groups/4" }, @@ -577,7 +610,7 @@ "marker": "-" }, { - "self_ref": "#/texts/13", + "self_ref": "#/texts/14", "parent": { "$ref": "#/groups/4" }, @@ -606,7 +639,7 @@ "marker": "-" }, { - "self_ref": "#/texts/14", + "self_ref": "#/texts/15", "parent": { "$ref": "#/groups/4" }, @@ -635,7 +668,7 @@ "marker": "-" }, { - "self_ref": "#/texts/15", + "self_ref": "#/texts/16", "parent": { "$ref": "#/groups/2" }, @@ -662,7 +695,7 @@ "text": "Some info:" }, { - "self_ref": "#/texts/16", + "self_ref": "#/texts/17", "parent": { "$ref": "#/groups/5" }, @@ -691,7 +724,7 @@ "marker": "-" }, { - "self_ref": "#/texts/17", + "self_ref": "#/texts/18", "parent": { "$ref": "#/groups/5" }, @@ -720,7 +753,7 @@ "marker": "-" }, { - "self_ref": "#/texts/18", + "self_ref": "#/texts/19", "parent": { "$ref": "#/groups/2" }, @@ -747,7 +780,7 @@ "text": "Maybe a list?" }, { - "self_ref": "#/texts/19", + "self_ref": "#/texts/20", "parent": { "$ref": "#/groups/6" }, @@ -776,7 +809,7 @@ "marker": "1." }, { - "self_ref": "#/texts/20", + "self_ref": "#/texts/21", "parent": { "$ref": "#/groups/6" }, @@ -805,7 +838,7 @@ "marker": "2." }, { - "self_ref": "#/texts/21", + "self_ref": "#/texts/22", "parent": { "$ref": "#/groups/6" }, @@ -834,7 +867,7 @@ "marker": "3." }, { - "self_ref": "#/texts/22", + "self_ref": "#/texts/23", "parent": { "$ref": "#/groups/7" }, @@ -863,7 +896,7 @@ "marker": "-" }, { - "self_ref": "#/texts/23", + "self_ref": "#/texts/24", "parent": { "$ref": "#/groups/7" }, @@ -892,7 +925,7 @@ "marker": "-" }, { - "self_ref": "#/texts/24", + "self_ref": "#/texts/25", "parent": { "$ref": "#/groups/7" }, @@ -919,6 +952,33 @@ "text": "l3", "enumerated": false, "marker": "-" + }, + { + "self_ref": "#/texts/26", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [ + { + "page_no": 3, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 53 + ] + } + ], + "orig": "Final notes on the third slide.\nSecond line of notes.", + "text": "Final notes on the third slide.\nSecond line of notes." } ], "pictures": [], diff --git a/tests/data/pptx/powerpoint_sample.pptx b/tests/data/pptx/powerpoint_sample.pptx index acabf41..0818f28 100644 Binary files a/tests/data/pptx/powerpoint_sample.pptx and b/tests/data/pptx/powerpoint_sample.pptx differ