From 1350a8d3e5ea3c4b4d506757758880c8f78efd8c Mon Sep 17 00:00:00 2001 From: mkrssg <34207463+mkrssg@users.noreply.github.com> Date: Fri, 20 Jun 2025 10:55:30 +0200 Subject: [PATCH] fix(msword_backend): Identify text in the same line after an image #1425 (#1610) * fix(msword_backend): Identify text in the same line after an image / image anchor #1425 Signed-off-by: Michael Krissgau * test: add test file and case for fix(msword_backend): Identify text in the same line after an image / image anchor #1425 Signed-off-by: Michael Krissgau * test: added groundtruth test files for fix(msword_backend): Identify text in the same line after an image / image anchor #1425 Signed-off-by: Michael Krissgau * fix: extraneous empty paragraphs for test files Signed-off-by: Michael Krissgau --------- Signed-off-by: Michael Krissgau Co-authored-by: Michael Krissgau --- docling/backend/msword_backend.py | 8 +- tests/data/docx/word_image_anchors.docx | Bin 0 -> 18560 bytes .../docling_v2/word_image_anchors.docx.itxt | 16 + .../docling_v2/word_image_anchors.docx.json | 286 ++++++++++++++++++ .../docling_v2/word_image_anchors.docx.md | 13 + tests/test_backend_msword.py | 40 +++ 6 files changed, 362 insertions(+), 1 deletion(-) create mode 100644 tests/data/docx/word_image_anchors.docx create mode 100644 tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt create mode 100644 tests/data/groundtruth/docling_v2/word_image_anchors.docx.json create mode 100644 tests/data/groundtruth/docling_v2/word_image_anchors.docx.md diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 44a0f2c..ec071ef 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -251,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self._handle_tables(element, docx_obj, doc) except Exception: _log.debug("could not parse a table, broken docx table") - + # Check for Image elif drawing_blip: self._handle_pictures(docx_obj, drawing_blip, doc) + # Check for Text after the Image + if ( + tag_name in ["p"] + and element.find(".//w:t", namespaces=namespaces) is not None + ): + self._handle_text_elements(element, docx_obj, doc) # Check for the sdt containers, like table of contents elif tag_name in ["sdt"]: sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) diff --git a/tests/data/docx/word_image_anchors.docx b/tests/data/docx/word_image_anchors.docx new file mode 100644 index 0000000000000000000000000000000000000000..c0b030c32e5cb88ebdb913eb1916e23d0c94eb32 GIT binary patch literal 18560 zcmeIab$A^+(mr_1%yG=j%oH;-Gcz+gW;?KGvPFT!+M+)X&{P^TSLUa{mA z)8k;4_-8^Ue+tVV#Fmgo%&i?*QuFWvK^7A zx7UN<8J_XcP7j$!n}9GUfkK3+IW(a`c>j>G_jeG0+`n~H+*r(p^N%YsAN>&Kqoe9N z7+X2g(f&dIw}bv4+$;Zf>t%6W9~~5y@BG{Ax2ZPy6!RRcgGA7=gkD|3(O>CxUe>;`A3}|&F5A(Kw@BPK7-r5-8O9c z{+-F|Fp)UL@V9Vv)UXLm@Ysitl(9~fJxZawy+RmsLW+r5L*k#g-|dwb?o+&(exx$Y z&0C0RdO(zL3%3X1G>>C=L!0ncCo|E+M0c;(X!j}|#WylWT2S0lV_MPEkz~XgltE)& zv`!tl=XMnKg$7bXWkZG1J$ssHA#1!HgzaPSvYr#D*XnNBVr6LaUw^2p|Lh-r!&)e% z5C8xwB>;f*@lafC9gOIVYz>{QKeX8&YV1Tq({`m1*_*fX5y3xe<;Z49BtFpg2;9u| zTm4EO&or_%6jdm&_{!zm6O*7sjZq;-So{nt1RS!t+cP`bxS7cCitgFf=%PHWf{GU0 z3EO<<58>YJ+C?q6w+HNYTsT+)aisVN)Ef)C9|jjAw;k`dF1(@YwPWT4aK7ZdM!F)Z z--L!84AKZC%w&4gxgyz7tY|ibsM(g>Geu@snQ+sij5Wh3Il_w&t2Qz|aT0=~YB?<+ z(Cix_gd0Z>zazV>Qj0 z3Z-7T=jBwGOjpvGQ)}Xo=7+0J5LauJBj`C-U{z|K&9(p_TLokpo6caX6-QvI3^*k^E&-A|@HvDV8T_pXUL82<5gbhycB|&k zPSW+My-2UQE)=#L>q6p$F`)Z#b$J&;)%vTg-j=LPH~Be&Dq?=|!U_H)yi;(w{djDR zXe0tM!V=AGL$!!xC`3~h=@JHBnJ~`LYW?h^D@YfR3-;Ek_MokAGdJ$e6WC4Wx2bTI zS5Hi2fgx@h5+_OSa^Wr&x~{lS^V10Lc(*H4zb3De-(Hk*UF<64*B0i@#8f>cAG++y zR4)!5DkG+Poo#q;1k1N1J(!#1nMr+=P)+C-BW6~|s zm|CsH{v1a3r6$V@L7vpkfPTb4M<=(KmpjLT#t3ev9UF;QXUx|4GO;|RYu+sG>od^s z+Ud+i4p3zl4@XC)U_l~ci)0apx1zEQ;~1L+1^H%#vGNXoYa}8u=jWHR4Z~ta)2hds z_H_;rshF>}g7e7F?kKOim$x<2sMvyi5e802MuI{?`mKZYBq)QQQ7sN`E?mF={J8GN zmHV&7u0TzRq9m_AQc@nMQ4)M3Bb)#n|jz=&kbn%PmB+q=PI0Bh?p{neNO%uc-nuxj{ zG=R#m?j1m4g4$>Ix|Vj}7CgJ#n+)YX08*ikigz`TBuY; zz=R6rVj`Z6)Xugoy%ToF7o#-qiihLx{kr$2FRvIr&;1))_t72q&HV@%J{?}du0F3@ ztBO;H?K_V8eWe8kt%~%c8G#3eavq@${P}5l-M+fwfy^f|47Uzr`RxY9Pp~hAyGnBH z-{cI?`y0@zh^A+8gLnFjlWIth7#_KoI^Gk-Y#{{J4|xU?G&PtJbwKk$EnAinQyheMW8j1-_L+Wx%TuCfR$FE3)Ox z@8Rz8bP$K<_&IQk3lwbmA*kTlTc1l)Z3f_cYxP)e{rKIQ`vG^e8`Skzvm7KhW!EhZ z6`$dbVrx#FkgTlT>kMeac%~0&BQMG-TFmD9Ipzj+m$IK&-uq-UGbbDs#}=0J;%pvc zoKHcuRE4sXxvSDCWLB8!uly}?f)wPF za-4fMFPsUT%5-I3J#RqoM9T4U51wp}shH1u4)IsxNlxUUlS#7cThlJ)lpWV}8Jk0P z{a>{7cs!KGRvaUj5ACZpoDM5T3Rpb0j9eM+;g`s|67}dVnUAaY1Ktxx4;YHhKnT7o z<6Gw6Y(A)p1*}+H^zY~1u{B}?P02=@a^rvS3hxz+={r)dg80f=S{iywa z_R*TqCG+8{37w0*2<~#)BmclxDntvIr3yf>i{qcM2zZKBhhoF1r$5G1Q})*y_m*lMHZH3H$1j01C3L`ceV6FHWkZ>S*eKzDEK zU74?=+*(S~gae-tvdh!P7d*t|>vGxAkuqd7eBtV`76V5JnXSNMfFH*078DCu#-!dY zNLR8?*vGH=!M!=zRU5d*g_ac*!KT>;Bw7~@DrydIfP^ey?hZrb8zeE*meR+a{n};s z&I@~p-J>lQM7P-b>i!w!(vIL|cVX!7A_o0W86f~jwy_dwd^lErB-D@6KSj*i*vMR; z&fHqx)R=+R&c-xUPDT_C8uJenoVb{f!pEo49|;otgW7E;Dt&waIVy+>04m0@4*>w+ zPvS!SN^Ux*D_znc{D?wREW`q^y?n44p7>C4_|QZd=&`;=Hc3m$IM8hKJ5p?8?-iNf zQBOYW=0cu`NE@aE#^~b#3qWFs=q<7%3k=c&_w>gi3Irjcvql9VGDdV(*%JrE=z|7D z5)lNz646^>LlGFE`|0n9K@#*p!eWX3KSKM7ca1xIG|!S2N-rKYco6Ugf=gdce&T(R z9DKcUbK~WCB}tiq&~M;EaS(Y04T|CLS&N5VwhXj-%+9|p!>T1_z5R4<5hA{PF)OZnbzr&x`w2oZS zjB`jYRxU)os`zsK(-!kJI*`|DScN)K?}X*lyqL6z_E# z9pCb4CT6WwB9>}wP;B2ck>Z$2RS*^W(%Hy}O;yVA^TOC{WyHqIH2B3(ZPznmGSut* zV<^{KIS4p@T7VHKw|;dLa4?yJ7ASGM-VfLvEkO-Xy4)Xw8cJ2Bf-K%1OGo+to1qWL z8X>b2IT$9%hp+&|g=K^)1oZs>@bv!@MwTPgce#)3AL+jdBjf)UVT6YMr!Z#vSYeBz z4s_UBXBieM5M^mjh2+t^5+EmhA`t4qgUc88i&;z-BA+BL*C5tPa7-*glrIh`@d5P{ zAd>JzVt|}(lnaxK`zZ+wmeXIb8Iynr5tk&=3)Snl&17T9+0MDa!|dUD_Sov`(ZcD$ zc5ov8NVcBpWy|xtKj6W14H@>%TKjrW+NPpeV5tUtaoW3V;EM)43wr^K$l~8LF87Tb zY9Gp11PB-0EJVBma*{)OK;2q`x2@wkXoRvjV1rOq1QWCo zZ`7+fh=?0!%yzJRL)%&c02LT~4qkf;XaJ}k$&jq%PXJ~?enI5K?iJjq-J~6I=O~HtQxtOJm+ig3bE3s|Q3@RqFor<3z*7g`r}t%@I&MkWDBXtr zfg%)ZCOTtYsbipGyM##$2fPCCuF8OJ_Ca&@tIE~VHNJ)AELEkZqrI_@v)4OPA=X!x!i24P5`e|_xbGk`%Yp#BV)m}47JvZr0 zTe`zZA(jXqW8@(bAy+S+iGgRQSHTKoC<3G7x`kro+y{H}`80~6-q*-PvF#0D14Z=Q z8K@MevYhI*1dI$l5T(rRN5Fk_9 z@Z>|c3Os5?)i-7U)f-|j$jNfxrKsa0Ed+)0q^$xkE{T7)&jKQ5`suC-*6?)2d0cgy{Rl}rX;l|f0TN2QlSvhE zom_Ac91nH#U2L%=Hv-}3_lS)Y&Yd(M;#u5hN_5U)a=9*54lqVbksdxPrGe~q@sszl zJm`|K1ivbpWuzCGJ6=8Rti{bW94K$M z&oNS)#kz0|Kudf9Iys!2?p#p^&Hc*qOhgcsgWxpm!rBsftWo}zyh~eRnk?3{WB|!ua|&y3rq$CYCs3{0xXdS zgN^U(RgD{WZ80S1>%Mk!9G`ENFi{r-BcI6l2=D0;e5ESiCmg|EJ0A2%<~50JY_}2_ zEQbZpE6En1tHCFcve#X%*%az4(aK*BIt%P?o}}Caa)+w%vuFw?Xxo*_$Wnvbp9V#f z<_TJdR`&?!2}Gk(Wx_89|^1fUAH^WXC>j3rp( z>!N#=S4l0vh!>j3F}-vEjn|*I^ve2j(k-MG@YUJBB`dhy4H!NgbSS#OjjbGnB6kZo zp$gPLrfC3C>}q|w^*q|SU97DKl3qv;_EDt3cQ-8*Q5-$*yygq zulj=Zp3)u9xp8|J-k{5=YMrb+Dj>ue61ZJYrxT3zZC~ZeMQR`lFLJ9K23k6pNn1r3 z{z!!nG(lUp12=r#^z0kjw`a%aeufLzd{2#BWzN$&;F_Cy4bQ#UV`%V_gj&SB!Zzh5 z`dZ+craGYGGdk?NX8MHgdE0iwj^z9LfNJ_Oi3M>6e9QC9V2F8Uz4p-fsgz;*i{}ND zpj`QKJZIH%M+P4p!tOc)9KtvCa>4#e#_zoBeyTd*RDC^mv?`^~Yvz*B$LyC2#P#R$ zmhpn_Nd5z*tbBW)LTXB#s=uHCl~}-SErStK@2vJwyj-B63^#8_B38&r2}f5Z9{0k> zxDG7Zb#%Ci?E#cc;sr6qZmmHm!G(k%$w)E2y-Q^K;QQqcpp*>o2YlgJ@tt01dK$<& zP02hlbz~bZZcJ3FZ!cM3Go&&vSW&R}S$70SuHK%SUd{y4qA-9&OjYLQTrhwXu_Bzv zQI+ckBk_4+w=>G^n}p~xv1OT+Ggx#j##)jPm3L$ zqfCtTbo3~352`O;`+Js9eMA=aJpA?)?cJeFRFL1()V@k->7rOVX}<{%JA@22i!GsV zzLxfD3&y(4xqVbP;#HN})#=Xy>62B}&kAQdqzu6J7C>8JZk7HO=TeI9N6otz%Yt>T zh$}4{ICFUQlBXA#oj^ySrUK4ut;3JGC!3yZO5trVbib%XTfOVSM8|t;Q9whP<<0`r zCo2Q%9EWmnm$&e1yLwkR>$aErAZ<|rmv#?>x66y7TDzWy3?@WwYQs?bXH(gBMsl7u!G=vuKa&Yh#owK=7buUAuOHx}5hu#`J1 z5wgXB@KP$B^Vy0 zxKeit6O`HmmUn>}dQXdx{^a)*oLa4(m_cH&!-S+WiE`Pd<0>6kR(kT;fK+VxTUj|o zV?Hnj=J=ns$q8w07K(OKBC-gq?mw-Yt4h8?e&6S=)j~`5?Y`MLi2(y2)oy_>DDv^C z16@T&=>T4f>KYyzT@U!#w`?|4qr3b zv|MKNbdq0rVI1Snk<=1C|89c*wJKbHW12Gk;nob{|IGw-GBdU|ru&mJ{5henE*Xv` zir5Zug%{K`;y(6Ezl>#C?8UbzIXVcw zHY8|5M-;Nq4D|HW{Wy5k$!(dh1X8lu0UzTxoK!oW0Wf2B$tcnmjGNG0BrRf-TWdJo zr@y+-bspR-G+b!wdcn4W=S$nd@M43f!a4q^J+$3=)oEE^yS0>CwzAqvPqf*v#B_rT zBNs2w{1vuQ1W~$HL4YDcr@3XlfyHVm=@vwJkVBOyYb0OR_X~GN5MXvdD2^;jYvSfF z;tPXF5}d)_LL7mAVM4AC97I(&T%{2FezbF#bVvRrT3+T04rXXMLEiupk&H>rjt*G4 z4gA$nUwCr#_YTjS=aGSEU7jcBrn~x8Deg*Ap{znH@gQmt*m^r*5$W9dCQ3 z3qCK8x<~o^$1SFnS-c*%uM0k}FTKF$4o)e-croZO2VZzxY^u>&&C~&SSNHMkA&5hg zPVsU-Kq)X_-0_J(yuAF#Yc|~%Yj<$q54*uznLA7o*P|b_kYO$~K4Eoo_nSrAAAheX zuDeo6At5Tt!S z7{b&=I^86hflH_oj~ydOl*X7nLJZzPZ3Wu35oCRL#?CEbef0b=DAc{gH(P+L*b|XF zsg<)hjNZo_y(-LD6T7N=5vy-U&^1F)Fp3dTjmVmc<*7f}25#PecE{ZuB2AoZWFdG) z*nc(1lMap)W~wqy9D21#WdKbOmXR){Ax?iCP{i{9L=t$`E?Bj7){u3&5)*U+lw-2L z>OxGsIHN}8wc(U4V(DO4@|2pDk4zgz0*y`t) zvQdA@5t$lad0rS+xM3*0fD2x5neW890nsJ395Rtk-s96OY$}kJb2W?3p_xUVLib#D zU3?>ffuT>JhONjTB-I?+otw5@kI%V7Ki~d3#Ng(Al3(xJcs&Q-tQtN{3J9oj5Y3u@ z$kv;b5Dpm)Ys0Ygs9<_icB}kNFY#APuv+=&2fWHVKj@t~1X-HJnPS(I&UmFS!b3(s z+bO!ORak&_46hE+SiG2JZ44SksV9G??UxliZY(ehA3T=Dag-X3Q)09XiWV8wYzy-O zYjFRX(%9}UM{p^avI^8FhWKV&CQx{#J12h@t!7eJva)k;7ynjl^T=G7uLkFjn8Q!?gF0!6(!vnF28 z0%fSxE_hlB_W}C2)*1v05%X~)%EJ)om>c+0z_3zCkoW-3P<}Qq@z2r57%$?SPX&e3 zuLD8`XL87bWe00~@e!JJ^GY5sLPK>_S>N|=RM&^x7`uqn>EoRBX_XB!xe3&q7qP|8 zMRyDjV#Xh7K!dK*m5n3sD`+kq#0~7g=4?m&}A}P0%M@8dbB&PtW1 zwq-&3u!!xFtP1V`yl1CsPut3YhV`|9QibQ$kUryrc5}zV}EaBaObj;!MIAs#S-8J8KTEc4Ni?BdS=;Y@7YFN%M zP%U+CGihMCXhuk`56Y};SKXg$b$R>P{Pdsm&aDKhQ<)!g&ckqjTTym2c5*VeF?IYy z0oJN+IBv5ddY?6oRikl3IYqeSZS5XoDF;v6GUTbbA#- zVU+-6=U{l^w99gGb0&9b|UjzOe1cRle zG*fu3vXtIX4EJ28Q z(nL3>%lJ9YLt8!gqf#HYxngF9PsT`mNYQWY}`QIKw|7< zlw1UHg=ZbISMNw&TsnB6DCY>NJSi;s$4vk}7)Zxul65R!YPW%U4(uYb4tZDc+{mM8 z1#-#{V9eHP`s-S5n&Tzd&tL{I2|vHKY;w%UD*Ly--_`nEo&^@t>T+&ZNrDbt=C()V zf04@&xRAW^Clh`vjtc|irZ&G5_@9IM!kj+BBuwmN_fm+I_%pu}o9>e;NDPA^`G-Ff z2G=~Aj}N*>g0bVvS;COx5k9Q%KEmf>t31eFX^Ov~kqh46;bl|1Dpq_lbq#iaX?nx6 zsd3e4zaWT{^mvGOG*;Gb;B{zYE+Lxx)e=+83zO#W6-d0knyD+`&zu2UkQ_rF5?AB{ zg&~vU*oLdywnH|KztY>hDOL>JdNUz3+DpPq#cOIPo)n4_iDQKSgI$ z<_iKu`{fMhI#l>A**ISJcPB$vHS1XXdSp=N&rd`}g)AxtY$Snws_3K~Jij%sHCa(w?mrAKIbqLhMDBqToqIG;=^!yB z6^bCb%d}KV^Xn+}?lbtRpxU{h;tN^1pVjz5Qo8ParHC-JwVXO=oevF(=Y)`%U3BW# zb{%Itl&GCIHo+)InqO70Y)|BBmMCS@Dcpv)u9vvVSiQeJf?r#Xa{Z9%pf+O*R@v8F z3f0*dek!M~s`-%^sPehKW^kC0W~I9mwd&P{9_mDXWFbSLec(|$yaQ=YJjT6r{Adpn zb-^W&b1c*B$RtyBDw?)vhZmJepZ~ta^GuE?}cYjwqdX_t(5x9p898Tjgrl^;L+~o2Mf*wKX@H zM{|g|D{6UhUD>|Zle0?@3Dtb6tfiu@jdtaIyVyD@Bv~^g{Y(o=MDyZ^TTz>Gy$bOI z(ojm?UNSz$SlDH=L0^R5g`JoH{2)RYa1^A|wnq(PFApi>JB>nF3y!AKKSdZc-mL~q zgV-Np?|F#cZxjNT)QRQ42^;eeJc*t&Iz$d3rVJ9p0z<3>Cw9j*-SNU zPeY#4h={%Ll+6zagFIvXzL)mhF*3GK6lfE$8Q^C0Qx3LL-?wVSW=LG5AaQ%PkdQMXFveq@|@zOc;Db(0_eJXBY3b#4VE2*Ins*H5&S-8k8GiAdtAzVU6@O>-A z1okvkj(FOYfYDAq)At3{j~Z`|s$FS^(K{^}B$iITDTYM81*L(nmX2MJ;5|Pt+*B8V zFtOKKq0=j<4>fM6>#4hu$8MArHkYhPNMTyQ<~S<`l4_=oO{S`j1>e+6-0cb@i~eit zzFK)`rALww?+6Fs!BJhtzEJt3wkzCPXNl>cxD4^OeE8K@86l>Pa@Tc}({d`fL=j_~ zS(ojomfX|Lc4E-$qz7`$jI^iciOSOl*=$Ubhe_%QAH%EIyer$`t;GFq3Gs+&L+2G% z9PJ9QF$1}3oiMg9m+5+@OX;0oVJ>wG!q})SgWMO3gRwXlZ8myuH{Q}pa52YWZ%qo< z9b5S}j%>G|hx|aeZG+rP4?K2s7T7MNyS*KDA*-2YS2CaKygGE2MJ{M#4O1O!<_{Ay zGVMFo8l5j+I*1154~)uJigk|=k1SU_@NUmC%5J~h_9KH|tb1iSoU85BeC9gNi7XMm zrP3_10w^cn<<=Z+IZRfsk!Ur4_FRg?+kH!{x2UuT*Og5P-KX#o5fZ9q-KvKbbv``lSN+KpLXH|6?j`!(V6tMOV zo|ZSL>*F&%{VNtNKj!gqz3ch4E##nDO6bk5{r>gsXf3#{s7&p9mD2u;=XqeVN5}i! zeaGvQlZpx*1sxIC;K)yEo{e>qRjcZ2IE7bi>NMA-Iw3>vUM{P$D*0KB)*WbLZRg)@!2G~0Z02#e67y}P_7(~X0 z4nqDr8D;E*_U!tR_JASeKz3}l)5(t7rXI?8%G4dZ$Y_z}BxS4d;YU*Yr0oz<_-gnn zrG8#*(gC>TOX!C;U@AtvSH(xscSfHp)1TT1;je~CN4&E| zuLXJOgAXWK)y5i$PaE46RqxC1qm@*ts>U}?{ljV@*KHoL6m>p(vQuoS3?xy7Oyk2x zIiJdIdme>#>eKs3R=K8wrPSYk@c0;6?0$Vadp5epFO^n|C1<2sm!_@V8mw=6JH0Wg zeyUuzEM1{AEP7!mo=JI1n6OFcabql=g*fndev-4;xj$|Z7!=4axo*?-uErwcLeUk9 zyI5Uf$O;d5Pq}{a7>d8WT4@*BSS^(Q3eEQIu)3IsJ^(Wd9-%i7q$*GTV@u3{v z=}HCSi&rU#ms=?a#;7H(*lwsqzlo0oE^dTu7%h1XJ;ok z{cD+Jm#gl@sM=Jidvkh;V|ir`>X${^6FuhCs?>_}(-${+12Azi$a+VkCS==PwJE9Z zuA(7G<`14aheuR#Kg~#SlfdX8*fI{Hxe{S;tBK{g1oXN}jkhpjV^U}zBG9;^IgIsY z_!VQX$#qQ>{TH>WAQ(SSplokhh7gPEqfT0>kJX8R^2?c0)9zjrh>IlQ*$8oXmN64Q zHA*SU^gcH!Xxan^`L|BASN1Ek*q|T4D`mrqwlTDK7p9zb{ z<~~P4I?PoS2jX>NO}jurHw1k(D%5Q-RRaKZeTlcp#=PWRUej<6s=(qzR>arB zr~S`CzmjOcU4PLU6Bb^s9Jpejm0B^pywHZDu9Phh@`{=ImL0gN=xQ_WM`}RtTaMw9QL)kZ1hp| z=Pb*-Q{&c10Nm;YiT`Cm8m`j|vXcu4TkB6O_L6a(f->sBdt?jx=#mz==;n@|aNd#l zVrr9-1RDSr{t_fBUkyRRA>)YTFfJCZe()bv0tgJ zCnCT`%mVeCPulDC*v@>A^SNSuAxQv)2&!q`4DXaO?jQew|1IsvB4v%u5~kRM*~cg` z9N-_4rnPP_34AVdBM|VGc?2P1=hv7ygXWmgC5!#)g~A4;kd-R~g|@;HH9W-okWglx zCcH%J&y+(golpFwq5ivF#NK|JWy|Q<0T-5z$VNW(zGEtB>&4BGZRye8kbmDDGk4?a z!g1|Eq+5#Wlu`Sam2(4`J68=>u;-wqGm!eo$07aTB^udYMoUcS4k>%aGg~V{_nrr{ zhD&#nYW=UPGsw8tMh$!JiLLrym;cBJ%1Y+Wz>Uyydm8jj%=59IhGi*79zeCvRFG?6 zG~m`##i%4%6Sys-3S%jR(r&Bi()g|)7GiXG@m@ZhGEiPA(mPx!x*rcm|3Y45dn3OY zjS^4WcPnin<6CD7|D}_9J&|8Fv0|%bcl4fLVOeK4EQwp^(Em!OWo?A<&VDDu<}GyI zx+8ic!{;qjDYeTYnorD|?`m^JTL4s}2t)`YpE}t1N@md?)-J*aA^`I5*FyMUPznF2njq)* zHAapAj7I67Mdck9=(PXuVJv7c5Rjr#BuaX?#7biP z|7bj<;-NN~+^b;;Z*#)k{e1D<=Wv+yDE(R4294z`+CpQ`^!A;$Ta>~_q+FK^JKa4O zPe@9UV2r|I!q_?4(k7Ho`{8A$<{68J)@xHR`NWlAltmH1dt?bUUX1M=*lGzC)|y)( z=1S95a$Q9A+?=F`$#o>rERsAXO=0l`xL@16pMmEh#nsqy02|(3HOd8c3)0otWB?J~ zPL;<6Hkv3iqoF+7BWh;o=uLs|7qn=w-4U5nAjyq#d{j35)0Z*XU&q|M*_BwYpb5Nf z=&gpYkVL zMXSjBqlHF`p<@fJy4k~jkzH4o3JnX#KZ&!O!ySshR%B_?7wvzMmx_L6N$bB6Ty_Ev zhOIveGXEgnsyik5tp7$xDogF>6bk=E;x;~7Egt(f^4phiDwP85BKkp!K~)lR_SIfj zjiYk$A{fPc0!C-}CYB7zTGdvr4V&A#^-bdE*z?QmTOuaYE2z`Kv4r?5s2U72rw&=^ zf|9x6THIiJ^N0;ATz#_TF%f1DPOwJTY27x6uWrZcFt$F9`WUl+EVJ}wBlyFAa~5< za)twM{+Ot8#@(Ylt)LEJ7H{Ee4nX#9F2-3h-c%)b_8^%{+E%@-LF>U)`mwBpmwS;d zx^U^qQ-jxh&QdppqzR3F*-Glw)-_}HT&Maz>}xWf``QxY>VAJ;Hc=$1mDlr{i{Vju zdr|F(qiTqX-Z8~)qp?bP1A(GsD|I3YnS1bQY#CkUlg{mORt%>SS}u993Q!w`82%1;Vk`9QN-IMxIM$a_NyzZSc;__$YfNAgXuT zQpf2c4V=?g^#xrN11=Y_Xmi4pxK4ZW_ol_*dfnJDakl>*g)}bbH5qW^0wXhh=9#|l4k_Ca|ve)DZ^W1^_l}~ zUclBevY#bD_4WRnhBlh65#64dhi()nYfHS!5}O*?RDu!fm^gS)ZcG8~(89rrku>8$ zDw0&TUtC64?AqO<`3x$~32*1~Md66QfmZfcam@1-e}w_8$x-QQFccmYGnQ+8OUsf} zZtN8((KSZ8dj)Zyf@?0_SikCMay8|+9;&Nc@SSH*3w|)9NK2#R3XS`a`E1m3VfgXD zYcuCd=J@^o(a)|iNvkz(9Da8-eObrZAytVTTdhE?d*qZH#w#M+6=)-r4wE6E|Dj|tZN7%y)^k^gP1) zd?fN1@qN^-E!vq<-JNRGOrwc;KkY=N-R$3<=9XO2sLZ7S*ZHZ?j4+axrRJY76cqf% zic5+D#9^^9M!b%wJ!`6yb$P zq=zVHrt9gltrN#7%OP37@2K6zP*R4tNo5fD{E^BdhEFN13e;moyKc}QlQuU1J>I^% z0D8ikYCOYfdcu^Yy{a-i)7pcovYyAfJ}1CZ0U9Aj?XevhTU!rf>e#S%;~j>u4Q+j6 zZ^6s=lxy$aTEQ}qe$=LOMsr^C9&S6bRi;ly?%HDcmuVnS3K)DZPSMJbona;xCV6lS%Tfhy8N z4WHTmncKpO@rwzpZP^jY$@D2js%rooY51I5b@Pk78rM@OyqDl@Np&Z zLwb=uVii6D6yzLi?HuV0Z5{p;+>cn@{|!+17*iwS_$7PjKMkDwZ)ESTlb#ltnK8*N z2}Ag!^l6H%H|xa*5o7Wfr>CIq?QQb zg{yl;6XG?-w%WiZm_gu5FP%ygP@_hEV_Q(O?oa#tM=_<;CJIERtT)Qce*5mUN>s^p7V16hI96z zcdy}|5@O!WQLqT`?DL`D_J%V=tCAm<|K0)GcK5wnCGM2BHhQ+ z{7)S=#`L5!_aTTYA36;2?}DgrXZObd`oD7c@z_3&48<;+6?)h z)Us9pv;4)v>1OC&jXT1-)Kll8Mo12UbCPaqQ+(b(S;>>ZhLI77@$(!{iJhAx8&mbn zaCcC1a$5})=sOPL{f{e?hG8QgQ7t_F+-PM+Uh@Vj6)#n&t5HlBr77}YE6Vi25Nk1X zW6R07yHQcKq{KzfQBpEHb@sR^kS$5q9k|2n!u1P0ljQuRsUD-cD#b6H8<)XH{RsRV zX>?gga1WyHu;m>ibUW5T$z2A=6W|CfKb&;NOK9(Pje4VUCZ_}rGDm^zWpRYkZx72> z1YIj0Ib(HaXKpHo_il3vY&``I(76L1$>KUbe$an(1t4IW57X_xzvSf4-~Dgozr6B9 zPV(Ow{QJun{)Pn + +**This is test 1** 0:08 +Correct, he is not. + + + +**This is test 2** 0:16 +Yeah, exactly. \ No newline at end of file diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index f37b487..61ddd2a 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -9,6 +9,7 @@ from docling.datamodel.document import ( DoclingDocument, InputDocument, SectionHeaderItem, + TextItem, ) from docling.document_converter import DocumentConverter @@ -131,3 +132,42 @@ def test_e2e_docx_conversions(): @pytest.mark.xfail(strict=False) def test_textbox_conversion(): _test_e2e_docx_conversions_impl(docx_paths=[flaky_path]) + + +def test_text_after_image_anchors(): + """ + Test to analyse whether text gets parsed after image anchors. + """ + + in_path = Path("tests/data/docx/word_image_anchors.docx") + in_doc = InputDocument( + path_or_stream=in_path, + format=InputFormat.DOCX, + backend=MsWordDocumentBackend, + ) + backend = MsWordDocumentBackend( + in_doc=in_doc, + path_or_stream=in_path, + ) + doc = backend.convert() + + found_text_after_anchor_1 = found_text_after_anchor_2 = ( + found_text_after_anchor_3 + ) = found_text_after_anchor_4 = False + for item, _ in doc.iterate_items(): + if isinstance(item, TextItem): + if item.text == "This is test 1": + found_text_after_anchor_1 = True + elif item.text == "0:08\nCorrect, he is not.": + found_text_after_anchor_2 = True + elif item.text == "This is test 2": + found_text_after_anchor_3 = True + elif item.text == "0:16\nYeah, exactly.": + found_text_after_anchor_4 = True + + assert ( + found_text_after_anchor_1 + and found_text_after_anchor_2 + and found_text_after_anchor_3 + and found_text_after_anchor_4 + )