From 7f394be9c54b4e6e28cc1e8ebbd399f659e1e404 Mon Sep 17 00:00:00 2001 From: len Date: Fri, 8 Sep 2023 17:13:43 +0200 Subject: [PATCH] [ADD] attachment_indexation_mupdf --- attachment_indexation_mupdf/README.rst | 5 +++ attachment_indexation_mupdf/__init__.py | 4 ++ attachment_indexation_mupdf/__manifest__.py | 17 +++++++++ .../models/__init__.py | 4 ++ .../models/ir_attachment.py | 36 ++++++++++++++++++ attachment_indexation_mupdf/tests/__init__.py | 2 + .../tests/files/test_content.pdf | Bin 0 -> 7497 bytes .../tests/test_indexation.py | 27 +++++++++++++ .../odoo/addons/attachment_indexation_mupdf | 1 + setup/attachment_indexation_mupdf/setup.py | 6 +++ 10 files changed, 102 insertions(+) create mode 100644 attachment_indexation_mupdf/README.rst create mode 100644 attachment_indexation_mupdf/__init__.py create mode 100644 attachment_indexation_mupdf/__manifest__.py create mode 100644 attachment_indexation_mupdf/models/__init__.py create mode 100644 attachment_indexation_mupdf/models/ir_attachment.py create mode 100644 attachment_indexation_mupdf/tests/__init__.py create mode 100644 attachment_indexation_mupdf/tests/files/test_content.pdf create mode 100644 attachment_indexation_mupdf/tests/test_indexation.py create mode 120000 setup/attachment_indexation_mupdf/odoo/addons/attachment_indexation_mupdf create mode 100644 setup/attachment_indexation_mupdf/setup.py diff --git a/attachment_indexation_mupdf/README.rst b/attachment_indexation_mupdf/README.rst new file mode 100644 index 00000000..0529a51d --- /dev/null +++ b/attachment_indexation_mupdf/README.rst @@ -0,0 +1,5 @@ +===================================================== +Attachments List and Document Indexation with PyMuPDF +===================================================== + +Module to index PDF documents using the state-of-the-art PyMuPDF library.
diff --git a/attachment_indexation_mupdf/__init__.py b/attachment_indexation_mupdf/__init__.py new file mode 100644 index 00000000..ada0d667 --- /dev/null +++ b/attachment_indexation_mupdf/__init__.py @@ -0,0 +1,4 @@ +# Copyright 2023 len-foss/Financial Way +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). + +from . import models diff --git a/attachment_indexation_mupdf/__manifest__.py b/attachment_indexation_mupdf/__manifest__.py new file mode 100644 index 00000000..a8986e95 --- /dev/null +++ b/attachment_indexation_mupdf/__manifest__.py @@ -0,0 +1,17 @@ +# Copyright 2023 len-foss/Financial Way +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). +{ + "name": "Attachments List and Document Indexation with PyMuPDF", + "category": "Hidden/Tools", + "version": "16.0.0.0.0", + "summary": "Attachments List and Document Indexation with PyMuPDF", + "author": "len-foss/FinancialWay,Odoo Community Association (OCA)", + "website": "https://github.com/OCA/knowledge", + "license": "AGPL-3", + "depends": ["attachment_indexation"], + "auto_install": True, + "installable": True, + "data": [], + "assets": {}, + "external_dependencies": {"python": ["PyMuPDF"]}, +} diff --git a/attachment_indexation_mupdf/models/__init__.py b/attachment_indexation_mupdf/models/__init__.py new file mode 100644 index 00000000..f407ef53 --- /dev/null +++ b/attachment_indexation_mupdf/models/__init__.py @@ -0,0 +1,4 @@ +# Copyright 2023 len-foss/Financial Way +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). + +from . import ir_attachment diff --git a/attachment_indexation_mupdf/models/ir_attachment.py b/attachment_indexation_mupdf/models/ir_attachment.py new file mode 100644 index 00000000..8d7fe7cc --- /dev/null +++ b/attachment_indexation_mupdf/models/ir_attachment.py @@ -0,0 +1,36 @@ +# Copyright 2023 len-foss/Financial Way +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). 
+
+import io
+import logging
+
+from odoo import models
+
+_logger = logging.getLogger(__name__)
+
+try:
+    import fitz
+except ImportError:
+    fitz = None
+    _logger.warning(
+        "Attachment indexation of PDF documents is unavailable "
+        "because PyMuPDF cannot be loaded."
+    )
+
+
+class IrAttachment(models.Model):
+    _inherit = "ir.attachment"
+
+    def _index_pdf(self, bin_data):
+        """Extract the text of a PDF with PyMuPDF, falling back to the parent."""
+        if fitz is None:
+            return super()._index_pdf(bin_data)
+        buf = ""
+        try:
+            with fitz.open(stream=io.BytesIO(bin_data), filetype="pdf") as doc:
+                for page in doc:
+                    buf += page.get_text()
+        except Exception:
+            # Best effort: keep what was extracted, but leave a trace in the log.
+            _logger.info("PyMuPDF could not index a PDF attachment", exc_info=True)
+        return buf
diff --git a/attachment_indexation_mupdf/tests/__init__.py b/attachment_indexation_mupdf/tests/__init__.py
new file mode 100644
index 00000000..377a63ee
--- /dev/null
+++ b/attachment_indexation_mupdf/tests/__init__.py
@@ -0,0 +1,2 @@
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+from .
import test_indexation diff --git a/attachment_indexation_mupdf/tests/files/test_content.pdf b/attachment_indexation_mupdf/tests/files/test_content.pdf new file mode 100644 index 0000000000000000000000000000000000000000..062e1e6e2f630fea4223e0036d98cc9bd80f60fc GIT binary patch literal 7497 zcmai32UJt(wjLExB3^@HB;c?f08*x8r~H}Y z5ax)PkkHoQaBqaa=67Hob47`)c?q+u;^ii{{hze*w2>VAw5{sXSMAabqe`ya#MX8P z+#C~ovzhWryX%1#S2Ilm*5BAULQ(4OM43rj{$2EBTUY6}c8|o-{=fvr%}lYg375)c zr!>mT1HgGY{%M`EqB;zq`bk|tAkx&Yzi86`hz9Y0MDu5~5TFbcj`}^c&nP$I*|GZ{ z9wfQ%U5YOy$0>2+z*pV1t8l01aagOeCfaV0*!^45sgBxLwVw&<06Q@*6Dhu0*)E7n z3=0tg?=mZ%Ova%nk~1r6Tn1M!x+GV*3>}vYO8)Xalia3j7c_D8tABA@Gjnq0`|=~d zIoYjOA9h^lh?@qSG3QUn)r6lVSrC!+8vHg zD;{%xc&sDzn)`moVwHbt<*{?>%nh7$-Pt?hJHy9Ps&nJFSp>iJ@O7&1OT96}2oL)2 zUqs^3`}D2tv&wt8_g^=eLs>R9>Gl#Lbj$Ce*LB~t-jidgs#<>#b;O}G;CSJo)~&0b z<;qx7vNR&kzPIBKpliDGhny5}H`@#n9&m1f6psMeojS4ut$G*S&|77Hiw=K}uh}hD z7Ro$+uR*1BWv6y+ui?NBlRbK*y-p){qJ74!L?1_LJaYvVk*nI}QZgTGj zbGhA3pN{^nlOJh%=Tg%+_`}NAi;<*_W}2z!(%=&zYTO@hp~SpNw--O!lIV(F->P^} z4Q;*q$s;8!H@1M|nU6 z9gmwyrkn5MTd2OSK|XBfP&_S^mtGtmZslDo{I>rf-YxU!Yy&F*>lNY`hWH?Tq;C5# z)+JK9yV44-cql3mVE0z%5u&F7=D%RA@lyLIjo*sqA7r@emu@MYL$46<2GA9?=FRIB=3VLqYh5lHfqn8e%93og=f z>W@r~H5!Z7O>#aa2~`XFkB-E68s)uuF)A5wJVrDYpa6fK*QbX`c2uK}>Jkg?i0wGm zN>0v2YSxt{zsMAXL}g%MzUQO6#8;C^3G~}Pm=&g__S8O$-Smf=m&cqJiW+w>Xv)bM zi1BQNaos1J0%M>y#^b`3u>H?2dg(BqLMQP$kSLOV;T|l9H~s2B0(OIslZr zcJAxL^dE&6hDApVUm9Nj=5h8Kzp3kEziLBtk>B>`eWH!O<~fhM%B)Wg>F*^Fx4Yuc zSv;OI;X0&0XIz6If7^5a1`R41H8SGY{HAjiEvaeqv9Pl$g|b#f&ynVY?6Un>G3`WllxF`-Z#*@b`#iT zbD@;zUe@$%zo}%|M~Au?9cQICD*mI6r?8DrY9?5`I$hhRhPD}p6HknnILo@_e@bN4 zsvS;Wkz(lPTit=Mndn}^8v&V(E6?4^SQQ_TXLIh4RPxY#_1dXPCN!W<6Q1R(a&~v} zX-O%=6ZgA8T^!sdg~xnO@4PQqgiW4H6OjFq7K~NolPxsZ#=&?P6*KN0vomWL9+91! 
zd_vnc#l$DNFQ~-D756(T_P92V#9*bytt(r0xexCZZK(7w}D$a%ny zH{4n!BNrb{eZH-8a-Ucp#$t6dRQ3Fzsr#J@wiBJ=rhvy@&kY4D4Fk_Su6A4sNO^es zqE;Nu2y8s4SY?ss%f>q|?U?Qm=Z8@1#}77Vv!OWB6Gm}Uas$un*sqU1JC?5)a+$qZ z+66d37B!9)53!vNJ|?ivv5w>QY6%f_gRs6zvPL9tVF=9{Ymj8@Mdh51#Zyl4b0!yM zT1a#kmmsX-4vtGPNpfxG-ud&;9wz2HFkT=#URkTO>h&fHrY*SKsf){{(_ z@ApZ-;4M>*oE&H$iQVzEmkPCBLL+WRy5E?*M42pIX}r?MZM@QXTYm1ykfCK27G8+q z$NFUH)ZKf4Qd-zG`2hHe+k;( z6)S za~eoIj8I@}Wj~uHRY^LSe@y$jW2Ju~{(hxkX)4Vq>sQMQFNE}qgj(D*`z_1fia2`@ zzI`C^eSQTyRM5Qz9v@lB)Q<0K^VMavUl_&C#cPY_jXW2me?3(*-0Y*6amdkcw!Fe! zw?>*tQLL}t5Ep6Z_bs$Z`@%O>yAq3*WF5G^HWyQNR8xEqovSPEzN*}BBT`tbgQ{Qc zeVHmfG3j@r_X+RT)1j#|m_@H5DGAA5yXGxz-Skg6_Qegr1`Rt#;I|)5yZ53+g$L~~ zFrL*tUkldN6Uh*vZz!xSO&YZhntbLPUh#sA63=8yHb{_pyLrbZY)1AgWA*-KTE;T+ zyLOh8euB4Mq18bf&xx_MccZT3*UG{z?RsS=Ji#rdX$;kPA-mdOHQP&`(<)PgWO6`y zm)8DL!Y=C=piG~kW#E(b)OGgA)8WV5tU{3V1x5NcAtxq=}BDPkQ9D);#9M>!bMK=k!!-Z5qD%=<3>I_RC*i zdQE4waf!ZpCcjW;_Le=2JvE*9*tVgs@ovyT<`p;#IH_CslXOzxS;)N(3A7z7iOaMT zCQKMqg$gHwlWs)U=mu~;l&uWS_IjSkL zDCZ`w2XD9l5Pzd6Keo4p{7}CeYp1xywDZA#as5?RU*Z{`<|%=YqeHu+-T5>9HX$pM z-vq?R#rHirj(A3~GCehGkwV%!YOSq%`TK0CpYPfp5e`^Y!F@8{vK zPOO%kQ#;cX|Fksmnks);Vg)%u!m&|hLNc#Y#ndwDe(6TUNA@fxl34w<^R>Ab)$M}8 zj0;_l*j42BXhbGGJ`yi;j42njn#u{sv5^)B?j)LCv?H4pmnVIHVE2La?Zw>v~9pbyMjA;Uszy4rP zNtt8g;E7}gj>36gshyi-!+XI*tPametEN@ zXM3TzCL?`{|H;tS*M{l3P(0en>E_VvK=8pq-0k6y!%3Hq=;QW}LZT*sm7XHUeFPWO+ z=99vE$KNUSC4TVcsr|bCe*f$J$Hxouy-OGAYfgQ?ur7c2vk~@!{DyGow-9_bWvZx| zcdn%XutnbjVwcIRVxb#ro+S#^$HoDn1y`95;V)5D(xWa8!mG&0H9N^pgzWXqgT-WRnUz zaq^UL&AD_y$TUA+Y<{kK%Xx_46C8w7pH~ype!<0Cx5U!D5MRI{`)Q+eqhy?2U-*-z z3}1Zv7qSO3RyyXobB>YTSj?n@a*#h!F!L_k&z86nlsenj_Y%jyYv-s2+ zt`B^A`J#i)#l_V~+l6TfUft-jdwN~j>k*l9qHXGUdR3js_ME3gRP-BX9o~waNwwlk z#`eJ+-ZRlBWLh5%3^;>(qME#*lDS@Td`$*SyjmEeZ0E#4?RP$05@PD1m*u$F{kxWk zD7M>`U6;O*lNeMTHjIl%C{_l{n-+o=J=tiWloP6+5QSWh>bMI@GnY_^78Yr#PHN9Ivel{D2B^tXgnG{jac%6^C`EVok^(?4>tc2tKU&t zJW-x~)dN#nU{Ab|6sKRtxnkJlh~h;%5j6Ot6V!*CB`=742Aa}$voqwItCcX@t0aun 
zG6~H_gE2p%6V_J3Ij>fwKi%CG49p)a^+Ku2+mGLw@itazpWWD1PVaY?iw=t%OsZ2$ zy4~jBVjLe!5RHg-sQlD~K3k!0KL1U{ivZ?Qx3#y{%e@<&BA=`$g1F3ki%FoW+tn?6 zoJ-cyFZdkjx*&_`&!jw4;kK8h2at(6bYTNL`f&BVz5el)*`|-GCXY<5IeTe?ZZ041 z=G4tI`ff~|;@3}*36y*PmhlQ_%iT8LbIr`hS<>Qa;tZHSCEmE0=xE527qg4tePeVb z0#k5{*7^eRkumSyYyn7xr&)72$N&Djcr^W&UD}cFrK8*|%SO42juVxqJ=70?mW!BI zEOSy-;wp-{svDV)(;d7{&n707F&GxGVeT{EVa{e8Yk#+kI`pJ{OvuE z(>pcSS=+Hv55^ep8{Qhq?BYHAvmho)l;hfg_IZ(>66SI(52hSd#XFl&UCUVb7Q2Jt ziR$(7oBZlm6gvWIfn1JNOFE8{?JNleCh|Af^vh1G=-|&+p9?&^%H%1}cOX%xIqxHq zYb4i7DvEkCTwf;5bM?xbBrP{@&WqV07wJP zaSM$+HCd7GTDvrlj8sr%x2x(#{$RIw;Apz-{zN1(<64<=-tF0isJedP5-|(Ai_3bK zd%ojTdr9H%*svT5Oyy6b%3rs-SC{hJ6>Ir|9|=IIG!#+ka=7FS)xFcGyKNy%&CH6(RG;QNWYlr5ZV|;q1wSI zaZH@9Q3^_b(~-yhyi{;q;#f>S;?2^KUX>E<=PKZJ@lP2^uj!cd#iy;jwm)7D)4nM# ze&seRWAJ6;dd3aQVG`Td2uUpuWwGmPO7&;6_?Cxm5|Ik;#a{Dy2dmten~f!%d6^}` zJ!V1J;||#7W+_ezI@GHV6}i8o_qzQ6TQ%`x_{*Nfi-q?y{#k)(#i417AA~N~xYjJc z;;u=r37mJe%{~NNiaZi*wn{^qhrawg_z_=a%^RV8@7?1c+SA(iwY!8?g_@ac=i*IU z;!sA4ckXre)FXTXuK8io_WC-%dVL6)#*QfBuRkn*xGEFWZ=tX*IH2fqJ$Xxko0K9O zkiMvIr7x?uWv76vez;s>E)YPf+}>;{XYyH?cj{BHzCd~?BA>@ob(O*Y!RLh{eUIlr zS%{?kbWsg&O`un+_k%`qLy*B#Ot*5K4bSdO=mj;Vbb-Qv^}bUAB!i3^MW3V2pL;z{ zdLfZPuABRUX`7~2v!&c_g$_lPgD%`X{+l-g7HMC-th#Mnv_TVTGm1So0W-$)Ef-Wt zd8abCLa+>gr@CX6ihZFbg}PvAkxA-{a_E%uL4Opee(N9R)5VS4%tpS`W?HZgrK z)z7!5JPQf*-)6iN(0-1ScVad}cy8;kpI4EwNW3nkEhLN8eCDg6%IO->MCHa3`-#&< zhsI(yEmDR?H`mYCqkFTt1=sX(PZMX+x-og{aas@|L-ffqR8tVc=ksC9EW^`T#{+YF z(hFHf^q$V&BYbcBW9aaR7AXTv{9n4L)cz>7bt(;mL;ur8MgG-BH4gB?fiQSaqAJeE zf#Bjr#8aBWlvh2h2M(m7OufW)TSA08;jGzaO6LRfGit3aautv8?B3Z5ov z2P=C15Z-W^wfY1jBsm-eJs#Gy9O;iRmYQx~^2$P)a%=GH+SHay)vgDhP&jeD_J0A0 z%wLK84;Da1zV<{aFpLSlxL;HitPhS_jsF`GE(9MU+8IlrmRbk<=Nbh7nYlO;oqep3 z6jUG}U?7}=6BtScNJR+(fdEQ_!M|=83gXynd7V6~6)22?sL3 z5~!esQ*D3@aXxrof&0?E?DYbpn4=NBP|6*N=wUtrKKSd6aI`*%QxyHjQ6IoRyaW8@9!21ne^i&KDc5#!^sxf|ih=ry#`{vU z^qUFDZ!@G{Hqcng&c-`Y^5H}Y|5rKyAbl^Kr?La}kg)ofE|9r}CD7B?-ThxbV2W-X ztf!MG&Qnrd<Ykp1cQ-KFc=|2dBQ1A3S5v-gnAs$ab)i3k{K$@P{ 
z@Ic6~K_ulx6=(&NgQHnI00R9K@F%xhA

literal 0
HcmV?d00001

diff --git a/attachment_indexation_mupdf/tests/test_indexation.py b/attachment_indexation_mupdf/tests/test_indexation.py
new file mode 100644
index 00000000..90b61802
--- /dev/null
+++ b/attachment_indexation_mupdf/tests/test_indexation.py
@@ -0,0 +1,28 @@
+# Copyright 2023 len-foss/Financial Way
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+import os
+from unittest import skipIf
+
+from odoo.tests.common import TransactionCase, tagged
+
+directory = os.path.dirname(__file__)
+
+try:
+    import fitz
+except ImportError:
+    fitz = None
+
+
+@tagged("post_install", "-at_install")
+class TestCaseIndexation(TransactionCase):
+    @skipIf(fitz is None, "PyMuPDF is not installed")
+    def test_attachment_pdf_indexation(self):
+        """Indexing the bundled sample PDF yields its exact text content."""
+        with open(os.path.join(directory, "files", "test_content.pdf"), "rb") as file:
+            pdf = file.read()
+        text = self.env["ir.attachment"]._index(pdf, "application/pdf")
+        # note that the whitespace character is not the same as with pdfminer
+        self.assertEqual(
+            text, "TestContent!!\n", "the index content should be correct"
+        )
diff --git a/setup/attachment_indexation_mupdf/odoo/addons/attachment_indexation_mupdf b/setup/attachment_indexation_mupdf/odoo/addons/attachment_indexation_mupdf
new file mode 120000
index 00000000..0ba2d648
--- /dev/null
+++ b/setup/attachment_indexation_mupdf/odoo/addons/attachment_indexation_mupdf
@@ -0,0 +1 @@
+../../../../attachment_indexation_mupdf
\ No newline at end of file
diff --git a/setup/attachment_indexation_mupdf/setup.py b/setup/attachment_indexation_mupdf/setup.py
new file mode 100644
index 00000000..28c57bb6
--- /dev/null
+++ b/setup/attachment_indexation_mupdf/setup.py
@@ -0,0 +1,6 @@
+import setuptools
+
+setuptools.setup(
+    setup_requires=['setuptools-odoo'],
+    odoo_addon=True,
+)