From 1705cefe6b31b45819ab492e9b477bf87d0f2a74 Mon Sep 17 00:00:00 2001 From: Carlos Almeida Date: Tue, 30 May 2017 15:54:45 +0100 Subject: [PATCH 01/16] [MIG] Migrate document-ocr to v10.0 --- document_ocr/README.rst | 86 ++++++++++++++++++++++ document_ocr/__init__.py | 4 + document_ocr/__manifest__.py | 23 ++++++ document_ocr/data/ir_config_parameter.xml | 13 ++++ document_ocr/data/ir_cron.xml | 13 ++++ document_ocr/models/__init__.py | 4 + document_ocr/models/ir_attachment.py | 85 +++++++++++++++++++++ document_ocr/static/description/icon.png | Bin 0 -> 9455 bytes document_ocr/tests/__init__.py | 4 + document_ocr/tests/test_document_ocr.py | 49 ++++++++++++ 10 files changed, 281 insertions(+) create mode 100644 document_ocr/README.rst create mode 100644 document_ocr/__init__.py create mode 100644 document_ocr/__manifest__.py create mode 100644 document_ocr/data/ir_config_parameter.xml create mode 100644 document_ocr/data/ir_cron.xml create mode 100644 document_ocr/models/__init__.py create mode 100644 document_ocr/models/ir_attachment.py create mode 100644 document_ocr/static/description/icon.png create mode 100644 document_ocr/tests/__init__.py create mode 100644 document_ocr/tests/test_document_ocr.py diff --git a/document_ocr/README.rst b/document_ocr/README.rst new file mode 100644 index 00000000..7f9c3b28 --- /dev/null +++ b/document_ocr/README.rst @@ -0,0 +1,86 @@ +.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg + :target: http://www.gnu.org/licenses/agpl-3.0-standalone.html + :alt: License: AGPL-3 + +================= +OCR for documents +================= + +This module was written to make uploaded documents, for example scans, searchable by running OCR on them. + +It supports all image formats `Pillow supports `_ for reading and PDFs. + +Installation +============ + +To install this module, you need to: + +#. install tesseract and the language(s) your documents use +#. if you want to support OCR on PDFs, install imagemagick +#. install the module itself + +On an Debian or Ubuntu system you would typically run:: + + $ sudo apt-get install tesseract-ocr imagemagick + + +Configuration +============= + +To configure this module, go to: + +#. Settings/Technical/Parameters/System parameters and review the parameters with names document_ocr.* + +Usage +===== + +By default, character recognition is done asynchronously by a cronjob at night. +This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish. +The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ` `Settings``. +In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``. + +.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas + :alt: Try me on Runbot + :target: https://runbot.odoo-community.org/runbot/118/10.0 + +Bug Tracker +=========== + +Bugs are tracked on `GitHub Issues `_. +In case of trouble, please check there if your issue has already been reported. +If you spotted it first, help us smashing it by providing a detailed and welcomed feedback. + +Credits +======= + +The actual work +--------------- + +* `tesseract `_ + +Images +------ + +* Odoo Community Association: `Icon `_. + +Contributors +------------ + +* Holger Brunn + +Do not contact contributors directly about help with questions or problems concerning this addon, but use the `community mailing list `_ or the `appropriate specialized mailinglist `_ for help, and the bug tracker linked in `Bug Tracker`_ above for technical issues. + +Maintainer +---------- + +.. image:: https://odoo-community.org/logo.png + :alt: Odoo Community Association + :target: https://odoo-community.org + +This module is maintained by the OCA. + +OCA, or the Odoo Community Association, is a nonprofit organization whose +mission is to support the collaborative development of Odoo features and +promote its widespread use. + +To contribute to this module, please visit https://odoo-community.org. diff --git a/document_ocr/__init__.py b/document_ocr/__init__.py new file mode 100644 index 00000000..7eda98a2 --- /dev/null +++ b/document_ocr/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). +from . import models diff --git a/document_ocr/__manifest__.py b/document_ocr/__manifest__.py new file mode 100644 index 00000000..382e77d6 --- /dev/null +++ b/document_ocr/__manifest__.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). +{ + "name": "OCR for documents", + "version": "10.0.1.0.0", + "author": "Therp BV,Odoo Community Association (OCA), TKO Brasil", + "license": "AGPL-3", + "category": "Knowledge Management", + "summary": "Run character recognition on uploaded files", + "depends": [ + 'document', + ], + "data": [ + "data/ir_cron.xml", + "data/ir_config_parameter.xml", + ], + "external_dependencies": { + 'bin': [ + 'tesseract', + ], + }, +} diff --git a/document_ocr/data/ir_config_parameter.xml b/document_ocr/data/ir_config_parameter.xml new file mode 100644 index 00000000..e46db18a --- /dev/null +++ b/document_ocr/data/ir_config_parameter.xml @@ -0,0 +1,13 @@ + + + + + document_ocr.synchronous + False + + + document_ocr.dpi + 300 + + + diff --git a/document_ocr/data/ir_cron.xml b/document_ocr/data/ir_cron.xml new file mode 100644 index 00000000..f69d151a --- /dev/null +++ b/document_ocr/data/ir_cron.xml @@ -0,0 +1,13 @@ + + + + + Run OCR on uploaded documents + days + 1 + ir.attachment + _ocr_cron + -1 + + + diff --git a/document_ocr/models/__init__.py b/document_ocr/models/__init__.py new file mode 100644 index 00000000..a15f1b21 --- /dev/null +++ b/document_ocr/models/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). +from . import ir_attachment diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py new file mode 100644 index 00000000..b27992c8 --- /dev/null +++ b/document_ocr/models/ir_attachment.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). +import logging +import subprocess +from StringIO import StringIO + +from PIL import Image +from openerp import api, models + +_logger = logging.getLogger(__name__) +_MARKER_PHRASE = '[[waiting for OCR]]' + + +class IrAttachment(models.Model): + _inherit = 'ir.attachment' + + @api.model + def _index(self, data, datas_fname, file_type): + mimetype, content = super(IrAttachment, self)._index( + data, datas_fname, file_type) + if not content or content == 'image': + has_synchr_param = self.env['ir.config_parameter'].get_param( + 'document_ocr.synchronous', 'False') == 'True' + has_force_flag = self.env.context.get('document_ocr_force') + if has_synchr_param or has_force_flag: + content = self._index_ocr(mimetype, data, datas_fname, + file_type) + else: + content = _MARKER_PHRASE + + return mimetype, content + + @api.model + def _index_ocr(self, mimetype, data, datas_fname, file_type): + dpi = int( + self.env['ir.config_parameter'].get_param( + 'document_ocr.dpi', '500')) + top_type, sub_type = mimetype.split('/', 1) + if hasattr(self, '_index_ocr_get_data_%s' % sub_type): + image_data = getattr(self, '_index_ocr_get_data_%s' % sub_type)( + data, datas_fname, file_type, dpi) + else: + image_data = StringIO() + try: + Image.open(StringIO(data)).save(image_data, 'tiff', + dpi=(dpi, dpi)) + except IOError: + _logger.exception('Failed to OCR image') + return None + process = subprocess.Popen( + ['tesseract', 'stdin', 'stdout'], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, stderr = process.communicate(image_data.getvalue()) + if stderr: + _logger.error('Error during OCR: %s', stderr) + return stdout + + @api.model + def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi): + process = subprocess.Popen( + ['convert', '-density', str(dpi), '-', '-append', 'png32:-'], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, stderr = process.communicate(data) + if stderr: + _logger.error('Error converting to PDF: %s', stderr) + return StringIO(stdout) + + @api.model + def _ocr_cron(self): + for this in self.with_context(document_ocr_force=True).search([ + ('index_content', '=', _MARKER_PHRASE), + ]): + if not this.datas: + continue + file_type, index_content = this._index( + this.datas.decode('base64'), this.datas_fname, this.file_type) + this.write({ + 'file_type': file_type, + 'index_content': index_content, + }) diff --git a/document_ocr/static/description/icon.png b/document_ocr/static/description/icon.png new file mode 100644 index 0000000000000000000000000000000000000000..3a0328b516c4980e8e44cdb63fd945757ddd132d GIT binary patch literal 9455 zcmW++2RxMjAAjx~&dlBk9S+%}OXg)AGE&Cb*&}d0jUxM@u(PQx^-s)697TX`ehR4?GS^qbkof1cslKgkU)h65qZ9Oc=ml_0temigYLJfnz{IDzUf>bGs4N!v3=Z3jMq&A#7%rM5eQ#dc?k~! zVpnB`o+K7|Al`Q_U;eD$B zfJtP*jH`siUq~{KE)`jP2|#TUEFGRryE2`i0**z#*^6~AI|YzIWy$Cu#CSLW3q=GA z6`?GZymC;dCPk~rBS%eCb`5OLr;RUZ;D`}um=H)BfVIq%7VhiMr)_#G0N#zrNH|__ zc+blN2UAB0=617@>_u;MPHN;P;N#YoE=)R#i$k_`UAA>WWCcEVMh~L_ zj--gtp&|K1#58Yz*AHCTMziU1Jzt_jG0I@qAOHsk$2}yTmVkBp_eHuY$A9)>P6o~I z%aQ?!(GqeQ-Y+b0I(m9pwgi(IIZZzsbMv+9w{PFtd_<_(LA~0H(xz{=FhLB@(1&qHA5EJw1>>=%q2f&^X>IQ{!GJ4e9U z&KlB)z(84HmNgm2hg2C0>WM{E(DdPr+EeU_N@57;PC2&DmGFW_9kP&%?X4}+xWi)( z;)z%wI5>D4a*5XwD)P--sPkoY(a~WBw;E~AW`Yue4kFa^LM3X`8x|}ZUeMnqr}>kH zG%WWW>3ml$Yez?i%)2pbKPI7?5o?hydokgQyZsNEr{a|mLdt;X2TX(#B1j35xPnPW z*bMSSOauW>o;*=kO8ojw91VX!qoOQb)zHJ!odWB}d+*K?#sY_jqPdg{Sm2HdYzdEx zOGVPhVRTGPtv0o}RfVP;Nd(|CB)I;*t&QO8h zFfekr30S!-LHmV_Su-W+rEwYXJ^;6&3|L$mMC8*bQptyOo9;>Qb9Q9`ySe3%V$A*9 zeKEe+b0{#KWGp$F+tga)0RtI)nhMa-K@JS}2krK~n8vJ=Ngm?R!9G<~RyuU0d?nz# z-5EK$o(!F?hmX*2Yt6+coY`6jGbb7tF#6nHA zuKk=GGJ;ZwON1iAfG$E#Y7MnZVmrY|j0eVI(DN_MNFJmyZ|;w4tf@=CCDZ#5N_0K= z$;R~bbk?}TpfDjfB&aiQ$VA}s?P}xPERJG{kxk5~R`iRS(SK5d+Xs9swCozZISbnS zk!)I0>t=A<-^z(cmSFz3=jZ23u13X><0b)P)^1T_))Kr`e!-pb#q&J*Q`p+B6la%C zuVl&0duN<;uOsB3%T9Fp8t{ED108<+W(nOZd?gDnfNBC3>M8WE61$So|P zVvqH0SNtDTcsUdzaMDpT=Ty0pDHHNL@Z0w$Y`XO z2M-_r1S+GaH%pz#Uy0*w$Vdl=X=rQXEzO}d6J^R6zjM1u&c9vYLvLp?W7w(?np9x1 zE_0JSAJCPB%i7p*Wvg)pn5T`8k3-uR?*NT|J`eS#_#54p>!p(mLDvmc-3o0mX*mp_ zN*AeS<>#^-{S%W<*mz^!X$w_2dHWpcJ6^j64qFBft-o}o_Vx80o0>}Du;>kLts;$8 zC`7q$QI(dKYG`Wa8#wl@V4jVWBRGQ@1dr-hstpQL)Tl+aqVpGpbSfN>5i&QMXfiZ> zaA?T1VGe?rpQ@;+pkrVdd{klI&jVS@I5_iz!=UMpTsa~mBga?1r}aRBm1WS;TT*s0f0lY=JBl66Upy)-k4J}lh=P^8(SXk~0xW=T9v*B|gzIhN z>qsO7dFd~mgxAy4V?&)=5ieYq?zi?ZEoj)&2o)RLy=@hbCRcfT5jigwtQGE{L*8<@Yd{zg;CsL5mvzfDY}P-wos_6PfprFVaeqNE%h zKZhLtcQld;ZD+>=nqN~>GvROfueSzJD&BE*}XfU|H&(FssBqY=hPCt`d zH?@s2>I(|;fcW&YM6#V#!kUIP8$Nkdh0A(bEVj``-AAyYgwY~jB zT|I7Bf@%;7aL7Wf4dZ%VqF$eiaC38OV6oy3Z#TER2G+fOCd9Iaoy6aLYbPTN{XRPz z;U!V|vBf%H!}52L2gH_+j;`bTcQRXB+y9onc^wLm5wi3-Be}U>k_u>2Eg$=k!(l@I zcCg+flakT2Nej3i0yn+g+}%NYb?ta;R?(g5SnwsQ49U8Wng8d|{B+lyRcEDvR3+`O{zfmrmvFrL6acVP%yG98X zo&+VBg@px@i)%o?dG(`T;n*$S5*rnyiR#=wW}}GsAcfyQpE|>a{=$Hjg=-*_K;UtD z#z-)AXwSRY?OPefw^iI+ z)AXz#PfEjlwTes|_{sB?4(O@fg0AJ^g8gP}ex9Ucf*@_^J(s_5jJV}c)s$`Myn|Kd z$6>}#q^n{4vN@+Os$m7KV+`}c%4)4pv@06af4-x5#wj!KKb%caK{A&Y#Rfs z-po?Dcb1({W=6FKIUirH&(yg=*6aLCekcKwyfK^JN5{wcA3nhO(o}SK#!CINhI`-I z1)6&n7O&ZmyFMuNwvEic#IiOAwNkR=u5it{B9n2sAJV5pNhar=j5`*N!Na;c7g!l$ z3aYBqUkqqTJ=Re-;)s!EOeij=7SQZ3Hq}ZRds%IM*PtM$wV z@;rlc*NRK7i3y5BETSKuumEN`Xu_8GP1Ri=OKQ$@I^ko8>H6)4rjiG5{VBM>B|%`&&s^)jS|-_95&yc=GqjNo{zFkw%%HHhS~e=s zD#sfS+-?*t|J!+ozP6KvtOl!R)@@-z24}`9{QaVLD^9VCSR2b`b!KC#o;Ki<+wXB6 zx3&O0LOWcg4&rv4QG0)4yb}7BFSEg~=IR5#ZRj8kg}dS7_V&^%#Do==#`u zpy6{ox?jWuR(;pg+f@mT>#HGWHAJRRDDDv~@(IDw&R>9643kK#HN`!1vBJHnC+RM&yIh8{gG2q zA%e*U3|N0XSRa~oX-3EAneep)@{h2vvd3Xvy$7og(sayr@95+e6~Xvi1tUqnIxoIH zVWo*OwYElb#uyW{Imam6f2rGbjR!Y3`#gPqkv57dB6K^wRGxc9B(t|aYDGS=m$&S!NmCtrMMaUg(c zc2qC=2Z`EEFMW-me5B)24AqF*bV5Dr-M5ig(l-WPS%CgaPzs6p_gnCIvTJ=Y<6!gT zVt@AfYCzjjsMEGi=rDQHo0yc;HqoRNnNFeWZgcm?f;cp(6CNylj36DoL(?TS7eU#+ z7&mfr#y))+CJOXQKUMZ7QIdS9@#-}7y2K1{8)cCt0~-X0O!O?Qx#E4Og+;A2SjalQ zs7r?qn0H044=sDN$SRG$arw~n=+T_DNdSrarmu)V6@|?1-ZB#hRn`uilTGPJ@fqEy zGt(f0B+^JDP&f=r{#Y_wi#AVDf-y!RIXU^0jXsFpf>=Ji*TeqSY!H~AMbJdCGLhC) zn7Rx+sXw6uYj;WRYrLd^5IZq@6JI1C^YkgnedZEYy<&4(z%Q$5yv#Boo{AH8n$a zhb4Y3PWdr269&?V%uI$xMcUrMzl=;w<_nm*qr=c3Rl@i5wWB;e-`t7D&c-mcQl7x! zZWB`UGcw=Y2=}~wzrfLx=uet<;m3~=8I~ZRuzvMQUQdr+yTV|ATf1Uuomr__nDf=X zZ3WYJtHp_ri(}SQAPjv+Y+0=fH4krOP@S&=zZ-t1jW1o@}z;xk8 z(Nz1co&El^HK^NrhVHa-_;&88vTU>_J33=%{if;BEY*J#1n59=07jrGQ#IP>@u#3A z;!q+E1Rj3ZJ+!4bq9F8PXJ@yMgZL;>&gYA0%_Kbi8?S=XGM~dnQZQ!yBSgcZhY96H zrWnU;k)qy`rX&&xlDyA%(a1Hhi5CWkmg(`Gb%m(HKi-7Z!LKGRP_B8@`7&hdDy5n= z`OIxqxiVfX@OX1p(mQu>0Ai*v_cTMiw4qRt3~NBvr9oBy0)r>w3p~V0SCm=An6@3n)>@z!|o-$HvDK z|3D2ZMJkLE5loMKl6R^ez@Zz%S$&mbeoqH5`Bb){Ei21q&VP)hWS2tjShfFtGE+$z zzCR$P#uktu+#!w)cX!lWN1XU%K-r=s{|j?)Akf@q#3b#{6cZCuJ~gCxuMXRmI$nGtnH+-h z+GEi!*X=AP<|fG`1>MBdTb?28JYc=fGvAi2I<$B(rs$;eoJCyR6_bc~p!XR@O-+sD z=eH`-ye})I5ic1eL~TDmtfJ|8`0VJ*Yr=hNCd)G1p2MMz4C3^Mj?7;!w|Ly%JqmuW zlIEW^Ft%z?*|fpXda>Jr^1noFZEwFgVV%|*XhH@acv8rdGxeEX{M$(vG{Zw+x(ei@ zmfXb22}8-?Fi`vo-YVrTH*C?a8%M=Hv9MqVH7H^J$KsD?>!SFZ;ZsvnHr_gn=7acz z#W?0eCdVhVMWN12VV^$>WlQ?f;P^{(&pYTops|btm6aj>_Uz+hqpGwB)vWp0Cf5y< zft8-je~nn?W11plq}N)4A{l8I7$!ks_x$PXW-2XaRFswX_BnF{R#6YIwMhAgd5F9X zGmwdadS6(a^fjHtXg8=l?Rc0Sm%hk6E9!5cLVloEy4eh(=FwgP`)~I^5~pBEWo+F6 zSf2ncyMurJN91#cJTy_u8Y}@%!bq1RkGC~-bV@SXRd4F{R-*V`bS+6;W5vZ(&+I<9$;-V|eNfLa5n-6% z2(}&uGRF;p92eS*sE*oR$@pexaqr*meB)VhmIg@h{uzkk$9~qh#cHhw#>O%)b@+(| z^IQgqzuj~Sk(J;swEM-3TrJAPCq9k^^^`q{IItKBRXYe}e0Tdr=Huf7da3$l4PdpwWDop%^}n;dD#K4s#DYA8SHZ z&1!riV4W4R7R#C))JH1~axJ)RYnM$$lIR%6fIVA@zV{XVyx}C+a-Dt8Y9M)^KU0+H zR4IUb2CJ{Hg>CuaXtD50jB(_Tcx=Z$^WYu2u5kubqmwp%drJ6 z?Fo40g!Qd<-l=TQxqHEOuPX0;^z7iX?Ke^a%XT<13TA^5`4Xcw6D@Ur&VT&CUe0d} z1GjOVF1^L@>O)l@?bD~$wzgf(nxX1OGD8fEV?TdJcZc2KoUe|oP1#=$$7ee|xbY)A zDZq+cuTpc(fFdj^=!;{k03C69lMQ(|>uhRfRu%+!k&YOi-3|1QKB z z?n?eq1XP>p-IM$Z^C;2L3itnbJZAip*Zo0aw2bs8@(s^~*8T9go!%dHcAz2lM;`yp zD=7&xjFV$S&5uDaiScyD?B-i1ze`+CoRtz`Wn+Zl&#s4&}MO{@N!ufrzjG$B79)Y2d3tBk&)TxUTw@QS0TEL_?njX|@vq?Uz(nBFK5Pq7*xj#u*R&i|?7+6# z+|r_n#SW&LXhtheZdah{ZVoqwyT{D>MC3nkFF#N)xLi{p7J1jXlmVeb;cP5?e(=f# zuT7fvjSbjS781v?7{)-X3*?>tq?)Yd)~|1{BDS(pqC zC}~H#WXlkUW*H5CDOo<)#x7%RY)A;ShGhI5s*#cRDA8YgqG(HeKDx+#(ZQ?386dv! zlXCO)w91~Vw4AmOcATuV653fa9R$fyK8ul%rG z-wfS zihugoZyr38Im?Zuh6@RcF~t1anQu7>#lPpb#}4cOA!EM11`%f*07RqOVkmX{p~KJ9 z^zP;K#|)$`^Rb{rnHGH{~>1(fawV0*Z#)}M`m8-?ZJV<+e}s9wE# z)l&az?w^5{)`S(%MRzxdNqrs1n*-=jS^_jqE*5XDrA0+VE`5^*p3CuM<&dZEeCjoz zR;uu_H9ZPZV|fQq`Cyw4nscrVwi!fE6ciMmX$!_hN7uF;jjKG)d2@aC4ropY)8etW=xJvni)8eHi`H$%#zn^WJ5NLc-rqk|u&&4Z6fD_m&JfSI1Bvb?b<*n&sfl0^t z=HnmRl`XrFvMKB%9}>PaA`m-fK6a0(8=qPkWS5bb4=v?XcWi&hRY?O5HdulRi4?fN zlsJ*N-0Qw+Yic@s0(2uy%F@ib;GjXt01Fmx5XbRo6+n|pP(&nodMoap^z{~q ziEeaUT@Mxe3vJSfI6?uLND(CNr=#^W<1b}jzW58bIfyWTDle$mmS(|x-0|2UlX+9k zQ^EX7Nw}?EzVoBfT(-LT|=9N@^hcn-_p&sqG z&*oVs2JSU+N4ZD`FhCAWaS;>|wH2G*Id|?pa#@>tyxX`+4HyIArWDvVrX)2WAOQff z0qyHu&-S@i^MS-+j--!pr4fPBj~_8({~e1bfcl0wI1kaoN>mJL6KUPQm5N7lB(ui1 zE-o%kq)&djzWJ}ob<-GfDlkB;F31j-VHKvQUGQ3sp`CwyGJk_i!y^sD0fqC@$9|jO zOqN!r!8-p==F@ZVP=U$qSpY(gQ0)59P1&t@y?5rvg<}E+GB}26NYPp4f2YFQrQtot5mn3wu_qprZ=>Ig-$ zbW26Ws~IgY>}^5w`vTB(G`PTZaDiGBo5o(tp)qli|NeV( z@H_=R8V39rt5J5YB2Ky?4eJJ#b`_iBe2ot~6%7mLt5t8Vwi^Jy7|jWXqa3amOIoRb zOr}WVFP--DsS`1WpN%~)t3R!arKF^Q$e12KEqU36AWwnCBICpH4XCsfnyrHr>$I$4 z!DpKX$OKLWarN7nv@!uIA+~RNO)l$$w}p(;b>mx8pwYvu;dD_unryX_NhT8*Tj>BTrTTL&!?O+%Rv;b?B??gSzdp?6Uug9{ zd@V08Z$BdI?fpoCS$)t4mg4rT8Q_I}h`0d-vYZ^|dOB*Q^S|xqTV*vIg?@fVFSmMpaw0qtTRbx} z({Pg?#{2`sc9)M5N$*N|4;^t$+QP?#mov zGVC@I*lBVrOU-%2y!7%)fAKjpEFsgQc4{amtiHb95KQEwvf<(3T<9-Zm$xIew#P22 zc2Ix|App^>v6(3L_MCU0d3W##AB0M~3D00EWoKZqsJYT(#@w$Y_H7G22M~ApVFTRHMI_3be)Lkn#0F*V8Pq zc}`Cjy$bE;FJ6H7p=0y#R>`}-m4(0F>%@P|?7fx{=R^uFdISRnZ2W_xQhD{YuR3t< z{6yxu=4~JkeA;|(J6_nv#>Nvs&FuLA&PW^he@t(UwFFE8)|a!R{`E`K`i^ZnyE4$k z;(749Ix|oi$c3QbEJ3b~D_kQsPz~fIUKym($a_7dJ?o+40*OLl^{=&oq$<#Q(yyrp z{J-FAniyAw9tPbe&IhQ|a`DqFTVQGQ&Gq3!C2==4x{6EJwiPZ8zub-iXoUtkJiG{} zPaR&}_fn8_z~(=;5lD-aPWD3z8PZS@AaUiomF!G8I}Mf>e~0g#BelA-5#`cj;O5>N Xviia!U7SGha1wx#SCgwmn*{w2TRX*I literal 0 HcmV?d00001 diff --git a/document_ocr/tests/__init__.py b/document_ocr/tests/__init__.py new file mode 100644 index 00000000..7bdf742c --- /dev/null +++ b/document_ocr/tests/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). +from . import test_document_ocr diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py new file mode 100644 index 00000000..b1695da8 --- /dev/null +++ b/document_ocr/tests/test_document_ocr.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). +from StringIO import StringIO + +from PIL import Image, ImageDraw, ImageFont +from openerp.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE +from openerp.tests.common import TransactionCase + + +class TestDocumentOcr(TransactionCase): + def test_document_ocr(self): + self.env['ir.config_parameter'].set_param( + 'document_ocr.synchronous', 'True') + test_image = Image.new('RGB', (200, 30)) + draw = ImageDraw.Draw(test_image) + draw.text((3, 3), "Hello world", font=ImageFont.truetype( + '/usr/share/fonts/truetype/inconsolata/Inconsolata.otf', 24)) + # test a plain image + data = StringIO() + test_image.save(data, 'png') + result = self.env['ir.attachment']._index( + data.getvalue(), 'test.png', None) + self.assertEqual(result[1].strip(), 'Hello world') + # should also work for pdfs + data = StringIO() + test_image.save(data, 'pdf', resolution=300) + result = self.env['ir.attachment']._index( + data.getvalue(), 'test.pdf', None) + self.assertEqual(result[1].strip(), 'Hello world') + # check cron + self.env['ir.config_parameter'].set_param( + 'document_ocr.synchronous', 'False') + attachment = self.env['ir.attachment'].create({ + 'name': 'testattachment', + 'datas': data.getvalue().encode('base64'), + }) + self.assertEqual(attachment.index_content, _MARKER_PHRASE) + attachment._ocr_cron() + self.assertEqual(attachment.index_content.strip(), 'Hello world') + # and for an unreadable image, we expect an error + self.env['ir.config_parameter'].set_param( + 'document_ocr.synchronous', 'True') + data = StringIO() + test_image = Image.new('1', (200, 30)) + test_image.save(data, 'Palm') + result = self.env['ir.attachment']._index( + data.getvalue(), 'test.palm', None) + self.assertEqual(result[1], None) From a58c40621cec20937b68cc943218d3a563f51228 Mon Sep 17 00:00:00 2001 From: Carlos Almeida Date: Thu, 1 Jun 2017 20:03:58 +0100 Subject: [PATCH 02/16] [MIG] Migration of document_ocr module to 10.0 --- document_ocr/README.rst | 15 ++ document_ocr/__init__.py | 1 + document_ocr/__manifest__.py | 6 +- document_ocr/data/ir_config_parameter.xml | 8 + document_ocr/models/__init__.py | 1 + document_ocr/models/ir_attachment.py | 263 ++++++++++++++++++---- document_ocr/tests/__init__.py | 1 + document_ocr/tests/test_document_ocr.py | 5 +- document_ocr/views/ir_attachment_view.xml | 43 ++++ 9 files changed, 293 insertions(+), 50 deletions(-) create mode 100644 document_ocr/views/ir_attachment_view.xml diff --git a/document_ocr/README.rst b/document_ocr/README.rst index 7f9c3b28..c500f0b1 100644 --- a/document_ocr/README.rst +++ b/document_ocr/README.rst @@ -39,6 +39,21 @@ This is because the recognition process takes a while and you don't want to make The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ` `Settings``. In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``. + +By default, recognition language is set to english. +In case you want to use a different default, set configuration parameter ``document_ocr.language`` to value respective value ex:``por``, for Portuguese. + + +In PDF case, OCR will run after it will be converted to an image. But OCR will be applied to all PDFs. + + +System parameters used: +#``document_ocr.synchronous``: bool +#``document_ocr.language``: string +#``document_ocr.dpi``: integer +#``document_ocr.quality``: integer + + .. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas :alt: Try me on Runbot :target: https://runbot.odoo-community.org/runbot/118/10.0 diff --git a/document_ocr/__init__.py b/document_ocr/__init__.py index 7eda98a2..472456b6 100644 --- a/document_ocr/__init__.py +++ b/document_ocr/__init__.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- # © 2016 Therp BV +# © 2017 ThinkOpen Solutions # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). from . import models diff --git a/document_ocr/__manifest__.py b/document_ocr/__manifest__.py index 382e77d6..39d783d1 100644 --- a/document_ocr/__manifest__.py +++ b/document_ocr/__manifest__.py @@ -2,9 +2,9 @@ # © 2016 Therp BV # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). { - "name": "OCR for documents", + "name": "OCR for Documents", "version": "10.0.1.0.0", - "author": "Therp BV,Odoo Community Association (OCA), TKO Brasil", + "author": "Therp BV, Odoo Community Association (OCA), ThinkOpen Solutions Brasil", "license": "AGPL-3", "category": "Knowledge Management", "summary": "Run character recognition on uploaded files", @@ -14,10 +14,12 @@ "data": [ "data/ir_cron.xml", "data/ir_config_parameter.xml", + "views/ir_attachment_view.xml", ], "external_dependencies": { 'bin': [ 'tesseract', + 'convert', ], }, } diff --git a/document_ocr/data/ir_config_parameter.xml b/document_ocr/data/ir_config_parameter.xml index e46db18a..721a0740 100644 --- a/document_ocr/data/ir_config_parameter.xml +++ b/document_ocr/data/ir_config_parameter.xml @@ -9,5 +9,13 @@ document_ocr.dpi 300 + + document_ocr.quality + 100 + + + document_ocr.language + eng + diff --git a/document_ocr/models/__init__.py b/document_ocr/models/__init__.py index a15f1b21..051b3ddf 100644 --- a/document_ocr/models/__init__.py +++ b/document_ocr/models/__init__.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- # © 2016 Therp BV +# © 2017 ThinkOpen Solutions # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). from . import ir_attachment diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py index b27992c8..f28e1fc9 100644 --- a/document_ocr/models/ir_attachment.py +++ b/document_ocr/models/ir_attachment.py @@ -1,85 +1,256 @@ # -*- coding: utf-8 -*- # © 2016 Therp BV +# © 2017 ThinkOpen Solutions # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). + +import io import logging import subprocess from StringIO import StringIO -from PIL import Image -from openerp import api, models +import pyPdf +from odoo import api, fields, models +from odoo.exceptions import UserError _logger = logging.getLogger(__name__) _MARKER_PHRASE = '[[waiting for OCR]]' +OCR_LANGUAGE = [('afr', 'Afrikaans'), + ('amh', 'Amharic'), + ('ara', 'Arabic'), + ('asm', 'Assamese'), + ('aze', 'Azerbaijani'), + ('aze_cyrl', 'Azerbaijani - Cyrilic'), + ('bel', 'Belarusian'), + ('ben', 'Bengali'), + ('bod', 'Tibetan'), + ('bos', 'Bosnian'), + ('bul', 'Bulgarian'), + ('cat', 'Catalan; Valencian'), + ('ceb', 'Cebuano'), + ('ces', 'Czech'), + ('chi_sim', 'Chinese - Simplified'), + ('chi_tra', 'Chinese - Traditional'), + ('chr', 'Cherokee'), + ('cym', 'Welsh'), + ('dan', 'Danish'), + ('dan_frak', 'Danish - Fraktur'), + ('deu', 'German'), + ('deu_frak', 'German - Fraktur'), + ('dzo', 'Dzongkha'), + ('ell', 'Greek, Modern (1453-)'), + ('eng', 'English'), + ('enm', 'English, Middle (1100-1500)'), + ('epo', 'Esperanto'), + ('equ', 'Math / equation detection module'), + ('est', 'Estonian'), + ('eus', 'Basque'), + ('fas', 'Persian'), + ('fin', 'Finnish'), + ('fra', 'French'), + ('frk', 'Frankish'), + ('frm', 'French, Middle (ca.1400-1600)'), + ('gle', 'Irish'), + ('glg', 'Galician'), + ('grc', 'Greek, Ancient (to 1453)'), + ('guj', 'Gujarati'), + ('hat', 'Haitian; Haitian Creole'), + ('heb', 'Hebrew'), + ('hin', 'Hindi'), + ('hrv', 'Croatian'), + ('hun', 'Hungarian'), + ('iku', 'Inuktitut'), + ('ind', 'Indonesian'), + ('isl', 'Icelandic'), + ('ita', 'Italian'), + ('ita_old', 'Italian - Old'), + ('jav', 'Javanese'), + ('jpn', 'Japanese'), + ('kan', 'Kannada'), + ('kat', 'Georgian'), + ('kat_old', 'Georgian - Old'), + ('kaz', 'Kazakh'), + ('khm', 'Central Khmer'), + ('kir', 'Kirghiz; Kyrgyz'), + ('kor', 'Korean'), + ('kur', 'Kurdish'), + ('lao', 'Lao'), + ('lat', 'Latin'), + ('lav', 'Latvian'), + ('lit', 'Lithuanian'), + ('mal', 'Malayalam'), + ('mar', 'Marathi'), + ('mkd', 'Macedonian'), + ('mlt', 'Maltese'), + ('msa', 'Malay'), + ('mya', 'Burmese'), + ('nep', 'Nepali'), + ('nld', 'Dutch; Flemish'), + ('nor', 'Norwegian'), + ('ori', 'Oriya'), + ('osd', 'Orientation and script detection module'), + ('pan', 'Panjabi; Punjabi'), + ('pol', 'Polish'), + ('por', 'Portuguese'), + ('pus', 'Pushto; Pashto'), + ('ron', 'Romanian; Moldavian; Moldovan'), + ('rus', 'Russian'), + ('san', 'Sanskrit'), + ('sin', 'Sinhala; Sinhalese'), + ('slk', 'Slovak'), + ('slk_frak', 'Slovak - Fraktur'), + ('slv', 'Slovenian'), + ('spa', 'Spanish; Castilian'), + ('spa_old', 'Spanish; Castilian - Old'), + ('sqi', 'Albanian'), + ('srp', 'Serbian'), + ('srp_latn', 'Serbian - Latin'), + ('swa', 'Swahili'), + ('swe', 'Swedish'), + ('syr', 'Syriac'), + ('tam', 'Tamil'), + ('tel', 'Telugu'), + ('tgk', 'Tajik'), + ('tgl', 'Tagalog'), + ('tha', 'Thai'), + ('tir', 'Tigrinya'), + ('tur', 'Turkish'), + ('uig', 'Uighur; Uyghur'), + ('ukr', 'Ukrainian'), + ('urd', 'Urdu'), + ('uzb', 'Uzbek'), + ('uzb_cyrl', 'Uzbek - Cyrilic'), + ('vie', 'Vietnamese'), + ('yid', 'Yiddish'), ] class IrAttachment(models.Model): _inherit = 'ir.attachment' + language = fields.Selection(OCR_LANGUAGE, 'Language') + # We need to redefine index_content field to be able to update it + # on the onchange_language() + index_content = fields.Text('Indexed Content', readonly=False, prefetch=False) + index_content_rel = fields.Text(related='index_content', string='Indexed Content Rel') + + @api.onchange('language') + def onchange_language(self): + process = subprocess.Popen(['tesseract', '--list-langs'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + stdout, stderr = process.communicate() + if self.language not in stderr.split('\n'): + raise UserError( + "Language not installed." + " Please ask your system administrator to" + " install tesseract '%s' language." % + self.language) + if self.store_fname: + bin_data = self._file_read(self.store_fname) + else: + bin_data = self.db_datas + index_content = self._index( + bin_data.decode('base64'), self.datas_fname, self.mimetype) + return {'value': { + 'index_content': index_content}} + @api.model - def _index(self, data, datas_fname, file_type): - mimetype, content = super(IrAttachment, self)._index( - data, datas_fname, file_type) + def _index(self, bin_data, datas_fname, mimetype): + if not self.language: + # Set default language + self.language = self.env['ir.config_parameter'].get_param( + 'document_ocr.language', 'eng') + content = super(IrAttachment, self)._index( + bin_data, datas_fname, mimetype) if not content or content == 'image': has_synchr_param = self.env['ir.config_parameter'].get_param( 'document_ocr.synchronous', 'False') == 'True' has_force_flag = self.env.context.get('document_ocr_force') - if has_synchr_param or has_force_flag: - content = self._index_ocr(mimetype, data, datas_fname, - file_type) + synchr = has_synchr_param or has_force_flag + if synchr: + content = self._index_ocr(bin_data) else: content = _MARKER_PHRASE + return content - return mimetype, content - - @api.model - def _index_ocr(self, mimetype, data, datas_fname, file_type): - dpi = int( - self.env['ir.config_parameter'].get_param( - 'document_ocr.dpi', '500')) - top_type, sub_type = mimetype.split('/', 1) - if hasattr(self, '_index_ocr_get_data_%s' % sub_type): - image_data = getattr(self, '_index_ocr_get_data_%s' % sub_type)( - data, datas_fname, file_type, dpi) - else: - image_data = StringIO() - try: - Image.open(StringIO(data)).save(image_data, 'tiff', - dpi=(dpi, dpi)) - except IOError: - _logger.exception('Failed to OCR image') - return None + def _index_ocr(self, bin_data): process = subprocess.Popen( - ['tesseract', 'stdin', 'stdout'], + ['tesseract', 'stdin', 'stdout', '-l', self.language], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) - stdout, stderr = process.communicate(image_data.getvalue()) + stdout, stderr = process.communicate(bin_data) if stderr: _logger.error('Error during OCR: %s', stderr) return stdout - @api.model - def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi): - process = subprocess.Popen( - ['convert', '-density', str(dpi), '-', '-append', 'png32:-'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - stdout, stderr = process.communicate(data) - if stderr: - _logger.error('Error converting to PDF: %s', stderr) - return StringIO(stdout) + def _index_pdf(self, bin_data): + + def convert_bin_to_image(self, bin_data): + dpi = int(self.env['ir.config_parameter'].get_param( + 'document_ocr.dpi', '500')) + quality = int(self.env['ir.config_parameter'].get_param( + 'document_ocr.quality', '100')) + process = subprocess.Popen( + ['convert', '-density', str(dpi), + '-quality', str(quality), + '-', '-append', 'png32:-'], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + stdout, stderr = process.communicate(bin_data) + if stderr: + _logger.error('Error converting PDF to image: %s', stderr) + return stdout + + def _convert_pdf_page_to_image(self, pdf, pagenum): + dst_pdf = pyPdf.PdfFileWriter() + dst_pdf.addPage(pdf.getPage(pagenum)) + pdf_bytes = io.BytesIO() + dst_pdf.write(pdf_bytes) + pdf_bytes.seek(0) + return convert_bin_to_image(self, pdf_bytes.read()) + + has_synchr_param = self.env['ir.config_parameter'].get_param( + 'document_ocr.synchronous', 'False') == 'True' + has_force_flag = self.env.context.get('document_ocr_force') + synchr = has_synchr_param or has_force_flag + if synchr: + buf = super(IrAttachment, self)._index_pdf(bin_data) + if len(buf.split('\n')) < 2 and bin_data.startswith('%PDF-'): + # If we got less than 2 lines, run OCR and append to existent text + try: + f = StringIO(bin_data) + pdf = pyPdf.PdfFileReader(f) + if pdf.getNumPages() > 1: + for pagenum in range(0, pdf.getNumPages()): + _logger.info('OCR PDF "%s" page %d/%d...', + self.datas_fname, + pagenum + 1, + pdf.getNumPages()) + pdf_image = _convert_pdf_page_to_image(self, pdf, + pagenum) + index_content = self._index_ocr(pdf_image) + buf = u'%s\n-- %d --\n%s' % ( + buf, pagenum + 1, index_content.decode('utf8')) + else: + _logger.info('OCR PDF "%s"...', self.datas_fname) + pdf_image = convert_bin_to_image(self, bin_data) + index_content = self._index_ocr(pdf_image) + buf = u'%s\n%s' % (buf, index_content.decode('utf8')) + except Exception as e: + _logger.error('Error converting PDF to image: %s', e) + pass + else: + buf = _MARKER_PHRASE + return buf @api.model def _ocr_cron(self): - for this in self.with_context(document_ocr_force=True).search([ - ('index_content', '=', _MARKER_PHRASE), - ]): + for this in self.with_context(document_ocr_force=True).search( + [('index_content', '=', _MARKER_PHRASE)]): if not this.datas: continue - file_type, index_content = this._index( - this.datas.decode('base64'), this.datas_fname, this.file_type) + index_content = this._index( + this.datas.decode('base64'), this.datas_fname, this.mimetype) this.write({ - 'file_type': file_type, 'index_content': index_content, }) diff --git a/document_ocr/tests/__init__.py b/document_ocr/tests/__init__.py index 7bdf742c..7efb2857 100644 --- a/document_ocr/tests/__init__.py +++ b/document_ocr/tests/__init__.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- # © 2016 Therp BV +# © 2017 ThinkOpen Solutions # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). from . import test_document_ocr diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py index b1695da8..e54a6ac0 100644 --- a/document_ocr/tests/test_document_ocr.py +++ b/document_ocr/tests/test_document_ocr.py @@ -1,11 +1,12 @@ # -*- coding: utf-8 -*- # © 2016 Therp BV +# © 2017 ThinkOpen Solutions # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). from StringIO import StringIO from PIL import Image, ImageDraw, ImageFont -from openerp.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE -from openerp.tests.common import TransactionCase +from odoo.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE +from odoo.tests.common import TransactionCase class TestDocumentOcr(TransactionCase): diff --git a/document_ocr/views/ir_attachment_view.xml b/document_ocr/views/ir_attachment_view.xml new file mode 100644 index 00000000..ed171d61 --- /dev/null +++ b/document_ocr/views/ir_attachment_view.xml @@ -0,0 +1,43 @@ + + + + + ir.attachment + + + + 1 + + + + + + + + + + + + + + ir.attachment + + + + + + + + + ir.attachment + + + + + + + + + + + From caf585626671545005775e4a284af2408be8ef3c Mon Sep 17 00:00:00 2001 From: Carlos Almeida Date: Fri, 2 Jun 2017 10:03:34 +0100 Subject: [PATCH 03/16] Fixes Flake8 errors Add requirements.txt for tesseract --- document_ocr/__manifest__.py | 4 +++- document_ocr/models/ir_attachment.py | 18 +++++++++++------- document_ocr/tests/test_document_ocr.py | 2 +- requirements.txt | 1 + 4 files changed, 16 insertions(+), 9 deletions(-) create mode 100644 requirements.txt diff --git a/document_ocr/__manifest__.py b/document_ocr/__manifest__.py index 39d783d1..ad012794 100644 --- a/document_ocr/__manifest__.py +++ b/document_ocr/__manifest__.py @@ -4,7 +4,9 @@ { "name": "OCR for Documents", "version": "10.0.1.0.0", - "author": "Therp BV, Odoo Community Association (OCA), ThinkOpen Solutions Brasil", + "author": "Therp BV," + " Odoo Community Association (OCA)," + " ThinkOpen Solutions Brasil", "license": "AGPL-3", "category": "Knowledge Management", "summary": "Run character recognition on uploaded files", diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py index f28e1fc9..ef683a37 100644 --- a/document_ocr/models/ir_attachment.py +++ b/document_ocr/models/ir_attachment.py @@ -9,7 +9,7 @@ import subprocess from StringIO import StringIO import pyPdf -from odoo import api, fields, models +from odoo import api, fields, models, _ from odoo.exceptions import UserError _logger = logging.getLogger(__name__) @@ -126,11 +126,14 @@ OCR_LANGUAGE = [('afr', 'Afrikaans'), class IrAttachment(models.Model): _inherit = 'ir.attachment' - language = fields.Selection(OCR_LANGUAGE, 'Language') + language = fields.Selection(OCR_LANGUAGE, _('Language')) # We need to redefine index_content field to be able to update it # on the onchange_language() - index_content = fields.Text('Indexed Content', readonly=False, prefetch=False) - index_content_rel = fields.Text(related='index_content', string='Indexed Content Rel') + index_content = fields.Text(_('Indexed Content'), + readonly=False, + prefetch=False) + index_content_rel = fields.Text(related='index_content', + string=_('Indexed Content Rel')) @api.onchange('language') def onchange_language(self): @@ -139,11 +142,11 @@ class IrAttachment(models.Model): stderr=subprocess.PIPE) stdout, stderr = process.communicate() if self.language not in stderr.split('\n'): - raise UserError( + raise UserError(_( "Language not installed." " Please ask your system administrator to" " install tesseract '%s' language." % - self.language) + self.language)) if self.store_fname: bin_data = self._file_read(self.store_fname) else: @@ -216,7 +219,8 @@ class IrAttachment(models.Model): if synchr: buf = super(IrAttachment, self)._index_pdf(bin_data) if len(buf.split('\n')) < 2 and bin_data.startswith('%PDF-'): - # If we got less than 2 lines, run OCR and append to existent text + # If we got less than 2 lines, + # run OCR anyway and append to existent text try: f = StringIO(bin_data) pdf = pyPdf.PdfFileReader(f) diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py index e54a6ac0..10c253c6 100644 --- a/document_ocr/tests/test_document_ocr.py +++ b/document_ocr/tests/test_document_ocr.py @@ -5,7 +5,7 @@ from StringIO import StringIO from PIL import Image, ImageDraw, ImageFont -from odoo.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE +from models.ir_attachment import _MARKER_PHRASE from odoo.tests.common import TransactionCase diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..943dea2b --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +tesseract \ No newline at end of file From ca0eed717a31b5593db38159b1ba411c1b072ea1 Mon Sep 17 00:00:00 2001 From: Carlos Almeida Date: Fri, 2 Jun 2017 10:14:40 +0100 Subject: [PATCH 04/16] Remove _() from fields --- document_ocr/models/ir_attachment.py | 6 +++--- document_ocr/tests/test_document_ocr.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py index ef683a37..7247d3e4 100644 --- a/document_ocr/models/ir_attachment.py +++ b/document_ocr/models/ir_attachment.py @@ -126,14 +126,14 @@ OCR_LANGUAGE = [('afr', 'Afrikaans'), class IrAttachment(models.Model): _inherit = 'ir.attachment' - language = fields.Selection(OCR_LANGUAGE, _('Language')) + language = fields.Selection(OCR_LANGUAGE, 'Language') # We need to redefine index_content field to be able to update it # on the onchange_language() - index_content = fields.Text(_('Indexed Content'), + index_content = fields.Text('Indexed Content', readonly=False, prefetch=False) index_content_rel = fields.Text(related='index_content', - string=_('Indexed Content Rel')) + string='Indexed Content Rel') @api.onchange('language') def onchange_language(self): diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py index 10c253c6..8a72a9d9 100644 --- a/document_ocr/tests/test_document_ocr.py +++ b/document_ocr/tests/test_document_ocr.py @@ -5,7 +5,7 @@ from StringIO import StringIO from PIL import Image, ImageDraw, ImageFont -from models.ir_attachment import _MARKER_PHRASE +from document_ocr.models.ir_attachment import _MARKER_PHRASE from odoo.tests.common import TransactionCase From 324a53d4c144cc1be8aaa50a804764b80424825e Mon Sep 17 00:00:00 2001 From: Carlos Almeida Date: Fri, 2 Jun 2017 10:18:10 +0100 Subject: [PATCH 05/16] Fix requirements tesseract dependency library name --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 943dea2b..3f0cdf37 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -tesseract \ No newline at end of file +pytesseract \ No newline at end of file From 125d76d37d860271eac3b770a495d7fa40a1ce35 Mon Sep 17 00:00:00 2001 From: Carlos Almeida Date: Fri, 2 Jun 2017 10:28:57 +0100 Subject: [PATCH 06/16] Fix attachment import --- document_ocr/tests/test_document_ocr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py index 8a72a9d9..393dbfa4 100644 --- a/document_ocr/tests/test_document_ocr.py +++ b/document_ocr/tests/test_document_ocr.py @@ -5,7 +5,7 @@ from StringIO import StringIO from PIL import Image, ImageDraw, ImageFont -from document_ocr.models.ir_attachment import _MARKER_PHRASE +from ir_attachment import _MARKER_PHRASE from odoo.tests.common import TransactionCase From 87b07980544cfddf0780f1cc4c78604df78aed75 Mon Sep 17 00:00:00 2001 From: Carlos Almeida Date: Fri, 2 Jun 2017 10:32:36 +0100 Subject: [PATCH 07/16] Fix requirements --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3f0cdf37..3a0b698d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ -pytesseract \ No newline at end of file + tesseract-ocr + tesseract-ocr-eng From 5a3a2e8d60f85f7d9af59dcfb98839317e19ecef Mon Sep 17 00:00:00 2001 From: Carlos Almeida Date: Fri, 2 Jun 2017 10:39:29 +0100 Subject: [PATCH 08/16] Fix import reference --- document_ocr/tests/test_document_ocr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py index 393dbfa4..9765e291 100644 --- a/document_ocr/tests/test_document_ocr.py +++ b/document_ocr/tests/test_document_ocr.py @@ -5,7 +5,7 @@ from StringIO import StringIO from PIL import Image, ImageDraw, ImageFont -from ir_attachment import _MARKER_PHRASE +from ..models.ir_attachment import _MARKER_PHRASE from odoo.tests.common import TransactionCase From 94cfc53685cab12a9a6aca2e957b4ce060c9ddac Mon Sep 17 00:00:00 2001 From: Carlos Almeida Date: Mon, 5 Jun 2017 15:45:23 +0100 Subject: [PATCH 09/16] Update travis.yml --- .travis.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.travis.yml b/.travis.yml index 7c091932..0c92807d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,10 @@ addons: packages: - expect-dev # provides unbuffer utility - python-lxml # because pip installation is slow + - tesseract-ocr # document_ocr + - tesseract-ocr-eng # document_ocr + - imagemagick # document_ocr + - fonts-inconsolata # document_ocr (for tests only) language: python From 0c2740de22b56fc5e77dd9d837e8f0ebfc8afcee Mon Sep 17 00:00:00 2001 From: Carlos Almeida Date: Mon, 5 Jun 2017 15:50:19 +0100 Subject: [PATCH 10/16] Remove unecessary requirements.txt file, dependency added as package in travis.yml --- requirements.txt | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 3a0b698d..00000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ - tesseract-ocr - tesseract-ocr-eng From 21526bd236b7b4a3054aaee65ce8c3e7b28a3dbf Mon Sep 17 00:00:00 2001 From: Carlos Almeida Date: Tue, 6 Jun 2017 11:48:55 +0100 Subject: [PATCH 11/16] Make tests to work, on 10.0 Fix small bugs --- document_ocr/models/ir_attachment.py | 19 ++++++++++--------- document_ocr/tests/test_document_ocr.py | 23 +++++++++++++---------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py index 7247d3e4..efbd9b18 100644 --- a/document_ocr/models/ir_attachment.py +++ b/document_ocr/models/ir_attachment.py @@ -126,7 +126,10 @@ OCR_LANGUAGE = [('afr', 'Afrikaans'), class IrAttachment(models.Model): _inherit = 'ir.attachment' - language = fields.Selection(OCR_LANGUAGE, 'Language') + language = fields.Selection(OCR_LANGUAGE, 'Language', + default=lambda self: + self.env['ir.config_parameter'].get_param( + 'document_ocr.language', 'eng')) # We need to redefine index_content field to be able to update it # on the onchange_language() index_content = fields.Text('Indexed Content', @@ -151,17 +154,15 @@ class IrAttachment(models.Model): bin_data = self._file_read(self.store_fname) else: bin_data = self.db_datas - index_content = self._index( - bin_data.decode('base64'), self.datas_fname, self.mimetype) - return {'value': { - 'index_content': index_content}} + if bin_data: + index_content = self._index( + bin_data.decode('base64'), self.datas_fname, self.mimetype) + return {'value': { + 'index_content': index_content}} + return {'value': {}} @api.model def _index(self, bin_data, datas_fname, mimetype): - if not self.language: - # Set default language - self.language = self.env['ir.config_parameter'].get_param( - 'document_ocr.language', 'eng') content = super(IrAttachment, self)._index( bin_data, datas_fname, mimetype) if not content or content == 'image': diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py index 9765e291..fa5c6137 100644 --- a/document_ocr/tests/test_document_ocr.py +++ b/document_ocr/tests/test_document_ocr.py @@ -4,10 +4,11 @@ # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). from StringIO import StringIO -from PIL import Image, ImageDraw, ImageFont -from ..models.ir_attachment import _MARKER_PHRASE +from PIL import Image, ImageDraw, ImageFont, PdfImagePlugin, PalmImagePlugin from odoo.tests.common import TransactionCase +from ..models.ir_attachment import _MARKER_PHRASE + class TestDocumentOcr(TransactionCase): def test_document_ocr(self): @@ -20,15 +21,17 @@ class TestDocumentOcr(TransactionCase): # test a plain image data = StringIO() test_image.save(data, 'png') - result = self.env['ir.attachment']._index( + attachment = self.env['ir.attachment'].create({ + 'name': 'testattachment'}) + result = attachment._index( data.getvalue(), 'test.png', None) - self.assertEqual(result[1].strip(), 'Hello world') + self.assertEqual(result.strip(), 'Hello world') # should also work for pdfs data = StringIO() test_image.save(data, 'pdf', resolution=300) - result = self.env['ir.attachment']._index( + result = attachment._index( data.getvalue(), 'test.pdf', None) - self.assertEqual(result[1].strip(), 'Hello world') + self.assertEqual(result.strip(), 'Hello world') # check cron self.env['ir.config_parameter'].set_param( 'document_ocr.synchronous', 'False') @@ -39,12 +42,12 @@ class TestDocumentOcr(TransactionCase): self.assertEqual(attachment.index_content, _MARKER_PHRASE) attachment._ocr_cron() self.assertEqual(attachment.index_content.strip(), 'Hello world') - # and for an unreadable image, we expect an error + # and for an unreadable image, we expect an empty string self.env['ir.config_parameter'].set_param( 'document_ocr.synchronous', 'True') data = StringIO() test_image = Image.new('1', (200, 30)) - test_image.save(data, 'Palm') - result = self.env['ir.attachment']._index( + test_image.save(data, 'palm') + result = attachment._index( data.getvalue(), 'test.palm', None) - self.assertEqual(result[1], None) + self.assertEqual(result, '') From f1c5c8238b54bae3afda69c512f150d0cb75ee4c Mon Sep 17 00:00:00 2001 From: Carlos Almeida Date: Tue, 6 Jun 2017 11:59:56 +0100 Subject: [PATCH 12/16] Unused import flake8 error --- document_ocr/tests/test_document_ocr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py index fa5c6137..d2bff780 100644 --- a/document_ocr/tests/test_document_ocr.py +++ b/document_ocr/tests/test_document_ocr.py @@ -28,6 +28,7 @@ class TestDocumentOcr(TransactionCase): self.assertEqual(result.strip(), 'Hello world') # should also work for pdfs data = StringIO() + PdfImagePlugin # to use import :/ test_image.save(data, 'pdf', resolution=300) result = attachment._index( data.getvalue(), 'test.pdf', None) @@ -47,6 +48,7 @@ class TestDocumentOcr(TransactionCase): 'document_ocr.synchronous', 'True') data = StringIO() test_image = Image.new('1', (200, 30)) + PalmImagePlugin # to use import :/ test_image.save(data, 'palm') result = attachment._index( data.getvalue(), 'test.palm', None) From b22221023c75e0e464f02d531b4dfcc127062c63 Mon Sep 17 00:00:00 2001 From: Carlos Almeida Date: Tue, 6 Jun 2017 12:53:08 +0100 Subject: [PATCH 13/16] Deal with unused import --- document_ocr/tests/test_document_ocr.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py index d2bff780..bf9c9b36 100644 --- a/document_ocr/tests/test_document_ocr.py +++ b/document_ocr/tests/test_document_ocr.py @@ -4,7 +4,8 @@ # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). from StringIO import StringIO -from PIL import Image, ImageDraw, ImageFont, PdfImagePlugin, PalmImagePlugin +from PIL import Image, ImageDraw, ImageFont +from PIL import PdfImagePlugin, PalmImagePlugin # pylint: disable=unused-import from odoo.tests.common import TransactionCase from ..models.ir_attachment import _MARKER_PHRASE @@ -28,7 +29,6 @@ class TestDocumentOcr(TransactionCase): self.assertEqual(result.strip(), 'Hello world') # should also work for pdfs data = StringIO() - PdfImagePlugin # to use import :/ test_image.save(data, 'pdf', resolution=300) result = attachment._index( data.getvalue(), 'test.pdf', None) @@ -48,7 +48,6 @@ class TestDocumentOcr(TransactionCase): 'document_ocr.synchronous', 'True') data = StringIO() test_image = Image.new('1', (200, 30)) - PalmImagePlugin # to use import :/ test_image.save(data, 'palm') result = attachment._index( data.getvalue(), 'test.palm', None) From f7903b10da70bc6bbf030567754d30a736c558d3 Mon Sep 17 00:00:00 2001 From: Carlos Almeida Date: Tue, 6 Jun 2017 12:55:35 +0100 Subject: [PATCH 14/16] Also ignore unused import for flake8 --- document_ocr/tests/test_document_ocr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py index bf9c9b36..3d4cf69f 100644 --- a/document_ocr/tests/test_document_ocr.py +++ b/document_ocr/tests/test_document_ocr.py @@ -5,7 +5,7 @@ from StringIO import StringIO from PIL import Image, ImageDraw, ImageFont -from PIL import PdfImagePlugin, PalmImagePlugin # pylint: disable=unused-import +from PIL import PdfImagePlugin, PalmImagePlugin # noqa # pylint: disable=unused-import from odoo.tests.common import TransactionCase from ..models.ir_attachment import _MARKER_PHRASE From f80491a589708cb41b3f8d67a498c0f0023b543e Mon Sep 17 00:00:00 2001 From: Carlos Almeida Date: Tue, 6 Jun 2017 14:13:14 +0100 Subject: [PATCH 15/16] Improve logger messages --- document_ocr/models/ir_attachment.py | 2 +- document_ocr/tests/test_document_ocr.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py index efbd9b18..18102d52 100644 --- a/document_ocr/models/ir_attachment.py +++ b/document_ocr/models/ir_attachment.py @@ -177,6 +177,7 @@ class IrAttachment(models.Model): return content def _index_ocr(self, bin_data): + _logger.info('OCR IMAGE "%s"...', self.datas_fname) process = subprocess.Popen( ['tesseract', 'stdin', 'stdout', '-l', self.language], stdin=subprocess.PIPE, stdout=subprocess.PIPE, @@ -237,7 +238,6 @@ class IrAttachment(models.Model): buf = u'%s\n-- %d --\n%s' % ( buf, pagenum + 1, index_content.decode('utf8')) else: - _logger.info('OCR PDF "%s"...', self.datas_fname) pdf_image = convert_bin_to_image(self, bin_data) index_content = self._index_ocr(pdf_image) buf = u'%s\n%s' % (buf, index_content.decode('utf8')) diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py index 3d4cf69f..1d1a5490 100644 --- a/document_ocr/tests/test_document_ocr.py +++ b/document_ocr/tests/test_document_ocr.py @@ -23,13 +23,17 @@ class TestDocumentOcr(TransactionCase): data = StringIO() test_image.save(data, 'png') attachment = self.env['ir.attachment'].create({ - 'name': 'testattachment'}) + 'name': 'testattachment', + 'datas_fname': 'test_png.pdf'}) result = attachment._index( data.getvalue(), 'test.png', None) self.assertEqual(result.strip(), 'Hello world') # should also work for pdfs data = StringIO() test_image.save(data, 'pdf', resolution=300) + attachment = self.env['ir.attachment'].create({ + 'name': 'testattachment', + 'datas_fname': 'test_pdf.pdf'}) result = attachment._index( data.getvalue(), 'test.pdf', None) self.assertEqual(result.strip(), 'Hello world') @@ -38,6 +42,7 @@ class TestDocumentOcr(TransactionCase): 'document_ocr.synchronous', 'False') attachment = self.env['ir.attachment'].create({ 'name': 'testattachment', + 'datas_fname': 'test_cron.pdf', 'datas': data.getvalue().encode('base64'), }) self.assertEqual(attachment.index_content, _MARKER_PHRASE) @@ -49,6 +54,9 @@ class TestDocumentOcr(TransactionCase): data = StringIO() test_image = Image.new('1', (200, 30)) test_image.save(data, 'palm') + attachment = self.env['ir.attachment'].create({ + 'name': 'testattachment', + 'datas_fname': 'test_err.palm'}) result = attachment._index( data.getvalue(), 'test.palm', None) self.assertEqual(result, '') From 3b3c60b348b44c390d048984c64e1fa444553df2 Mon Sep 17 00:00:00 2001 From: Carlos Almeida Date: Tue, 6 Jun 2017 15:55:09 +0100 Subject: [PATCH 16/16] Remove migrated document_ocr module --- document_ocr/README.rst | 101 --------- document_ocr/__init__.py | 5 - document_ocr/__manifest__.py | 27 --- document_ocr/data/ir_config_parameter.xml | 21 -- document_ocr/data/ir_cron.xml | 13 -- document_ocr/models/__init__.py | 5 - document_ocr/models/ir_attachment.py | 261 ---------------------- document_ocr/static/description/icon.png | Bin 9455 -> 0 bytes document_ocr/tests/__init__.py | 5 - document_ocr/tests/test_document_ocr.py | 62 ----- document_ocr/views/ir_attachment_view.xml | 43 ---- 11 files changed, 543 deletions(-) delete mode 100644 document_ocr/README.rst delete mode 100644 document_ocr/__init__.py delete mode 100644 document_ocr/__manifest__.py delete mode 100644 document_ocr/data/ir_config_parameter.xml delete mode 100644 document_ocr/data/ir_cron.xml delete mode 100644 document_ocr/models/__init__.py delete mode 100644 document_ocr/models/ir_attachment.py delete mode 100644 document_ocr/static/description/icon.png delete mode 100644 document_ocr/tests/__init__.py delete mode 100644 document_ocr/tests/test_document_ocr.py delete mode 100644 document_ocr/views/ir_attachment_view.xml diff --git a/document_ocr/README.rst b/document_ocr/README.rst deleted file mode 100644 index c500f0b1..00000000 --- a/document_ocr/README.rst +++ /dev/null @@ -1,101 +0,0 @@ -.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg - :target: http://www.gnu.org/licenses/agpl-3.0-standalone.html - :alt: License: AGPL-3 - -================= -OCR for documents -================= - -This module was written to make uploaded documents, for example scans, searchable by running OCR on them. - -It supports all image formats `Pillow supports `_ for reading and PDFs. - -Installation -============ - -To install this module, you need to: - -#. install tesseract and the language(s) your documents use -#. if you want to support OCR on PDFs, install imagemagick -#. install the module itself - -On an Debian or Ubuntu system you would typically run:: - - $ sudo apt-get install tesseract-ocr imagemagick - - -Configuration -============= - -To configure this module, go to: - -#. Settings/Technical/Parameters/System parameters and review the parameters with names document_ocr.* - -Usage -===== - -By default, character recognition is done asynchronously by a cronjob at night. -This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish. -The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ` `Settings``. -In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``. - - -By default, recognition language is set to english. -In case you want to use a different default, set configuration parameter ``document_ocr.language`` to value respective value ex:``por``, for Portuguese. - - -In PDF case, OCR will run after it will be converted to an image. But OCR will be applied to all PDFs. - - -System parameters used: -#``document_ocr.synchronous``: bool -#``document_ocr.language``: string -#``document_ocr.dpi``: integer -#``document_ocr.quality``: integer - - -.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas - :alt: Try me on Runbot - :target: https://runbot.odoo-community.org/runbot/118/10.0 - -Bug Tracker -=========== - -Bugs are tracked on `GitHub Issues `_. -In case of trouble, please check there if your issue has already been reported. -If you spotted it first, help us smashing it by providing a detailed and welcomed feedback. - -Credits -======= - -The actual work ---------------- - -* `tesseract `_ - -Images ------- - -* Odoo Community Association: `Icon `_. - -Contributors ------------- - -* Holger Brunn - -Do not contact contributors directly about help with questions or problems concerning this addon, but use the `community mailing list `_ or the `appropriate specialized mailinglist `_ for help, and the bug tracker linked in `Bug Tracker`_ above for technical issues. - -Maintainer ----------- - -.. image:: https://odoo-community.org/logo.png - :alt: Odoo Community Association - :target: https://odoo-community.org - -This module is maintained by the OCA. - -OCA, or the Odoo Community Association, is a nonprofit organization whose -mission is to support the collaborative development of Odoo features and -promote its widespread use. - -To contribute to this module, please visit https://odoo-community.org. diff --git a/document_ocr/__init__.py b/document_ocr/__init__.py deleted file mode 100644 index 472456b6..00000000 --- a/document_ocr/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- -# © 2016 Therp BV -# © 2017 ThinkOpen Solutions -# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). -from . import models diff --git a/document_ocr/__manifest__.py b/document_ocr/__manifest__.py deleted file mode 100644 index ad012794..00000000 --- a/document_ocr/__manifest__.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- -# © 2016 Therp BV -# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). -{ - "name": "OCR for Documents", - "version": "10.0.1.0.0", - "author": "Therp BV," - " Odoo Community Association (OCA)," - " ThinkOpen Solutions Brasil", - "license": "AGPL-3", - "category": "Knowledge Management", - "summary": "Run character recognition on uploaded files", - "depends": [ - 'document', - ], - "data": [ - "data/ir_cron.xml", - "data/ir_config_parameter.xml", - "views/ir_attachment_view.xml", - ], - "external_dependencies": { - 'bin': [ - 'tesseract', - 'convert', - ], - }, -} diff --git a/document_ocr/data/ir_config_parameter.xml b/document_ocr/data/ir_config_parameter.xml deleted file mode 100644 index 721a0740..00000000 --- a/document_ocr/data/ir_config_parameter.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - document_ocr.synchronous - False - - - document_ocr.dpi - 300 - - - document_ocr.quality - 100 - - - document_ocr.language - eng - - - diff --git a/document_ocr/data/ir_cron.xml b/document_ocr/data/ir_cron.xml deleted file mode 100644 index f69d151a..00000000 --- a/document_ocr/data/ir_cron.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - Run OCR on uploaded documents - days - 1 - ir.attachment - _ocr_cron - -1 - - - diff --git a/document_ocr/models/__init__.py b/document_ocr/models/__init__.py deleted file mode 100644 index 051b3ddf..00000000 --- a/document_ocr/models/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- -# © 2016 Therp BV -# © 2017 ThinkOpen Solutions -# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). -from . import ir_attachment diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py deleted file mode 100644 index 18102d52..00000000 --- a/document_ocr/models/ir_attachment.py +++ /dev/null @@ -1,261 +0,0 @@ -# -*- coding: utf-8 -*- -# © 2016 Therp BV -# © 2017 ThinkOpen Solutions -# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). - -import io -import logging -import subprocess -from StringIO import StringIO - -import pyPdf -from odoo import api, fields, models, _ -from odoo.exceptions import UserError - -_logger = logging.getLogger(__name__) -_MARKER_PHRASE = '[[waiting for OCR]]' -OCR_LANGUAGE = [('afr', 'Afrikaans'), - ('amh', 'Amharic'), - ('ara', 'Arabic'), - ('asm', 'Assamese'), - ('aze', 'Azerbaijani'), - ('aze_cyrl', 'Azerbaijani - Cyrilic'), - ('bel', 'Belarusian'), - ('ben', 'Bengali'), - ('bod', 'Tibetan'), - ('bos', 'Bosnian'), - ('bul', 'Bulgarian'), - ('cat', 'Catalan; Valencian'), - ('ceb', 'Cebuano'), - ('ces', 'Czech'), - ('chi_sim', 'Chinese - Simplified'), - ('chi_tra', 'Chinese - Traditional'), - ('chr', 'Cherokee'), - ('cym', 'Welsh'), - ('dan', 'Danish'), - ('dan_frak', 'Danish - Fraktur'), - ('deu', 'German'), - ('deu_frak', 'German - Fraktur'), - ('dzo', 'Dzongkha'), - ('ell', 'Greek, Modern (1453-)'), - ('eng', 'English'), - ('enm', 'English, Middle (1100-1500)'), - ('epo', 'Esperanto'), - ('equ', 'Math / equation detection module'), - ('est', 'Estonian'), - ('eus', 'Basque'), - ('fas', 'Persian'), - ('fin', 'Finnish'), - ('fra', 'French'), - ('frk', 'Frankish'), - ('frm', 'French, Middle (ca.1400-1600)'), - ('gle', 'Irish'), - ('glg', 'Galician'), - ('grc', 'Greek, Ancient (to 1453)'), - ('guj', 'Gujarati'), - ('hat', 'Haitian; Haitian Creole'), - ('heb', 'Hebrew'), - ('hin', 'Hindi'), - ('hrv', 'Croatian'), - ('hun', 'Hungarian'), - ('iku', 'Inuktitut'), - ('ind', 'Indonesian'), - ('isl', 'Icelandic'), - ('ita', 'Italian'), - ('ita_old', 'Italian - Old'), - ('jav', 'Javanese'), - ('jpn', 'Japanese'), - ('kan', 'Kannada'), - ('kat', 'Georgian'), - ('kat_old', 'Georgian - Old'), - ('kaz', 'Kazakh'), - ('khm', 'Central Khmer'), - ('kir', 'Kirghiz; Kyrgyz'), - ('kor', 'Korean'), - ('kur', 'Kurdish'), - ('lao', 'Lao'), - ('lat', 'Latin'), - ('lav', 'Latvian'), - ('lit', 'Lithuanian'), - ('mal', 'Malayalam'), - ('mar', 'Marathi'), - ('mkd', 'Macedonian'), - ('mlt', 'Maltese'), - ('msa', 'Malay'), - ('mya', 'Burmese'), - ('nep', 'Nepali'), - ('nld', 'Dutch; Flemish'), - ('nor', 'Norwegian'), - ('ori', 'Oriya'), - ('osd', 'Orientation and script detection module'), - ('pan', 'Panjabi; Punjabi'), - ('pol', 'Polish'), - ('por', 'Portuguese'), - ('pus', 'Pushto; Pashto'), - ('ron', 'Romanian; Moldavian; Moldovan'), - ('rus', 'Russian'), - ('san', 'Sanskrit'), - ('sin', 'Sinhala; Sinhalese'), - ('slk', 'Slovak'), - ('slk_frak', 'Slovak - Fraktur'), - ('slv', 'Slovenian'), - ('spa', 'Spanish; Castilian'), - ('spa_old', 'Spanish; Castilian - Old'), - ('sqi', 'Albanian'), - ('srp', 'Serbian'), - ('srp_latn', 'Serbian - Latin'), - ('swa', 'Swahili'), - ('swe', 'Swedish'), - ('syr', 'Syriac'), - ('tam', 'Tamil'), - ('tel', 'Telugu'), - ('tgk', 'Tajik'), - ('tgl', 'Tagalog'), - ('tha', 'Thai'), - ('tir', 'Tigrinya'), - ('tur', 'Turkish'), - ('uig', 'Uighur; Uyghur'), - ('ukr', 'Ukrainian'), - ('urd', 'Urdu'), - ('uzb', 'Uzbek'), - ('uzb_cyrl', 'Uzbek - Cyrilic'), - ('vie', 'Vietnamese'), - ('yid', 'Yiddish'), ] - - -class IrAttachment(models.Model): - _inherit = 'ir.attachment' - - language = fields.Selection(OCR_LANGUAGE, 'Language', - default=lambda self: - self.env['ir.config_parameter'].get_param( - 'document_ocr.language', 'eng')) - # We need to redefine index_content field to be able to update it - # on the onchange_language() - index_content = fields.Text('Indexed Content', - readonly=False, - prefetch=False) - index_content_rel = fields.Text(related='index_content', - string='Indexed Content Rel') - - @api.onchange('language') - def onchange_language(self): - process = subprocess.Popen(['tesseract', '--list-langs'], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdout, stderr = process.communicate() - if self.language not in stderr.split('\n'): - raise UserError(_( - "Language not installed." - " Please ask your system administrator to" - " install tesseract '%s' language." % - self.language)) - if self.store_fname: - bin_data = self._file_read(self.store_fname) - else: - bin_data = self.db_datas - if bin_data: - index_content = self._index( - bin_data.decode('base64'), self.datas_fname, self.mimetype) - return {'value': { - 'index_content': index_content}} - return {'value': {}} - - @api.model - def _index(self, bin_data, datas_fname, mimetype): - content = super(IrAttachment, self)._index( - bin_data, datas_fname, mimetype) - if not content or content == 'image': - has_synchr_param = self.env['ir.config_parameter'].get_param( - 'document_ocr.synchronous', 'False') == 'True' - has_force_flag = self.env.context.get('document_ocr_force') - synchr = has_synchr_param or has_force_flag - if synchr: - content = self._index_ocr(bin_data) - else: - content = _MARKER_PHRASE - return content - - def _index_ocr(self, bin_data): - _logger.info('OCR IMAGE "%s"...', self.datas_fname) - process = subprocess.Popen( - ['tesseract', 'stdin', 'stdout', '-l', self.language], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - stdout, stderr = process.communicate(bin_data) - if stderr: - _logger.error('Error during OCR: %s', stderr) - return stdout - - def _index_pdf(self, bin_data): - - def convert_bin_to_image(self, bin_data): - dpi = int(self.env['ir.config_parameter'].get_param( - 'document_ocr.dpi', '500')) - quality = int(self.env['ir.config_parameter'].get_param( - 'document_ocr.quality', '100')) - process = subprocess.Popen( - ['convert', '-density', str(dpi), - '-quality', str(quality), - '-', '-append', 'png32:-'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdout, stderr = process.communicate(bin_data) - if stderr: - _logger.error('Error converting PDF to image: %s', stderr) - return stdout - - def _convert_pdf_page_to_image(self, pdf, pagenum): - dst_pdf = pyPdf.PdfFileWriter() - dst_pdf.addPage(pdf.getPage(pagenum)) - pdf_bytes = io.BytesIO() - dst_pdf.write(pdf_bytes) - pdf_bytes.seek(0) - return convert_bin_to_image(self, pdf_bytes.read()) - - has_synchr_param = self.env['ir.config_parameter'].get_param( - 'document_ocr.synchronous', 'False') == 'True' - has_force_flag = self.env.context.get('document_ocr_force') - synchr = has_synchr_param or has_force_flag - if synchr: - buf = super(IrAttachment, self)._index_pdf(bin_data) - if len(buf.split('\n')) < 2 and bin_data.startswith('%PDF-'): - # If we got less than 2 lines, - # run OCR anyway and append to existent text - try: - f = StringIO(bin_data) - pdf = pyPdf.PdfFileReader(f) - if pdf.getNumPages() > 1: - for pagenum in range(0, pdf.getNumPages()): - _logger.info('OCR PDF "%s" page %d/%d...', - self.datas_fname, - pagenum + 1, - pdf.getNumPages()) - pdf_image = _convert_pdf_page_to_image(self, pdf, - pagenum) - index_content = self._index_ocr(pdf_image) - buf = u'%s\n-- %d --\n%s' % ( - buf, pagenum + 1, index_content.decode('utf8')) - else: - pdf_image = convert_bin_to_image(self, bin_data) - index_content = self._index_ocr(pdf_image) - buf = u'%s\n%s' % (buf, index_content.decode('utf8')) - except Exception as e: - _logger.error('Error converting PDF to image: %s', e) - pass - else: - buf = _MARKER_PHRASE - return buf - - @api.model - def _ocr_cron(self): - for this in self.with_context(document_ocr_force=True).search( - [('index_content', '=', _MARKER_PHRASE)]): - if not this.datas: - continue - index_content = this._index( - this.datas.decode('base64'), this.datas_fname, this.mimetype) - this.write({ - 'index_content': index_content, - }) diff --git a/document_ocr/static/description/icon.png b/document_ocr/static/description/icon.png deleted file mode 100644 index 3a0328b516c4980e8e44cdb63fd945757ddd132d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9455 zcmW++2RxMjAAjx~&dlBk9S+%}OXg)AGE&Cb*&}d0jUxM@u(PQx^-s)697TX`ehR4?GS^qbkof1cslKgkU)h65qZ9Oc=ml_0temigYLJfnz{IDzUf>bGs4N!v3=Z3jMq&A#7%rM5eQ#dc?k~! zVpnB`o+K7|Al`Q_U;eD$B zfJtP*jH`siUq~{KE)`jP2|#TUEFGRryE2`i0**z#*^6~AI|YzIWy$Cu#CSLW3q=GA z6`?GZymC;dCPk~rBS%eCb`5OLr;RUZ;D`}um=H)BfVIq%7VhiMr)_#G0N#zrNH|__ zc+blN2UAB0=617@>_u;MPHN;P;N#YoE=)R#i$k_`UAA>WWCcEVMh~L_ zj--gtp&|K1#58Yz*AHCTMziU1Jzt_jG0I@qAOHsk$2}yTmVkBp_eHuY$A9)>P6o~I z%aQ?!(GqeQ-Y+b0I(m9pwgi(IIZZzsbMv+9w{PFtd_<_(LA~0H(xz{=FhLB@(1&qHA5EJw1>>=%q2f&^X>IQ{!GJ4e9U z&KlB)z(84HmNgm2hg2C0>WM{E(DdPr+EeU_N@57;PC2&DmGFW_9kP&%?X4}+xWi)( z;)z%wI5>D4a*5XwD)P--sPkoY(a~WBw;E~AW`Yue4kFa^LM3X`8x|}ZUeMnqr}>kH zG%WWW>3ml$Yez?i%)2pbKPI7?5o?hydokgQyZsNEr{a|mLdt;X2TX(#B1j35xPnPW z*bMSSOauW>o;*=kO8ojw91VX!qoOQb)zHJ!odWB}d+*K?#sY_jqPdg{Sm2HdYzdEx zOGVPhVRTGPtv0o}RfVP;Nd(|CB)I;*t&QO8h zFfekr30S!-LHmV_Su-W+rEwYXJ^;6&3|L$mMC8*bQptyOo9;>Qb9Q9`ySe3%V$A*9 zeKEe+b0{#KWGp$F+tga)0RtI)nhMa-K@JS}2krK~n8vJ=Ngm?R!9G<~RyuU0d?nz# z-5EK$o(!F?hmX*2Yt6+coY`6jGbb7tF#6nHA zuKk=GGJ;ZwON1iAfG$E#Y7MnZVmrY|j0eVI(DN_MNFJmyZ|;w4tf@=CCDZ#5N_0K= z$;R~bbk?}TpfDjfB&aiQ$VA}s?P}xPERJG{kxk5~R`iRS(SK5d+Xs9swCozZISbnS zk!)I0>t=A<-^z(cmSFz3=jZ23u13X><0b)P)^1T_))Kr`e!-pb#q&J*Q`p+B6la%C zuVl&0duN<;uOsB3%T9Fp8t{ED108<+W(nOZd?gDnfNBC3>M8WE61$So|P zVvqH0SNtDTcsUdzaMDpT=Ty0pDHHNL@Z0w$Y`XO z2M-_r1S+GaH%pz#Uy0*w$Vdl=X=rQXEzO}d6J^R6zjM1u&c9vYLvLp?W7w(?np9x1 zE_0JSAJCPB%i7p*Wvg)pn5T`8k3-uR?*NT|J`eS#_#54p>!p(mLDvmc-3o0mX*mp_ zN*AeS<>#^-{S%W<*mz^!X$w_2dHWpcJ6^j64qFBft-o}o_Vx80o0>}Du;>kLts;$8 zC`7q$QI(dKYG`Wa8#wl@V4jVWBRGQ@1dr-hstpQL)Tl+aqVpGpbSfN>5i&QMXfiZ> zaA?T1VGe?rpQ@;+pkrVdd{klI&jVS@I5_iz!=UMpTsa~mBga?1r}aRBm1WS;TT*s0f0lY=JBl66Upy)-k4J}lh=P^8(SXk~0xW=T9v*B|gzIhN z>qsO7dFd~mgxAy4V?&)=5ieYq?zi?ZEoj)&2o)RLy=@hbCRcfT5jigwtQGE{L*8<@Yd{zg;CsL5mvzfDY}P-wos_6PfprFVaeqNE%h zKZhLtcQld;ZD+>=nqN~>GvROfueSzJD&BE*}XfU|H&(FssBqY=hPCt`d zH?@s2>I(|;fcW&YM6#V#!kUIP8$Nkdh0A(bEVj``-AAyYgwY~jB zT|I7Bf@%;7aL7Wf4dZ%VqF$eiaC38OV6oy3Z#TER2G+fOCd9Iaoy6aLYbPTN{XRPz z;U!V|vBf%H!}52L2gH_+j;`bTcQRXB+y9onc^wLm5wi3-Be}U>k_u>2Eg$=k!(l@I zcCg+flakT2Nej3i0yn+g+}%NYb?ta;R?(g5SnwsQ49U8Wng8d|{B+lyRcEDvR3+`O{zfmrmvFrL6acVP%yG98X zo&+VBg@px@i)%o?dG(`T;n*$S5*rnyiR#=wW}}GsAcfyQpE|>a{=$Hjg=-*_K;UtD z#z-)AXwSRY?OPefw^iI+ z)AXz#PfEjlwTes|_{sB?4(O@fg0AJ^g8gP}ex9Ucf*@_^J(s_5jJV}c)s$`Myn|Kd z$6>}#q^n{4vN@+Os$m7KV+`}c%4)4pv@06af4-x5#wj!KKb%caK{A&Y#Rfs z-po?Dcb1({W=6FKIUirH&(yg=*6aLCekcKwyfK^JN5{wcA3nhO(o}SK#!CINhI`-I z1)6&n7O&ZmyFMuNwvEic#IiOAwNkR=u5it{B9n2sAJV5pNhar=j5`*N!Na;c7g!l$ z3aYBqUkqqTJ=Re-;)s!EOeij=7SQZ3Hq}ZRds%IM*PtM$wV z@;rlc*NRK7i3y5BETSKuumEN`Xu_8GP1Ri=OKQ$@I^ko8>H6)4rjiG5{VBM>B|%`&&s^)jS|-_95&yc=GqjNo{zFkw%%HHhS~e=s zD#sfS+-?*t|J!+ozP6KvtOl!R)@@-z24}`9{QaVLD^9VCSR2b`b!KC#o;Ki<+wXB6 zx3&O0LOWcg4&rv4QG0)4yb}7BFSEg~=IR5#ZRj8kg}dS7_V&^%#Do==#`u zpy6{ox?jWuR(;pg+f@mT>#HGWHAJRRDDDv~@(IDw&R>9643kK#HN`!1vBJHnC+RM&yIh8{gG2q zA%e*U3|N0XSRa~oX-3EAneep)@{h2vvd3Xvy$7og(sayr@95+e6~Xvi1tUqnIxoIH zVWo*OwYElb#uyW{Imam6f2rGbjR!Y3`#gPqkv57dB6K^wRGxc9B(t|aYDGS=m$&S!NmCtrMMaUg(c zc2qC=2Z`EEFMW-me5B)24AqF*bV5Dr-M5ig(l-WPS%CgaPzs6p_gnCIvTJ=Y<6!gT zVt@AfYCzjjsMEGi=rDQHo0yc;HqoRNnNFeWZgcm?f;cp(6CNylj36DoL(?TS7eU#+ z7&mfr#y))+CJOXQKUMZ7QIdS9@#-}7y2K1{8)cCt0~-X0O!O?Qx#E4Og+;A2SjalQ zs7r?qn0H044=sDN$SRG$arw~n=+T_DNdSrarmu)V6@|?1-ZB#hRn`uilTGPJ@fqEy zGt(f0B+^JDP&f=r{#Y_wi#AVDf-y!RIXU^0jXsFpf>=Ji*TeqSY!H~AMbJdCGLhC) zn7Rx+sXw6uYj;WRYrLd^5IZq@6JI1C^YkgnedZEYy<&4(z%Q$5yv#Boo{AH8n$a zhb4Y3PWdr269&?V%uI$xMcUrMzl=;w<_nm*qr=c3Rl@i5wWB;e-`t7D&c-mcQl7x! zZWB`UGcw=Y2=}~wzrfLx=uet<;m3~=8I~ZRuzvMQUQdr+yTV|ATf1Uuomr__nDf=X zZ3WYJtHp_ri(}SQAPjv+Y+0=fH4krOP@S&=zZ-t1jW1o@}z;xk8 z(Nz1co&El^HK^NrhVHa-_;&88vTU>_J33=%{if;BEY*J#1n59=07jrGQ#IP>@u#3A z;!q+E1Rj3ZJ+!4bq9F8PXJ@yMgZL;>&gYA0%_Kbi8?S=XGM~dnQZQ!yBSgcZhY96H zrWnU;k)qy`rX&&xlDyA%(a1Hhi5CWkmg(`Gb%m(HKi-7Z!LKGRP_B8@`7&hdDy5n= z`OIxqxiVfX@OX1p(mQu>0Ai*v_cTMiw4qRt3~NBvr9oBy0)r>w3p~V0SCm=An6@3n)>@z!|o-$HvDK z|3D2ZMJkLE5loMKl6R^ez@Zz%S$&mbeoqH5`Bb){Ei21q&VP)hWS2tjShfFtGE+$z zzCR$P#uktu+#!w)cX!lWN1XU%K-r=s{|j?)Akf@q#3b#{6cZCuJ~gCxuMXRmI$nGtnH+-h z+GEi!*X=AP<|fG`1>MBdTb?28JYc=fGvAi2I<$B(rs$;eoJCyR6_bc~p!XR@O-+sD z=eH`-ye})I5ic1eL~TDmtfJ|8`0VJ*Yr=hNCd)G1p2MMz4C3^Mj?7;!w|Ly%JqmuW zlIEW^Ft%z?*|fpXda>Jr^1noFZEwFgVV%|*XhH@acv8rdGxeEX{M$(vG{Zw+x(ei@ zmfXb22}8-?Fi`vo-YVrTH*C?a8%M=Hv9MqVH7H^J$KsD?>!SFZ;ZsvnHr_gn=7acz z#W?0eCdVhVMWN12VV^$>WlQ?f;P^{(&pYTops|btm6aj>_Uz+hqpGwB)vWp0Cf5y< zft8-je~nn?W11plq}N)4A{l8I7$!ks_x$PXW-2XaRFswX_BnF{R#6YIwMhAgd5F9X zGmwdadS6(a^fjHtXg8=l?Rc0Sm%hk6E9!5cLVloEy4eh(=FwgP`)~I^5~pBEWo+F6 zSf2ncyMurJN91#cJTy_u8Y}@%!bq1RkGC~-bV@SXRd4F{R-*V`bS+6;W5vZ(&+I<9$;-V|eNfLa5n-6% z2(}&uGRF;p92eS*sE*oR$@pexaqr*meB)VhmIg@h{uzkk$9~qh#cHhw#>O%)b@+(| z^IQgqzuj~Sk(J;swEM-3TrJAPCq9k^^^`q{IItKBRXYe}e0Tdr=Huf7da3$l4PdpwWDop%^}n;dD#K4s#DYA8SHZ z&1!riV4W4R7R#C))JH1~axJ)RYnM$$lIR%6fIVA@zV{XVyx}C+a-Dt8Y9M)^KU0+H zR4IUb2CJ{Hg>CuaXtD50jB(_Tcx=Z$^WYu2u5kubqmwp%drJ6 z?Fo40g!Qd<-l=TQxqHEOuPX0;^z7iX?Ke^a%XT<13TA^5`4Xcw6D@Ur&VT&CUe0d} z1GjOVF1^L@>O)l@?bD~$wzgf(nxX1OGD8fEV?TdJcZc2KoUe|oP1#=$$7ee|xbY)A zDZq+cuTpc(fFdj^=!;{k03C69lMQ(|>uhRfRu%+!k&YOi-3|1QKB z z?n?eq1XP>p-IM$Z^C;2L3itnbJZAip*Zo0aw2bs8@(s^~*8T9go!%dHcAz2lM;`yp zD=7&xjFV$S&5uDaiScyD?B-i1ze`+CoRtz`Wn+Zl&#s4&}MO{@N!ufrzjG$B79)Y2d3tBk&)TxUTw@QS0TEL_?njX|@vq?Uz(nBFK5Pq7*xj#u*R&i|?7+6# z+|r_n#SW&LXhtheZdah{ZVoqwyT{D>MC3nkFF#N)xLi{p7J1jXlmVeb;cP5?e(=f# zuT7fvjSbjS781v?7{)-X3*?>tq?)Yd)~|1{BDS(pqC zC}~H#WXlkUW*H5CDOo<)#x7%RY)A;ShGhI5s*#cRDA8YgqG(HeKDx+#(ZQ?386dv! zlXCO)w91~Vw4AmOcATuV653fa9R$fyK8ul%rG z-wfS zihugoZyr38Im?Zuh6@RcF~t1anQu7>#lPpb#}4cOA!EM11`%f*07RqOVkmX{p~KJ9 z^zP;K#|)$`^Rb{rnHGH{~>1(fawV0*Z#)}M`m8-?ZJV<+e}s9wE# z)l&az?w^5{)`S(%MRzxdNqrs1n*-=jS^_jqE*5XDrA0+VE`5^*p3CuM<&dZEeCjoz zR;uu_H9ZPZV|fQq`Cyw4nscrVwi!fE6ciMmX$!_hN7uF;jjKG)d2@aC4ropY)8etW=xJvni)8eHi`H$%#zn^WJ5NLc-rqk|u&&4Z6fD_m&JfSI1Bvb?b<*n&sfl0^t z=HnmRl`XrFvMKB%9}>PaA`m-fK6a0(8=qPkWS5bb4=v?XcWi&hRY?O5HdulRi4?fN zlsJ*N-0Qw+Yic@s0(2uy%F@ib;GjXt01Fmx5XbRo6+n|pP(&nodMoap^z{~q ziEeaUT@Mxe3vJSfI6?uLND(CNr=#^W<1b}jzW58bIfyWTDle$mmS(|x-0|2UlX+9k zQ^EX7Nw}?EzVoBfT(-LT|=9N@^hcn-_p&sqG z&*oVs2JSU+N4ZD`FhCAWaS;>|wH2G*Id|?pa#@>tyxX`+4HyIArWDvVrX)2WAOQff z0qyHu&-S@i^MS-+j--!pr4fPBj~_8({~e1bfcl0wI1kaoN>mJL6KUPQm5N7lB(ui1 zE-o%kq)&djzWJ}ob<-GfDlkB;F31j-VHKvQUGQ3sp`CwyGJk_i!y^sD0fqC@$9|jO zOqN!r!8-p==F@ZVP=U$qSpY(gQ0)59P1&t@y?5rvg<}E+GB}26NYPp4f2YFQrQtot5mn3wu_qprZ=>Ig-$ zbW26Ws~IgY>}^5w`vTB(G`PTZaDiGBo5o(tp)qli|NeV( z@H_=R8V39rt5J5YB2Ky?4eJJ#b`_iBe2ot~6%7mLt5t8Vwi^Jy7|jWXqa3amOIoRb zOr}WVFP--DsS`1WpN%~)t3R!arKF^Q$e12KEqU36AWwnCBICpH4XCsfnyrHr>$I$4 z!DpKX$OKLWarN7nv@!uIA+~RNO)l$$w}p(;b>mx8pwYvu;dD_unryX_NhT8*Tj>BTrTTL&!?O+%Rv;b?B??gSzdp?6Uug9{ zd@V08Z$BdI?fpoCS$)t4mg4rT8Q_I}h`0d-vYZ^|dOB*Q^S|xqTV*vIg?@fVFSmMpaw0qtTRbx} z({Pg?#{2`sc9)M5N$*N|4;^t$+QP?#mov zGVC@I*lBVrOU-%2y!7%)fAKjpEFsgQc4{amtiHb95KQEwvf<(3T<9-Zm$xIew#P22 zc2Ix|App^>v6(3L_MCU0d3W##AB0M~3D00EWoKZqsJYT(#@w$Y_H7G22M~ApVFTRHMI_3be)Lkn#0F*V8Pq zc}`Cjy$bE;FJ6H7p=0y#R>`}-m4(0F>%@P|?7fx{=R^uFdISRnZ2W_xQhD{YuR3t< z{6yxu=4~JkeA;|(J6_nv#>Nvs&FuLA&PW^he@t(UwFFE8)|a!R{`E`K`i^ZnyE4$k z;(749Ix|oi$c3QbEJ3b~D_kQsPz~fIUKym($a_7dJ?o+40*OLl^{=&oq$<#Q(yyrp z{J-FAniyAw9tPbe&IhQ|a`DqFTVQGQ&Gq3!C2==4x{6EJwiPZ8zub-iXoUtkJiG{} zPaR&}_fn8_z~(=;5lD-aPWD3z8PZS@AaUiomF!G8I}Mf>e~0g#BelA-5#`cj;O5>N Xviia!U7SGha1wx#SCgwmn*{w2TRX*I diff --git a/document_ocr/tests/__init__.py b/document_ocr/tests/__init__.py deleted file mode 100644 index 7efb2857..00000000 --- a/document_ocr/tests/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- -# © 2016 Therp BV -# © 2017 ThinkOpen Solutions -# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). -from . import test_document_ocr diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py deleted file mode 100644 index 1d1a5490..00000000 --- a/document_ocr/tests/test_document_ocr.py +++ /dev/null @@ -1,62 +0,0 @@ -# -*- coding: utf-8 -*- -# © 2016 Therp BV -# © 2017 ThinkOpen Solutions -# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). -from StringIO import StringIO - -from PIL import Image, ImageDraw, ImageFont -from PIL import PdfImagePlugin, PalmImagePlugin # noqa # pylint: disable=unused-import -from odoo.tests.common import TransactionCase - -from ..models.ir_attachment import _MARKER_PHRASE - - -class TestDocumentOcr(TransactionCase): - def test_document_ocr(self): - self.env['ir.config_parameter'].set_param( - 'document_ocr.synchronous', 'True') - test_image = Image.new('RGB', (200, 30)) - draw = ImageDraw.Draw(test_image) - draw.text((3, 3), "Hello world", font=ImageFont.truetype( - '/usr/share/fonts/truetype/inconsolata/Inconsolata.otf', 24)) - # test a plain image - data = StringIO() - test_image.save(data, 'png') - attachment = self.env['ir.attachment'].create({ - 'name': 'testattachment', - 'datas_fname': 'test_png.pdf'}) - result = attachment._index( - data.getvalue(), 'test.png', None) - self.assertEqual(result.strip(), 'Hello world') - # should also work for pdfs - data = StringIO() - test_image.save(data, 'pdf', resolution=300) - attachment = self.env['ir.attachment'].create({ - 'name': 'testattachment', - 'datas_fname': 'test_pdf.pdf'}) - result = attachment._index( - data.getvalue(), 'test.pdf', None) - self.assertEqual(result.strip(), 'Hello world') - # check cron - self.env['ir.config_parameter'].set_param( - 'document_ocr.synchronous', 'False') - attachment = self.env['ir.attachment'].create({ - 'name': 'testattachment', - 'datas_fname': 'test_cron.pdf', - 'datas': data.getvalue().encode('base64'), - }) - self.assertEqual(attachment.index_content, _MARKER_PHRASE) - attachment._ocr_cron() - self.assertEqual(attachment.index_content.strip(), 'Hello world') - # and for an unreadable image, we expect an empty string - self.env['ir.config_parameter'].set_param( - 'document_ocr.synchronous', 'True') - data = StringIO() - test_image = Image.new('1', (200, 30)) - test_image.save(data, 'palm') - attachment = self.env['ir.attachment'].create({ - 'name': 'testattachment', - 'datas_fname': 'test_err.palm'}) - result = attachment._index( - data.getvalue(), 'test.palm', None) - self.assertEqual(result, '') diff --git a/document_ocr/views/ir_attachment_view.xml b/document_ocr/views/ir_attachment_view.xml deleted file mode 100644 index ed171d61..00000000 --- a/document_ocr/views/ir_attachment_view.xml +++ /dev/null @@ -1,43 +0,0 @@ - - - - - ir.attachment - - - - 1 - - - - - - - - - - - - - - ir.attachment - - - - - - - - - ir.attachment - - - - - - - - - - -