From d4a07e88c5438d3e667e47493e68e1c0ad539d3a Mon Sep 17 00:00:00 2001 From: Holger Brunn Date: Tue, 21 Jun 2016 18:30:19 +0200 Subject: [PATCH] [ADD] document_ocr --- .travis.yml | 6 +- document_ocr/README.rst | 86 ++++++++++++++++++++++ document_ocr/__init__.py | 4 + document_ocr/__openerp__.py | 23 ++++++ document_ocr/data/ir_config_parameter.xml | 13 ++++ document_ocr/data/ir_cron.xml | 13 ++++ document_ocr/models/__init__.py | 4 + document_ocr/models/ir_attachment.py | 84 +++++++++++++++++++++ document_ocr/static/description/icon.png | Bin 0 -> 9455 bytes document_ocr/tests/__init__.py | 4 + document_ocr/tests/test_document_ocr.py | 48 ++++++++++++ 11 files changed, 284 insertions(+), 1 deletion(-) create mode 100644 document_ocr/README.rst create mode 100644 document_ocr/__init__.py create mode 100644 document_ocr/__openerp__.py create mode 100644 document_ocr/data/ir_config_parameter.xml create mode 100644 document_ocr/data/ir_cron.xml create mode 100644 document_ocr/models/__init__.py create mode 100644 document_ocr/models/ir_attachment.py create mode 100644 document_ocr/static/description/icon.png create mode 100644 document_ocr/tests/__init__.py create mode 100644 document_ocr/tests/test_document_ocr.py diff --git a/.travis.yml b/.travis.yml index da6f1dbc..e42da917 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,5 @@ -sudo: false +sudo: required +dist: trusty cache: pip addons: @@ -6,6 +7,9 @@ addons: packages: - expect-dev # provides unbuffer utility - python-lxml # because pip installation is slow + - tesseract-ocr # document_ocr + - imagemagick # document_ocr + - fonts-inconsolata # document_ocr (for tests only) language: python diff --git a/document_ocr/README.rst b/document_ocr/README.rst new file mode 100644 index 00000000..c4d667f8 --- /dev/null +++ b/document_ocr/README.rst @@ -0,0 +1,86 @@ +.. 
image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg + :target: http://www.gnu.org/licenses/agpl-3.0-standalone.html + :alt: License: AGPL-3 + +================= +OCR for documents +================= + +This module was written to make uploaded documents, for example scans, searchable by running OCR on them. + +It supports PDFs and all image formats `Pillow supports `_ for reading. + +Installation +============ + +To install this module, you need to: + +#. install tesseract and the language(s) your documents use +#. if you want to support OCR on PDFs, install imagemagick +#. install the module itself + +On a Debian or Ubuntu system you would typically run:: + + $ sudo apt-get install tesseract-ocr imagemagick + + +Configuration +============= + +To configure this module, go to: + +#. Settings/Technical/Parameters/System parameters and review the parameters with names document_ocr.* + +Usage +===== + +By default, character recognition is done asynchronously by a cronjob at night. +This is because the recognition process takes a while and you don't want to make your users wait for the indexing to finish. +The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ``Settings``. +In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``. + +.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas + :alt: Try me on Runbot + :target: https://runbot.odoo-community.org/runbot/118/8.0 + +Bug Tracker +=========== + +Bugs are tracked on `GitHub Issues `_. +In case of trouble, please check there if your issue has already been reported. +If you spotted it first, help us smash it by providing detailed and welcome feedback. + +Credits +======= + +The actual work +--------------- + +* `tesseract `_ + +Images +------ + +* Odoo Community Association: `Icon `_. 
+ +Contributors +------------ + +* Holger Brunn + +Do not contact contributors directly about help with questions or problems concerning this addon, but use the `community mailing list `_ or the `appropriate specialized mailinglist `_ for help, and the bug tracker linked in `Bug Tracker`_ above for technical issues. + +Maintainer +---------- + +.. image:: https://odoo-community.org/logo.png + :alt: Odoo Community Association + :target: https://odoo-community.org + +This module is maintained by the OCA. + +OCA, or the Odoo Community Association, is a nonprofit organization whose +mission is to support the collaborative development of Odoo features and +promote its widespread use. + +To contribute to this module, please visit https://odoo-community.org. diff --git a/document_ocr/__init__.py b/document_ocr/__init__.py new file mode 100644 index 00000000..7eda98a2 --- /dev/null +++ b/document_ocr/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). +from . import models diff --git a/document_ocr/__openerp__.py b/document_ocr/__openerp__.py new file mode 100644 index 00000000..2c1a2696 --- /dev/null +++ b/document_ocr/__openerp__.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). 
+{ + "name": "OCR for documents", + "version": "8.0.1.0.0", + "author": "Therp BV,Odoo Community Association (OCA)", + "license": "AGPL-3", + "category": "Knowledge Management", + "summary": "Run character recognition on uploaded files", + "depends": [ + 'document', + ], + "data": [ + "data/ir_cron.xml", + "data/ir_config_parameter.xml", + ], + "external_dependencies": { + 'bin': [ + 'tesseract', + ], + }, +} diff --git a/document_ocr/data/ir_config_parameter.xml b/document_ocr/data/ir_config_parameter.xml new file mode 100644 index 00000000..e46db18a --- /dev/null +++ b/document_ocr/data/ir_config_parameter.xml @@ -0,0 +1,13 @@ + + + + + document_ocr.synchronous + False + + + document_ocr.dpi + 300 + + + diff --git a/document_ocr/data/ir_cron.xml b/document_ocr/data/ir_cron.xml new file mode 100644 index 00000000..f69d151a --- /dev/null +++ b/document_ocr/data/ir_cron.xml @@ -0,0 +1,13 @@ + + + + + Run OCR on uploaded documents + days + 1 + ir.attachment + _ocr_cron + -1 + + + diff --git a/document_ocr/models/__init__.py b/document_ocr/models/__init__.py new file mode 100644 index 00000000..a15f1b21 --- /dev/null +++ b/document_ocr/models/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). +from . import ir_attachment diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py new file mode 100644 index 00000000..ec161712 --- /dev/null +++ b/document_ocr/models/ir_attachment.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). 
+import logging +import subprocess +from PIL import Image +from StringIO import StringIO +from openerp import api, models + +_logger = logging.getLogger(__name__) +_MARKER_PHRASE = '[[waiting for OCR]]' + + +class IrAttachment(models.Model): + _inherit = 'ir.attachment' + + @api.model + def _index(self, data, datas_fname, file_type): + mimetype, content = super(IrAttachment, self)._index( + data, datas_fname, file_type) + if not content or content == 'image': + has_synchr_param = self.env['ir.config_parameter'].get_param( + 'document_ocr.synchronous', 'False') == 'True' + has_force_flag = self.env.context.get('document_ocr_force') + if has_synchr_param or has_force_flag: + content = self._index_ocr(mimetype, data, datas_fname, + file_type) + else: + content = _MARKER_PHRASE + + return mimetype, content + + @api.model + def _index_ocr(self, mimetype, data, datas_fname, file_type): + dpi = int( + self.env['ir.config_parameter'].get_param( + 'document_ocr.dpi', '500')) + top_type, sub_type = mimetype.split('/', 1) + if hasattr(self, '_index_ocr_get_data_%s' % sub_type): + image_data = getattr(self, '_index_ocr_get_data_%s' % sub_type)( + data, datas_fname, file_type, dpi) + else: + image_data = StringIO() + try: + Image.open(StringIO(data)).save(image_data, 'tiff', + dpi=(dpi, dpi)) + except IOError: + _logger.exception('Failed to OCR image') + return None + process = subprocess.Popen( + ['tesseract', 'stdin', 'stdout'], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, stderr = process.communicate(image_data.getvalue()) + if stderr: + _logger.error('Error during OCR: %s', stderr) + return stdout + + @api.model + def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi): + process = subprocess.Popen( + ['convert', '-density', str(dpi), '-', '-append', 'png32:-'], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, stderr = process.communicate(data) + if stderr: + 
_logger.error('Error converting to PDF: %s', stderr) + return StringIO(stdout) + + @api.model + def _ocr_cron(self): + for this in self.with_context(document_ocr_force=True).search([ + ('index_content', '=', _MARKER_PHRASE), + ]): + if not this.datas: + continue + file_type, index_content = this._index( + this.datas.decode('base64'), this.datas_fname, this.file_type) + this.write({ + 'file_type': file_type, + 'index_content': index_content, + }) diff --git a/document_ocr/static/description/icon.png b/document_ocr/static/description/icon.png new file mode 100644 index 0000000000000000000000000000000000000000..3a0328b516c4980e8e44cdb63fd945757ddd132d GIT binary patch literal 9455 zcmW++2RxMjAAjx~&dlBk9S+%}OXg)AGE&Cb*&}d0jUxM@u(PQx^-s)697TX`ehR4?GS^qbkof1cslKgkU)h65qZ9Oc=ml_0temigYLJfnz{IDzUf>bGs4N!v3=Z3jMq&A#7%rM5eQ#dc?k~! zVpnB`o+K7|Al`Q_U;eD$B zfJtP*jH`siUq~{KE)`jP2|#TUEFGRryE2`i0**z#*^6~AI|YzIWy$Cu#CSLW3q=GA z6`?GZymC;dCPk~rBS%eCb`5OLr;RUZ;D`}um=H)BfVIq%7VhiMr)_#G0N#zrNH|__ zc+blN2UAB0=617@>_u;MPHN;P;N#YoE=)R#i$k_`UAA>WWCcEVMh~L_ zj--gtp&|K1#58Yz*AHCTMziU1Jzt_jG0I@qAOHsk$2}yTmVkBp_eHuY$A9)>P6o~I z%aQ?!(GqeQ-Y+b0I(m9pwgi(IIZZzsbMv+9w{PFtd_<_(LA~0H(xz{=FhLB@(1&qHA5EJw1>>=%q2f&^X>IQ{!GJ4e9U z&KlB)z(84HmNgm2hg2C0>WM{E(DdPr+EeU_N@57;PC2&DmGFW_9kP&%?X4}+xWi)( z;)z%wI5>D4a*5XwD)P--sPkoY(a~WBw;E~AW`Yue4kFa^LM3X`8x|}ZUeMnqr}>kH zG%WWW>3ml$Yez?i%)2pbKPI7?5o?hydokgQyZsNEr{a|mLdt;X2TX(#B1j35xPnPW z*bMSSOauW>o;*=kO8ojw91VX!qoOQb)zHJ!odWB}d+*K?#sY_jqPdg{Sm2HdYzdEx zOGVPhVRTGPtv0o}RfVP;Nd(|CB)I;*t&QO8h zFfekr30S!-LHmV_Su-W+rEwYXJ^;6&3|L$mMC8*bQptyOo9;>Qb9Q9`ySe3%V$A*9 zeKEe+b0{#KWGp$F+tga)0RtI)nhMa-K@JS}2krK~n8vJ=Ngm?R!9G<~RyuU0d?nz# z-5EK$o(!F?hmX*2Yt6+coY`6jGbb7tF#6nHA zuKk=GGJ;ZwON1iAfG$E#Y7MnZVmrY|j0eVI(DN_MNFJmyZ|;w4tf@=CCDZ#5N_0K= z$;R~bbk?}TpfDjfB&aiQ$VA}s?P}xPERJG{kxk5~R`iRS(SK5d+Xs9swCozZISbnS zk!)I0>t=A<-^z(cmSFz3=jZ23u13X><0b)P)^1T_))Kr`e!-pb#q&J*Q`p+B6la%C zuVl&0duN<;uOsB3%T9Fp8t{ED108<+W(nOZd?gDnfNBC3>M8WE61$So|P zVvqH0SNtDTcsUdzaMDpT=Ty0pDHHNL@Z0w$Y`XO 
z2M-_r1S+GaH%pz#Uy0*w$Vdl=X=rQXEzO}d6J^R6zjM1u&c9vYLvLp?W7w(?np9x1 zE_0JSAJCPB%i7p*Wvg)pn5T`8k3-uR?*NT|J`eS#_#54p>!p(mLDvmc-3o0mX*mp_ zN*AeS<>#^-{S%W<*mz^!X$w_2dHWpcJ6^j64qFBft-o}o_Vx80o0>}Du;>kLts;$8 zC`7q$QI(dKYG`Wa8#wl@V4jVWBRGQ@1dr-hstpQL)Tl+aqVpGpbSfN>5i&QMXfiZ> zaA?T1VGe?rpQ@;+pkrVdd{klI&jVS@I5_iz!=UMpTsa~mBga?1r}aRBm1WS;TT*s0f0lY=JBl66Upy)-k4J}lh=P^8(SXk~0xW=T9v*B|gzIhN z>qsO7dFd~mgxAy4V?&)=5ieYq?zi?ZEoj)&2o)RLy=@hbCRcfT5jigwtQGE{L*8<@Yd{zg;CsL5mvzfDY}P-wos_6PfprFVaeqNE%h zKZhLtcQld;ZD+>=nqN~>GvROfueSzJD&BE*}XfU|H&(FssBqY=hPCt`d zH?@s2>I(|;fcW&YM6#V#!kUIP8$Nkdh0A(bEVj``-AAyYgwY~jB zT|I7Bf@%;7aL7Wf4dZ%VqF$eiaC38OV6oy3Z#TER2G+fOCd9Iaoy6aLYbPTN{XRPz z;U!V|vBf%H!}52L2gH_+j;`bTcQRXB+y9onc^wLm5wi3-Be}U>k_u>2Eg$=k!(l@I zcCg+flakT2Nej3i0yn+g+}%NYb?ta;R?(g5SnwsQ49U8Wng8d|{B+lyRcEDvR3+`O{zfmrmvFrL6acVP%yG98X zo&+VBg@px@i)%o?dG(`T;n*$S5*rnyiR#=wW}}GsAcfyQpE|>a{=$Hjg=-*_K;UtD z#z-)AXwSRY?OPefw^iI+ z)AXz#PfEjlwTes|_{sB?4(O@fg0AJ^g8gP}ex9Ucf*@_^J(s_5jJV}c)s$`Myn|Kd z$6>}#q^n{4vN@+Os$m7KV+`}c%4)4pv@06af4-x5#wj!KKb%caK{A&Y#Rfs z-po?Dcb1({W=6FKIUirH&(yg=*6aLCekcKwyfK^JN5{wcA3nhO(o}SK#!CINhI`-I z1)6&n7O&ZmyFMuNwvEic#IiOAwNkR=u5it{B9n2sAJV5pNhar=j5`*N!Na;c7g!l$ z3aYBqUkqqTJ=Re-;)s!EOeij=7SQZ3Hq}ZRds%IM*PtM$wV z@;rlc*NRK7i3y5BETSKuumEN`Xu_8GP1Ri=OKQ$@I^ko8>H6)4rjiG5{VBM>B|%`&&s^)jS|-_95&yc=GqjNo{zFkw%%HHhS~e=s zD#sfS+-?*t|J!+ozP6KvtOl!R)@@-z24}`9{QaVLD^9VCSR2b`b!KC#o;Ki<+wXB6 zx3&O0LOWcg4&rv4QG0)4yb}7BFSEg~=IR5#ZRj8kg}dS7_V&^%#Do==#`u zpy6{ox?jWuR(;pg+f@mT>#HGWHAJRRDDDv~@(IDw&R>9643kK#HN`!1vBJHnC+RM&yIh8{gG2q zA%e*U3|N0XSRa~oX-3EAneep)@{h2vvd3Xvy$7og(sayr@95+e6~Xvi1tUqnIxoIH zVWo*OwYElb#uyW{Imam6f2rGbjR!Y3`#gPqkv57dB6K^wRGxc9B(t|aYDGS=m$&S!NmCtrMMaUg(c zc2qC=2Z`EEFMW-me5B)24AqF*bV5Dr-M5ig(l-WPS%CgaPzs6p_gnCIvTJ=Y<6!gT zVt@AfYCzjjsMEGi=rDQHo0yc;HqoRNnNFeWZgcm?f;cp(6CNylj36DoL(?TS7eU#+ z7&mfr#y))+CJOXQKUMZ7QIdS9@#-}7y2K1{8)cCt0~-X0O!O?Qx#E4Og+;A2SjalQ zs7r?qn0H044=sDN$SRG$arw~n=+T_DNdSrarmu)V6@|?1-ZB#hRn`uilTGPJ@fqEy 
zGt(f0B+^JDP&f=r{#Y_wi#AVDf-y!RIXU^0jXsFpf>=Ji*TeqSY!H~AMbJdCGLhC) zn7Rx+sXw6uYj;WRYrLd^5IZq@6JI1C^YkgnedZEYy<&4(z%Q$5yv#Boo{AH8n$a zhb4Y3PWdr269&?V%uI$xMcUrMzl=;w<_nm*qr=c3Rl@i5wWB;e-`t7D&c-mcQl7x! zZWB`UGcw=Y2=}~wzrfLx=uet<;m3~=8I~ZRuzvMQUQdr+yTV|ATf1Uuomr__nDf=X zZ3WYJtHp_ri(}SQAPjv+Y+0=fH4krOP@S&=zZ-t1jW1o@}z;xk8 z(Nz1co&El^HK^NrhVHa-_;&88vTU>_J33=%{if;BEY*J#1n59=07jrGQ#IP>@u#3A z;!q+E1Rj3ZJ+!4bq9F8PXJ@yMgZL;>&gYA0%_Kbi8?S=XGM~dnQZQ!yBSgcZhY96H zrWnU;k)qy`rX&&xlDyA%(a1Hhi5CWkmg(`Gb%m(HKi-7Z!LKGRP_B8@`7&hdDy5n= z`OIxqxiVfX@OX1p(mQu>0Ai*v_cTMiw4qRt3~NBvr9oBy0)r>w3p~V0SCm=An6@3n)>@z!|o-$HvDK z|3D2ZMJkLE5loMKl6R^ez@Zz%S$&mbeoqH5`Bb){Ei21q&VP)hWS2tjShfFtGE+$z zzCR$P#uktu+#!w)cX!lWN1XU%K-r=s{|j?)Akf@q#3b#{6cZCuJ~gCxuMXRmI$nGtnH+-h z+GEi!*X=AP<|fG`1>MBdTb?28JYc=fGvAi2I<$B(rs$;eoJCyR6_bc~p!XR@O-+sD z=eH`-ye})I5ic1eL~TDmtfJ|8`0VJ*Yr=hNCd)G1p2MMz4C3^Mj?7;!w|Ly%JqmuW zlIEW^Ft%z?*|fpXda>Jr^1noFZEwFgVV%|*XhH@acv8rdGxeEX{M$(vG{Zw+x(ei@ zmfXb22}8-?Fi`vo-YVrTH*C?a8%M=Hv9MqVH7H^J$KsD?>!SFZ;ZsvnHr_gn=7acz z#W?0eCdVhVMWN12VV^$>WlQ?f;P^{(&pYTops|btm6aj>_Uz+hqpGwB)vWp0Cf5y< zft8-je~nn?W11plq}N)4A{l8I7$!ks_x$PXW-2XaRFswX_BnF{R#6YIwMhAgd5F9X zGmwdadS6(a^fjHtXg8=l?Rc0Sm%hk6E9!5cLVloEy4eh(=FwgP`)~I^5~pBEWo+F6 zSf2ncyMurJN91#cJTy_u8Y}@%!bq1RkGC~-bV@SXRd4F{R-*V`bS+6;W5vZ(&+I<9$;-V|eNfLa5n-6% z2(}&uGRF;p92eS*sE*oR$@pexaqr*meB)VhmIg@h{uzkk$9~qh#cHhw#>O%)b@+(| z^IQgqzuj~Sk(J;swEM-3TrJAPCq9k^^^`q{IItKBRXYe}e0Tdr=Huf7da3$l4PdpwWDop%^}n;dD#K4s#DYA8SHZ z&1!riV4W4R7R#C))JH1~axJ)RYnM$$lIR%6fIVA@zV{XVyx}C+a-Dt8Y9M)^KU0+H zR4IUb2CJ{Hg>CuaXtD50jB(_Tcx=Z$^WYu2u5kubqmwp%drJ6 z?Fo40g!Qd<-l=TQxqHEOuPX0;^z7iX?Ke^a%XT<13TA^5`4Xcw6D@Ur&VT&CUe0d} z1GjOVF1^L@>O)l@?bD~$wzgf(nxX1OGD8fEV?TdJcZc2KoUe|oP1#=$$7ee|xbY)A zDZq+cuTpc(fFdj^=!;{k03C69lMQ(|>uhRfRu%+!k&YOi-3|1QKB z z?n?eq1XP>p-IM$Z^C;2L3itnbJZAip*Zo0aw2bs8@(s^~*8T9go!%dHcAz2lM;`yp zD=7&xjFV$S&5uDaiScyD?B-i1ze`+CoRtz`Wn+Zl&#s4&}MO{@N!ufrzjG$B79)Y2d3tBk&)TxUTw@QS0TEL_?njX|@vq?Uz(nBFK5Pq7*xj#u*R&i|?7+6# 
z+|r_n#SW&LXhtheZdah{ZVoqwyT{D>MC3nkFF#N)xLi{p7J1jXlmVeb;cP5?e(=f# zuT7fvjSbjS781v?7{)-X3*?>tq?)Yd)~|1{BDS(pqC zC}~H#WXlkUW*H5CDOo<)#x7%RY)A;ShGhI5s*#cRDA8YgqG(HeKDx+#(ZQ?386dv! zlXCO)w91~Vw4AmOcATuV653fa9R$fyK8ul%rG z-wfS zihugoZyr38Im?Zuh6@RcF~t1anQu7>#lPpb#}4cOA!EM11`%f*07RqOVkmX{p~KJ9 z^zP;K#|)$`^Rb{rnHGH{~>1(fawV0*Z#)}M`m8-?ZJV<+e}s9wE# z)l&az?w^5{)`S(%MRzxdNqrs1n*-=jS^_jqE*5XDrA0+VE`5^*p3CuM<&dZEeCjoz zR;uu_H9ZPZV|fQq`Cyw4nscrVwi!fE6ciMmX$!_hN7uF;jjKG)d2@aC4ropY)8etW=xJvni)8eHi`H$%#zn^WJ5NLc-rqk|u&&4Z6fD_m&JfSI1Bvb?b<*n&sfl0^t z=HnmRl`XrFvMKB%9}>PaA`m-fK6a0(8=qPkWS5bb4=v?XcWi&hRY?O5HdulRi4?fN zlsJ*N-0Qw+Yic@s0(2uy%F@ib;GjXt01Fmx5XbRo6+n|pP(&nodMoap^z{~q ziEeaUT@Mxe3vJSfI6?uLND(CNr=#^W<1b}jzW58bIfyWTDle$mmS(|x-0|2UlX+9k zQ^EX7Nw}?EzVoBfT(-LT|=9N@^hcn-_p&sqG z&*oVs2JSU+N4ZD`FhCAWaS;>|wH2G*Id|?pa#@>tyxX`+4HyIArWDvVrX)2WAOQff z0qyHu&-S@i^MS-+j--!pr4fPBj~_8({~e1bfcl0wI1kaoN>mJL6KUPQm5N7lB(ui1 zE-o%kq)&djzWJ}ob<-GfDlkB;F31j-VHKvQUGQ3sp`CwyGJk_i!y^sD0fqC@$9|jO zOqN!r!8-p==F@ZVP=U$qSpY(gQ0)59P1&t@y?5rvg<}E+GB}26NYPp4f2YFQrQtot5mn3wu_qprZ=>Ig-$ zbW26Ws~IgY>}^5w`vTB(G`PTZaDiGBo5o(tp)qli|NeV( z@H_=R8V39rt5J5YB2Ky?4eJJ#b`_iBe2ot~6%7mLt5t8Vwi^Jy7|jWXqa3amOIoRb zOr}WVFP--DsS`1WpN%~)t3R!arKF^Q$e12KEqU36AWwnCBICpH4XCsfnyrHr>$I$4 z!DpKX$OKLWarN7nv@!uIA+~RNO)l$$w}p(;b>mx8pwYvu;dD_unryX_NhT8*Tj>BTrTTL&!?O+%Rv;b?B??gSzdp?6Uug9{ zd@V08Z$BdI?fpoCS$)t4mg4rT8Q_I}h`0d-vYZ^|dOB*Q^S|xqTV*vIg?@fVFSmMpaw0qtTRbx} z({Pg?#{2`sc9)M5N$*N|4;^t$+QP?#mov zGVC@I*lBVrOU-%2y!7%)fAKjpEFsgQc4{amtiHb95KQEwvf<(3T<9-Zm$xIew#P22 zc2Ix|App^>v6(3L_MCU0d3W##AB0M~3D00EWoKZqsJYT(#@w$Y_H7G22M~ApVFTRHMI_3be)Lkn#0F*V8Pq zc}`Cjy$bE;FJ6H7p=0y#R>`}-m4(0F>%@P|?7fx{=R^uFdISRnZ2W_xQhD{YuR3t< z{6yxu=4~JkeA;|(J6_nv#>Nvs&FuLA&PW^he@t(UwFFE8)|a!R{`E`K`i^ZnyE4$k z;(749Ix|oi$c3QbEJ3b~D_kQsPz~fIUKym($a_7dJ?o+40*OLl^{=&oq$<#Q(yyrp z{J-FAniyAw9tPbe&IhQ|a`DqFTVQGQ&Gq3!C2==4x{6EJwiPZ8zub-iXoUtkJiG{} zPaR&}_fn8_z~(=;5lD-aPWD3z8PZS@AaUiomF!G8I}Mf>e~0g#BelA-5#`cj;O5>N Xviia!U7SGha1wx#SCgwmn*{w2TRX*I literal 0 HcmV?d00001 diff --git 
a/document_ocr/tests/__init__.py b/document_ocr/tests/__init__.py new file mode 100644 index 00000000..7bdf742c --- /dev/null +++ b/document_ocr/tests/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). +from . import test_document_ocr diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py new file mode 100644 index 00000000..7dccb672 --- /dev/null +++ b/document_ocr/tests/test_document_ocr.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). +from PIL import Image, ImageDraw, ImageFont +from StringIO import StringIO +from openerp.tests.common import TransactionCase +from openerp.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE + + +class TestDocumentOcr(TransactionCase): + def test_document_ocr(self): + self.env['ir.config_parameter'].set_param( + 'document_ocr.synchronous', 'True') + test_image = Image.new('RGB', (200, 30)) + draw = ImageDraw.Draw(test_image) + draw.text((3, 3), "Hello world", font=ImageFont.truetype( + '/usr/share/fonts/truetype/inconsolata/Inconsolata.otf', 24)) + # test a plain image + data = StringIO() + test_image.save(data, 'png') + result = self.env['ir.attachment']._index( + data.getvalue(), 'test.png', None) + self.assertEqual(result[1].strip(), 'Hello world') + # should also work for pdfs + data = StringIO() + test_image.save(data, 'pdf', resolution=300) + result = self.env['ir.attachment']._index( + data.getvalue(), 'test.pdf', None) + self.assertEqual(result[1].strip(), 'Hello world') + # check cron + self.env['ir.config_parameter'].set_param( + 'document_ocr.synchronous', 'False') + attachment = self.env['ir.attachment'].create({ + 'name': 'testattachment', + 'datas': data.getvalue().encode('base64'), + }) + self.assertEqual(attachment.index_content, _MARKER_PHRASE) + attachment._ocr_cron() + 
self.assertEqual(attachment.index_content.strip(), 'Hello world') + # and for an unreadable image, we expect an error + self.env['ir.config_parameter'].set_param( + 'document_ocr.synchronous', 'True') + data = StringIO() + test_image = Image.new('1', (200, 30)) + test_image.save(data, 'Palm') + result = self.env['ir.attachment']._index( + data.getvalue(), 'test.palm', None) + self.assertEqual(result[1], None)