From 1705cefe6b31b45819ab492e9b477bf87d0f2a74 Mon Sep 17 00:00:00 2001
From: Carlos Almeida <carlos.almeida@tkobr.com>
Date: Tue, 30 May 2017 15:54:45 +0100
Subject: [PATCH 01/16] [MIG] Migrate document-ocr to v10.0

---
 document_ocr/README.rst                   |  86 ++++++++++++++++++++++
 document_ocr/__init__.py                  |   4 +
 document_ocr/__manifest__.py              |  23 ++++++
 document_ocr/data/ir_config_parameter.xml |  13 ++++
 document_ocr/data/ir_cron.xml             |  13 ++++
 document_ocr/models/__init__.py           |   4 +
 document_ocr/models/ir_attachment.py      |  85 +++++++++++++++++++++
 document_ocr/static/description/icon.png  | Bin 0 -> 9455 bytes
 document_ocr/tests/__init__.py            |   4 +
 document_ocr/tests/test_document_ocr.py   |  49 ++++++++++++
 10 files changed, 281 insertions(+)
 create mode 100644 document_ocr/README.rst
 create mode 100644 document_ocr/__init__.py
 create mode 100644 document_ocr/__manifest__.py
 create mode 100644 document_ocr/data/ir_config_parameter.xml
 create mode 100644 document_ocr/data/ir_cron.xml
 create mode 100644 document_ocr/models/__init__.py
 create mode 100644 document_ocr/models/ir_attachment.py
 create mode 100644 document_ocr/static/description/icon.png
 create mode 100644 document_ocr/tests/__init__.py
 create mode 100644 document_ocr/tests/test_document_ocr.py

diff --git a/document_ocr/README.rst b/document_ocr/README.rst
new file mode 100644
index 00000000..7f9c3b28
--- /dev/null
+++ b/document_ocr/README.rst
@@ -0,0 +1,86 @@
+.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg
+    :target: http://www.gnu.org/licenses/agpl-3.0-standalone.html
+    :alt: License: AGPL-3
+
+=================
+OCR for documents
+=================
+
+This module was written to make uploaded documents, for example scans, searchable by running OCR on them.
+
+It supports PDFs and all image formats that `Pillow can read <http://pillow.readthedocs.io/en/3.2.x/handbook/image-file-formats.html>`_.
+
+Installation
+============
+
+To install this module, you need to:
+
+#. install tesseract and the language(s) your documents use
+#. if you want to support OCR on PDFs, install imagemagick
+#. install the module itself
+
+On a Debian or Ubuntu system you would typically run::
+
+    $ sudo apt-get install tesseract-ocr imagemagick
+
+
+Configuration
+=============
+
+To configure this module, go to:
+
+#. Settings/Technical/Parameters/System parameters and review the parameters with names document_ocr.*
+
+Usage
+=====
+
+By default, character recognition is done asynchronously by a cronjob at night. 
+This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish.
+The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ` `Settings``.
+In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``.
+
+.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas
+    :alt: Try me on Runbot
+    :target: https://runbot.odoo-community.org/runbot/118/10.0
+
+Bug Tracker
+===========
+
+Bugs are tracked on `GitHub Issues <https://github.com/OCA/knowledge/issues>`_.
+In case of trouble, please check there if your issue has already been reported.
+If you spotted it first, help us smash it by providing detailed and welcomed feedback.
+
+Credits
+=======
+
+The actual work
+---------------
+
+* `tesseract <https://github.com/tesseract-ocr>`_
+
+Images
+------
+
+* Odoo Community Association: `Icon <https://github.com/OCA/maintainer-tools/blob/master/template/module/static/description/icon.svg>`_.
+
+Contributors
+------------
+
+* Holger Brunn <hbrunn@therp.nl>  
+
+Do not contact contributors directly about help with questions or problems concerning this addon, but use the `community mailing list <mailto:community@mail.odoo.com>`_ or the `appropriate specialized mailinglist <https://odoo-community.org/groups>`_ for help, and the bug tracker linked in `Bug Tracker`_ above for technical issues.
+
+Maintainer
+----------
+
+.. image:: https://odoo-community.org/logo.png
+   :alt: Odoo Community Association
+   :target: https://odoo-community.org
+
+This module is maintained by the OCA.
+
+OCA, or the Odoo Community Association, is a nonprofit organization whose
+mission is to support the collaborative development of Odoo features and
+promote its widespread use.
+
+To contribute to this module, please visit https://odoo-community.org.
diff --git a/document_ocr/__init__.py b/document_ocr/__init__.py
new file mode 100644
index 00000000..7eda98a2
--- /dev/null
+++ b/document_ocr/__init__.py
@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+# © 2016 Therp BV <http://therp.nl>
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+from . import models
diff --git a/document_ocr/__manifest__.py b/document_ocr/__manifest__.py
new file mode 100644
index 00000000..382e77d6
--- /dev/null
+++ b/document_ocr/__manifest__.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+# © 2016 Therp BV <http://therp.nl>
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+{
+    "name": "OCR for documents",
+    "version": "10.0.1.0.0",
+    "author": "Therp BV,Odoo Community Association (OCA), TKO Brasil",
+    "license": "AGPL-3",
+    "category": "Knowledge Management",
+    "summary": "Run character recognition on uploaded files",
+    "depends": [
+        'document',
+    ],
+    "data": [
+        "data/ir_cron.xml",
+        "data/ir_config_parameter.xml",
+    ],
+    "external_dependencies": {
+        'bin': [
+            'tesseract',
+        ],
+    },
+}
diff --git a/document_ocr/data/ir_config_parameter.xml b/document_ocr/data/ir_config_parameter.xml
new file mode 100644
index 00000000..e46db18a
--- /dev/null
+++ b/document_ocr/data/ir_config_parameter.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<openerp>
+    <data noupdate="1">
+        <record id="param_synchronous" model="ir.config_parameter">
+            <field name="key">document_ocr.synchronous</field>
+            <field name="value">False</field>
+        </record>
+        <record id="param_dpi" model="ir.config_parameter">
+            <field name="key">document_ocr.dpi</field>
+            <field name="value">300</field>
+        </record>
+    </data>
+</openerp>
diff --git a/document_ocr/data/ir_cron.xml b/document_ocr/data/ir_cron.xml
new file mode 100644
index 00000000..f69d151a
--- /dev/null
+++ b/document_ocr/data/ir_cron.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<openerp>
+    <data noupdate="1">
+        <record id="cron" model="ir.cron">
+            <field name="name">Run OCR on uploaded documents</field>
+            <field name="interval_type">days</field>
+            <field name="interval_number">1</field>
+            <field name="model">ir.attachment</field>
+            <field name="function">_ocr_cron</field>
+            <field name="numbercall">-1</field>
+        </record>
+    </data>
+</openerp>
diff --git a/document_ocr/models/__init__.py b/document_ocr/models/__init__.py
new file mode 100644
index 00000000..a15f1b21
--- /dev/null
+++ b/document_ocr/models/__init__.py
@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+# © 2016 Therp BV <http://therp.nl>
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+from . import ir_attachment
diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py
new file mode 100644
index 00000000..b27992c8
--- /dev/null
+++ b/document_ocr/models/ir_attachment.py
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+# © 2016 Therp BV <http://therp.nl>
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+import logging
+import subprocess
+from StringIO import StringIO
+
+from PIL import Image
+from openerp import api, models
+
+_logger = logging.getLogger(__name__)
+_MARKER_PHRASE = '[[waiting for OCR]]'
+
+
+class IrAttachment(models.Model):
+    _inherit = 'ir.attachment'
+
+    @api.model
+    def _index(self, data, datas_fname, file_type):
+        mimetype, content = super(IrAttachment, self)._index(
+            data, datas_fname, file_type)
+        if not content or content == 'image':
+            has_synchr_param = self.env['ir.config_parameter'].get_param(
+                'document_ocr.synchronous', 'False') == 'True'
+            has_force_flag = self.env.context.get('document_ocr_force')
+            if has_synchr_param or has_force_flag:
+                content = self._index_ocr(mimetype, data, datas_fname,
+                                          file_type)
+            else:
+                content = _MARKER_PHRASE
+
+        return mimetype, content
+
+    @api.model
+    def _index_ocr(self, mimetype, data, datas_fname, file_type):
+        dpi = int(
+            self.env['ir.config_parameter'].get_param(
+                'document_ocr.dpi', '500'))
+        top_type, sub_type = mimetype.split('/', 1)
+        if hasattr(self, '_index_ocr_get_data_%s' % sub_type):
+            image_data = getattr(self, '_index_ocr_get_data_%s' % sub_type)(
+                data, datas_fname, file_type, dpi)
+        else:
+            image_data = StringIO()
+            try:
+                Image.open(StringIO(data)).save(image_data, 'tiff',
+                                                dpi=(dpi, dpi))
+            except IOError:
+                _logger.exception('Failed to OCR image')
+                return None
+        process = subprocess.Popen(
+            ['tesseract', 'stdin', 'stdout'],
+            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        stdout, stderr = process.communicate(image_data.getvalue())
+        if stderr:
+            _logger.error('Error during OCR: %s', stderr)
+        return stdout
+
+    @api.model
+    def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi):
+        process = subprocess.Popen(
+            ['convert', '-density', str(dpi), '-', '-append', 'png32:-'],
+            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        stdout, stderr = process.communicate(data)
+        if stderr:
+            _logger.error('Error converting to PDF: %s', stderr)
+        return StringIO(stdout)
+
+    @api.model
+    def _ocr_cron(self):
+        for this in self.with_context(document_ocr_force=True).search([
+            ('index_content', '=', _MARKER_PHRASE),
+        ]):
+            if not this.datas:
+                continue
+            file_type, index_content = this._index(
+                this.datas.decode('base64'), this.datas_fname, this.file_type)
+            this.write({
+                'file_type': file_type,
+                'index_content': index_content,
+            })
diff --git a/document_ocr/static/description/icon.png b/document_ocr/static/description/icon.png
new file mode 100644
index 0000000000000000000000000000000000000000..3a0328b516c4980e8e44cdb63fd945757ddd132d
GIT binary patch
literal 9455
zcmW++2RxMjAAjx~&dlBk9S+%}OXg)AGE&Cb*&}<C%<R2Kc9faym6aW`f0Dh5$js*d
z_}}Z!;XIG;_cPz`_vag-p{7Ve$Uq1H00~A(?iu(VaQlMefnU3&OozZXm@69d91cGG
z;O61r&je0NdamH#&)mKsXk?Zb_)B^>d0jUxM@u(PQx^-s)6<jB#=*|j%+$$(&(Xyy
zYgd8+09XKwoa}S2?48%%ZU#L~n^g{fE40h%YEx5by;GuJ(K|vI1uO;0m}=J7R4@Bs
z>97TX<v{QF=s?r`z`m$a0ZvrhBU7}{15RN9M`j!kv_Mp+3~{||YNuFzHNvhMYm3=Q
zo!q+OJ5(%-oNM^I^M%*luKMiVL`lo`bcKGymX7i3MIFWj1i|9+CGNvn`cu-RsK0Qh
zoYlwB>`ehR4?GS^qbkof1cslKgk<Uw6DeIxZyT_?=OwX|lwC*A?ac*gHF7jmS080$
z>U)h65qZ9Oc=ml_0temigYLJfnz{IDzUf>bGs4N!v3=Z3jMq&A#7%rM5eQ#dc?k~!
zVpnB`o+K7|Al`Q_U<UrcmRnh6jExs}l(hZs0%T~Dnpu-NENdhioRv(T{Qmv>;eD$B
zfJtP*jH`siUq~{KE)`jP2|#TUEFGRryE2`i0**z#*^6~AI|YzIWy$Cu#CSLW3q=GA
z6`?GZymC;dCPk~rBS%eCb`5OLr;RUZ;D`}um=H)BfVIq%7VhiMr)_#G0N#zrNH|__
zc+blN2UAB0=617@>_<D4$IPL<k1zq(*VmlDPer&h1n6`AG;9A!_W=N$JthhQWXZ<i
zGURc6f<i*joK3xSWaOOXxAc8ES>u;MPHN;P;N#YoE=)R#i$k_`UAA>WWCcEVMh~L_
zj--gtp&|K1#58Yz*AHCTMziU1Jzt_jG0I@qAOHsk$2}yTmVkBp_eHuY$A9)>P6o~I
z%aQ?!(GqeQ-Y+b0I(m9pwgi(IIZZzsbMv+9w{PFtd_<_(LA~0H(xz<Z(Qt1jC2cC|
z6WbMo9YgON{L#ZDl$sV4*<CP(>{=FhLB@(1&qHA5EJw1>>=%q2f&^X>IQ{!GJ4e9U
z&KlB)z(84HmNgm2hg2C0>WM{E(DdPr+EeU_N@57;PC2&DmGFW_9kP&%?X4}+xWi)(
z;)z%wI5>D4a*5XwD)P--sPkoY(a~WBw;E~AW`Yue4kFa^LM3X`8x|}ZUeMnqr}>kH
zG%WWW>3ml$Yez?i%)2pbKPI7?5o?hydokgQyZsNEr{a|mLdt;X2TX(#B1j35xPnPW
z*bMSSOauW>o;*=kO8ojw91VX!qoOQb)zHJ!odWB}d+*K?#sY_jqPdg{Sm2HdYzdEx
zOGVPhVRTGPtv0o}RfVP;Nd(|CB)<HrC1(%ZOEd8PI?r9b_$Cp-${bh59Z_R7n&YCp
zl8lfMpes*8{FVm;oJH@0LLoWmxD59u?JwHz2iRrAaE0`Hc&g;t5~$P*kdeOZn9OJ3
zRczo@ZkWU)RG+hcfH~{49Vvb3D(W0wRX#|$cA0Iqy^VR~(4hqA2AFI-rK!FM!#fJ_
zGFI@iqJOPXZ!=VjTS3dMuu~9xU3K1&?th;$)cqM#WGxbDEyB$y`#9j%>I;*t&QO8h
zFfekr30S!-LHmV_Su-W+rEwYXJ^;6&3|L$mMC8*bQptyOo9;>Qb9Q9`ySe3%V$A*9
zeKEe+b0{#KWGp$F+tga)0RtI)nhMa-K@JS}2krK~n8vJ=Ngm?R!9G<~RyuU0d?nz#
z-5EK$o(!F?hmX*2Yt6+coY`6jGbb7t<dboft~zeDZwK=+l2bFWs5^+|r{J6GO9Cwl
z&SW58BRs?XdFLuhtzl7WLbQ#~z#`jAB3AbSUg6jW6@qVd2Q~9qN(izT1y*>F#6nHA
zuKk=GGJ;ZwON1iAfG$E#Y7MnZVmrY|j0eVI(DN_MNFJmyZ|;w4tf@=CCDZ#5N_0K=
z$;R~bbk?}TpfDjfB&aiQ$VA}s?P}xPERJG{kxk5~R`iRS(SK5d+Xs9swCozZISbnS
zk!)I0>t=A<-^z(cmSFz3=jZ23u13X><0b)P)^1T_))Kr`e!-pb#q&J*Q`p+B6la%C
zuVl&0duN<;uOsB3%T9Fp8t{ED108<?)`y_~Hnd9AUX7h-H?jVuU|}My+C=TjH(jKz
zqMVr0re3S$H@t{zI95qa)+Crz*5Zj}Ao%4Z><+W(nOZd?gDnfNBC3>M8WE61$So|P
zVvqH0SNtDTcs<xiU1;=a39$d&&l5EwoIH1db#`S9Kw!YC1jhR)VbCWMptlUUkpfif
zMzi-i8)WL?|C$*Q?iqbS{w<lA9XN(rD)VS<zn>UdzaMDpT=Ty0pDHHNL@Z0w$Y`XO
z2M-_r1S+GaH%pz#Uy0*w$Vdl=X=rQXEzO}d6J^R6zjM1u&c9vYLvLp?W7w(?np9x1
zE_0JSAJCPB%i7p*Wvg)pn5T`8k3-uR?*NT|J`eS#_#54p>!p(mLDvmc-3o0mX*mp_
zN*AeS<>#^-{S%W<*mz^!X$w_2dHWpcJ6^j64qFBft-o}o_Vx80o0>}Du;>kLts;$8
zC`7q$QI(dKYG`Wa8#wl@V4jVWBRGQ@1dr-hstpQL)Tl+aqVpGpbSfN>5i&QMXfiZ>
zaA?T1VGe?rpQ@;+pkrVdd{klI&jVS@I5_iz!=UMpTsa~mBga?1r}a<Xwa$pLotZk<
zsykXwQO37I=aXew7x=}YWiW)^p)|C#g+)an!@j?EcY8C0t)BlK#y{YLtkJ6=D6H-5
zp2*ANf@>RBm1WS;TT*s0f0lY=JBl66Upy)-k4J}lh=P^8(SXk~0xW=T9v*B|gzIhN
z>qsO7dFd~mgxAy4V?&)=5ieYq?zi?ZEoj)&2o)RLy=<bK!vY7J*RP!&i_xU}i*o6o
z?xN*1<rEe1L2HBkCSgiYM3a|4w?Bo7CJL7?Eh;9An1m$1t?h1v92<Xg2(#)bjbL|o
zH_H0}!Og=1dOBynXHu>@hbCRcfT5ji<pmK_U*~T(A$I-*rM$8-BCv{=@=7F)2pdtU
z5==tp!}A*&Xgf{F>gwtQGE{L*8<@Yd{zg;CsL5mvzfDY}P-wos_6PfprFVaeqNE%h
zKZhLtcQld;ZD+>=nqN~>GvROfueSzJD&<KAVm#0W>BE*}XfU|H&(FssBqY=hPCt`d
zH?@s2>I(|;fcW&YM6#V<T#ysvE$@4oRO>#!kUIP8$Nkdh0A(bEVj``-AAyYgwY~jB
zT|I7Bf@%;7aL7Wf4dZ%VqF$eiaC38OV6oy3Z#TER2G+fOCd9Iaoy6aLYbPTN{XRPz
z;U!V|vBf%H!}52L2gH_+j;`bTcQRXB+y9onc^wLm5wi3-Be}U>k_u>2Eg$=k!(l@I
zcCg+flakT2Nej3i0yn+g+}%NYb?ta;R<sv-KjYb}UVFjITQ)VAEWu*)Lz7*-f^A*H
z<25#Ynt~-Qh?$wWx4$1=T2`j@bKp!)3IXh(LC@)%WGW$+j(tGz(6#bGw&K0j=Np@B
zt}@t$R`z(>?(g5SnwsQ49U8Wng8d|{B+lyRcEDvR3+`O{zfmrmvFrL6acVP%yG98X
zo&+VBg@px@i)%o?dG(`T;n*$S5*rnyiR#=wW}}GsAcfyQpE|>a{=$Hjg=-*_K;UtD
z<sX7(ZJc+Q;#xP8uk`jnF@a^|TWw*55if6@@}%7lOBaGo9Ia-e?`RPQc^w^EWfhe}
zHWHTvDA+r}Xf5Yqpr;QU-88%QC9F_>#z-)AXwSRY?<M%E84!g*1GC?E>OPefw^iI+
z)AXz#PfEjlwTes|_{sB?4(O@fg0AJ^g8gP}ex9Ucf*@_^J(s_5jJV}c)s$`Myn|Kd
z$<h)Fm>6>}#q^n{4vN@+Os$m7KV+`}c%4)4pv@06af4-x5#wj!KKb%caK{A&Y#Rfs
z-po?Dcb1({W=6FKIUirH&(yg=*6aLCekcKwyfK^JN5{wcA3nhO(o}SK#!CINhI`-I
z1)6&n7O&ZmyFMuNwvEic#IiOAwNkR=u5it{B9n2sAJV5pNhar=j5`*N!Na;c7g!l$
z3aYBqUkqqTJ=Re-;)s!EOeij=7SQZ3Hq}ZRds%IM*PtM$wV<GYik+VfZene%bm(n6
z`r@rc^TVw7EN{|O7rORMlw)zt(Z#enc3joE#IIk!M)L8gk^go9E9AxiP9o#OqmvV>
z@;rlc*NRK7i3y5BETSKuumEN`Xu_8<BQ)z0!-JuC8x}?$qo8SEko}oUWNI(4h*VHO
zAi!Egy!f(o9aHs2dhSE6ki(be*&w&+WLOB<(b3T_Hide(NvVvLQV{BN|2?UJFaY*@
z9CXA5B_*8i5Bf6sn~d`R>GP1Ri=OK<SGIb#BCTo3qI#UBUg+dkR+2ilUwIg%XFV;l
z+>Q$@I^ko8>H6)4rjiG5{VBM>B|%`&&s^)jS|-_95&yc=GqjNo{zFkw%%HHhS~e=s
zD#sfS+-?*t|J!+ozP6KvtOl!R)@@-z24}`9{QaVLD^9VCSR2b`b!KC#o;Ki<+wXB6
zx3&O0LOWcg4&rv4QG0)4yb}7BFSEg~=IR5<g6yi=XozU}zReXL{rA%c`$IQAs<ObZ
zep*ITZ0C(~W*`?XH^l1t7&+qigAfY9tVJ5DH!~jY>#ZRj8kg}dS7_V&^%#Do==#`u
zpy6{ox?jWuR(;pg+f@mT>#HGWHAJRRDDDv~@(IDw&R>9643kK<aUZ^OhBu;1e6t#{
zxKehVgz`HT;A`FMivG<tq0OcrXwHUX_<$j<uk%m>#HN`!1vBJHnC+RM&yIh8{gG2q
zA%e*U3|N0XSRa~oX-3EAneep)@{h2vvd3Xvy$7og(sayr@95+e6~Xvi1tUqnIxoIH
zVWo*OwYElb#uyW{Imam6f2<eMTQj#)z8+N&ZY?tSPo%&2eVNU=4=?rlO<*9Twaxco
zEVE=Jh?_x>rGbjR!Y3`#gPqkv57dB6K^wRGxc9B(t|aYDGS=m$&S!NmCtrMMaUg(c
zc2qC=2Z`EEFMW-me5B)24AqF*bV5Dr-M5ig(l-WPS%CgaPzs6p_gnCIvTJ=Y<6!gT
zVt@AfYCzjjsMEGi=rDQHo0yc;HqoRNnNFeWZgcm?f;cp(6CNylj36DoL(?TS7eU#+
z7&mfr#y))+CJOXQKUMZ7QIdS9@#-}7y2K1{8)cCt0~-X0O!O?Qx#E4Og+;A2SjalQ
zs7r?qn0H044=sDN$SRG$arw~n=+T_DNdSrarmu)V6@|?1-ZB#hRn`uilTGPJ@fqEy
zGt(f0B+^JDP&f=r{#Y_wi#AVDf-y!RIXU^0jXsFpf>=Ji*TeqSY!H~AMbJdCGLhC)
zn7Rx+sXw6uYj;WRYrLd^5IZq@6JI1C^YkgnedZEYy<&4(z%Q$5yv#B<pkvWZft^C&
z;+zNo+VJNluxaDF!`gW+F0=MxsCQ~~#CYKa;ULfj5hTa8a6;d*(<eeQ7-ZQgz2et^
zf|7URfj;x*#HiF0wuBCOTEpbeD&kkENqolCm2#cQGHCeEC<&I3j+P6?sX?BPp4~46
zx-S~EJ>oo{AH8n$<d4loB?vL3RqXwK_COrQ6xRoWGOdFnWy(hhzrG8k;2k}8Zj&>a
zhb4Y3PWdr269&?V%uI$xMcUrMzl=;w<_nm*qr=c3Rl@i5wWB;e-`t7D&c-mcQl7x!
zZWB`UGcw=Y2=}~wzrfLx=uet<;m3~=8I~ZRuzvMQUQdr+yTV|ATf1Uuomr__nDf=X
zZ3WYJtHp_ri(}SQAPjv+Y+0=<GD??Na(2AG`s>fH4krOP@S&=zZ-t1jW1o@}z;xk8
z(Nz1co&El^HK^NrhVHa-_;&88vTU>_J33=%{if;BEY*J#1n59=07jrGQ#IP>@u#3A
z;!q+E1Rj3ZJ+!4bq9F8PXJ@yMgZL;>&gYA0%_Kbi8?S=XGM~dnQZQ!yBSgcZhY96H
zrWnU;k)qy`rX&&xlDyA%(a1Hhi5CWkmg(`Gb%m(HKi-7Z!LKGRP_B8@`7&hdDy5n=
z`OIxqxiVfX@OX1p(mQu>0Ai*v_cTMiw4qRt3~N<X{IYiJ3k=4u-u*nJEBnJ<OdI8P
zXmK`OYkO5a{YDhI3PL|H4m=p>Bvr9oBy0)r>w3p~V0SCm=An6@3n)>@z!|o-$HvDK
z|3D2ZMJkLE5loMKl6R^ez@Zz%S$&mbeoqH5`Bb){Ei21q&VP)hWS2tjShfFtGE+$z
zzCR$P#uktu+#!w)cX!<osOkId_HzAT-HD2N`M+v2l+zwdW%GgZbQMuhfE*hHjKXKg
z45hphqVETPDbX6wpTlZqza0gFaRGh`3L~0id)F6#%@9;w(e%PjzuD7@inuToA!B?F
zCMLJc!u{3N)s?lB-!11!GxXsCak8zQomO)gmn02f-5~OkMYnm~#jVwwYNw@LAv!KN
z-n`q1|AXw*TW>lWN1XU%K-r=s{|j?)Akf@q#3b#{6cZCuJ~gCxuMXRmI$nGtnH+-h
z+GEi!*X=AP<|fG`1>MBdTb?28JYc=fGvAi2I<$B(rs$;eoJCyR6_bc~p!XR@O-+sD
z=eH`-ye})I5ic1eL~TDmtfJ|8`0VJ*Yr=hNCd)G1p2MMz4C3^Mj?7;!w|Ly%JqmuW
zlIEW^Ft%z?*|fpXda>Jr^1noFZEwFgVV%|*XhH@acv8rdGxeEX{M$(vG{Zw+x(ei@
zmfXb22}8-?Fi`vo-YVrTH*C?a8%M=Hv9MqVH7H^J$KsD?>!SFZ;ZsvnHr_gn=7acz
z#W?0eCdVhVMWN12VV^$>WlQ?f;P^{(&pYTops|btm6aj>_Uz+hqpGwB)vWp0Cf5y<
zft8-je~nn?W11plq}N)4A{l8I7$!ks_x$PXW-2XaRFswX_BnF{R#6YIwMhAgd5F9X
zGmwdadS6(a^fjHtXg8=l?Rc0Sm%hk6E9!5cLVloEy4eh(=FwgP`)~I^5~pBEWo+F6
zSf2ncyMurJN91#cJTy_u8Y}@%!bq1RkGC~-bV@SXRd4F{R-*V<h953|jk-C&|3)z%
ze&_30-6q1)a0(!xXqA+_t(WC`HA_yYaW@>`bS+6;W5vZ(&+I<9$;-V|eNfLa5n-6%
z2(}&uGRF;p9<Q%jdNy1%e7XTzyu7EEQT(5LrnvX~T%K!IuDyHY`ZneVJu#k_1T)xA
z*yxB?y4!pOJ$DTZzBm|8OIQq+AtV25aJ+X5EJjAu><tfuiNClI2BSoMgqLnU-5t95
zEnZ(^g~1RgD=UMB({c;$HujG&%DqGF;5jI~roRT+(rOoS$6f6SsURIuAVkAeIVc~{
z5ZxLN%m%Z*Sg;qYCf3<$dE6~&w_!EBx%yl4YC~LH{FCFNRBc_I>2eS*sE*o<YSPrx
z{M(reV{~jKue#Y6ZOnqJia{fQc!UxV4m)l389OmzR6A3|2!i<P{r82jKw+zq4%@ny
znopi6g!1Vx9Bml#a~KdL2lp2C)qSTKIh43vgycQHfQb_I!kQY&p;X@Bz|{^SXsW1K
zP&Ag1CW_r6u5-4=s(W>R$@pexaqr*meB)VhmIg@h{uzkk$9~qh#cHhw#>O%)b@+(|
z^IQgqzuj~Sk(J;swEM-3TrJAPCq9k^^^`q{IItKBRXYe}e0Tdr=Huf7da3$l4<V=<
zfr<*%iOB1EY}w3tF2Cwl7a2OS&~Y-lu<s)T^9=OFU8(CeN|63BiMxf*)5gesGU<eZ
z9DigzL3+quZ1o2T<KAZbCGJx&lrl*e#}Dpv24bZO$B<Yoc5hbeP0y?@P%|Tvw||e%
zV#e^q0D2R_Oq_c+=x5vP&hg4k+df{o7$aNZCM>PdpwWDop%^}n;dD#K4s#DYA8SHZ
z&1!riV4W4R7R#C))JH1~axJ)RYnM$$lIR%6fIVA@zV{XVyx}C+a-Dt8Y9M)^KU0+H
zR4IUb2CJ{Hg>CuaXtD50jB(_Tcx=Z$^W<wsNa}nUDpNk$q{>Yu2u5kubqmwp%drJ6
z?Fo40g!Qd<-l=TQxqHEOuPX0;^z7iX?Ke^a%XT<13TA^5`4Xcw6D@Ur&VT&CUe0d}
z1GjOVF1^L@>O)l@?bD~$wzgf(nxX1OGD8fEV?TdJcZc2KoUe|oP1#=$$7ee|xbY)A
zDZq+cuTpc(fFdj^=!;{k03C69lMQ(|>uhRfRu%+!k&<F<Z7l=VnwB^RA&^APMi=Tn
zNc}%IJL~xB4OP6bJNwAw)~Ma2=UP0dhr%X=kco(it+@F<#$xrIJ8@|{Buh<)h`xg?
z_U}pe=3#zmDfdqE4`Hyn<!?&CfCp#GTeXo8;AYd1Opd&cXER8cBSOF3iFJ#XPgQVp
zZSiQf7V^Dt?ITs8aMF~e1hq0P@^oB)#byh|6q_7F1B%ST%WL~J?ySn>YOi-3|1QKB
z<?_cUy)V3go%%^l)lRa=dmY_XQG3Z{Q?52d$qG}vIe|EZbYEtrR$rh(iS)O#dU-(>
z?n?eq1XP>p-IM$Z^C;2L3itnbJZAip*Zo0aw2bs8@(s^~*8T9go!%dHcAz2lM;`yp
zD=7&xjFV$S&5uDaiScyD?B-i1ze`+CoRtz`Wn+Zl&#<i<Zx(`=7k~{%`w&<EbG}U<
z0%E^8lyw>s4&}MO{@N!ufrzjG$B79)Y2d3tBk&)TxUTw@<TSeYlROCt(gU?O((-qu
zs>QS0TEL_?njX|<LXnSC4TlId(D4W|GQ?L@$3Ue@N8p=l9-sC<=z(lPk;^sT(v2*k
zc~vp#Hp`mXjzhml2NMCh^h5)sqsan+jJ_l<a4w|G&ac02hrz8#6|kFraA~rta0}!q
zB4BE{QWY7MtxHn_xB);Avf#Lf<GG<A?Q3JVyc1p8^H}$8(Gn=_&F1!m8$sKyeue+L
z5&392wm+wO;_oXoTEJ=wxVXk}dt<d~CgUUMW_PPTe(c<7n18#mVaX)vK}-PzZq7<b
zNJbVTFaq(8ZLx|A*G$H3ugT2aVziDEGa66FVt@hr1|D{RWN6yuqlglM!rp^YG;UpG
zrnm?ek079l3VoOU^p%f^A9Yzn8IVX?^rB4LbgJ|PONhzM_0{P?S(V$OI=u5IBjfT#
z0ntLLnhg5$0fAHJaG6HCx4X7;RmvMtE}8C>@vq?Uz(nBFK5Pq7*xj#u*R&i|?7+6#
z+|r_n#SW&LXhtheZdah{ZVoqwyT{D>MC3nkFF#N)xLi{p7J1jXlmVeb;cP5?e(=f#
zuT7fv<Q3o5LspCx-RPkzLw{VsDE>jSbjS781v?7{)-X3*?>tq?)Yd)~|1{BDS(pqC
zC}~H#WXlkUW*H5CDOo<)#x7%RY)A;ShGhI5s*#cRDA8YgqG(HeKDx+#(ZQ?386dv!
zlXCO)w91~Vw4AmOcATuV653fa9R$fyK8<JV*@W33SMHM;w!OmUXar{O;_8j>ul%rG
z-<zwGD)!Y{+(T{%ob~79zpXX%zulyiy1_UHWvu@YqxKAK3e9GO6Os4R0Naujn>wfS
zihugoZyr38Im?Zuh6@RcF~t1anQu7>#lPpb#}4cOA!EM11`%f*07RqOVkmX{p~KJ9
z^zP;K#|)$`^Rb{rnH<AdQ^(;gQMcF>GH{~>1(fawV0*Z#)}M`m8-?ZJV<+e}s9wE#
z)l&az?w^5{)`S(%MRzxdNqrs1n*-=jS^_jqE*5XDrA0+VE`5^*p3CuM<&dZEeCjoz
zR;uu_H9ZPZV|fQq`Cyw4nscrVwi!fE6ciMmX$!_hN7uF;jjKG)d2@aC4ropY)8<!P
zAOHj?oPbjQ%hh|vE_1IMB)43eQpeLi&)R>etW=xJvni)8eHi`H$%#zn^WJ<U7gogJ
z?A^!2J~rI78JH|Ml2EldmKY5K8;Zx(FGcBdM<5oe#G+nd6dObqz@Af%%M*aBE_pn8
zXQo2`Bw*IwvP3#1Ev<%YocpBodO9+Rw~`5*CY3{L;qf1PxV!r%NI+#Q1f8GU7$~#U
zAA9$4&g-k=8BYi*3i@0^UX}nTQVyOf)8T(}G^Ti?Vqvk~bJRR#EAQ?u`dClPxk@{;
zxvO?%jSX`2{8|^z0*C5DR1Z^>5NLc-rqk|u&&4Z6fD_m&JfSI1Bvb?b<*n&sfl0^t
z=HnmRl`XrFvMKB%9}>PaA`m-fK6a0(8=qPkWS5bb4=v?XcWi&hRY?O5HdulRi4?fN
zlsJ*N-0Qw+Yic@s0(2uy%F@ib;GjXt01F<S%GT3P{Ck&Y*^gWuie`o_g+ZNDNIV|F
z)zEe|Ij*4$H1(<RLz0($;AD3VsUJwI@liw^?fh(V?VC`Sz7h`*Q<VYlhbHJ?&h+!W
zAJC)U;Lx_QRaSNFYbyi|7+MeNTgA+Znh}qFzf>mx5XbRo6+n|pP(&nodMoap^z{~q
ziEeaUT@Mxe3vJSfI6?uLND(CNr=#^W<1b}jzW58bIfyWTDle$mmS(|x-0|2UlX+9k
zQ<WB5+u#`K#~J$81)#?BuTOKLk|+S>^EX7Nw}?EzVoBfT(-LT|=9N@^hcn-_p&sqG
z&*oVs2JSU+N4ZD`FhCAWaS;>|wH2G*Id|?pa#@>tyxX`+4HyIArWDvVrX)2WAOQff
z0qyHu&-S@i^MS-+j--!pr4fPBj~_8({~e1bfcl0wI1kaoN>mJL6KUPQm5N7lB(ui1
zE-o%kq)&djzWJ}ob<-GfDlkB;F31j-VHKvQUGQ3sp`CwyGJk_i!y^sD0fqC@$9|jO
zOqN!r!8-p==F@ZVP=U$qSpY(gQ0)59P1&t@y?5rvg<}E+GB}2<F#{*|k0E{$&_`~)
zkzDcsi#!7r<a8lPUCMj?{CK;e|9#-xj>6NYPp4f2YFQrQtot5mn3wu_qprZ=>Ig-$
zbW26Ws~IgY>}^5w`vTB(G`PTZaDiGBo5o(tp)qli|NeV(<Rzgqw(Ze!7hEGKKx((>
z@H_=R8V39rt5J5YB2Ky?4eJJ#b`_iBe2ot~6%7mLt5t8Vwi^Jy7|jWXqa3amOIoRb
zOr}WVFP--DsS`1WpN%~)t3R!arKF^Q$e12KEqU36AWwnCBICpH4XCsfnyrHr>$I$4
z!DpKX$OKLWarN7nv@!uIA+~RNO)l$$w}p(;b>mx8pwYvu;dD_unryX_N<mojSrG!`
zgkqx4t<XLL6ZUppvjeV9PJ6dG>hT8*Tj>BTrTTL&!?O+%Rv;b?B??gSzdp?6Uug9{
zd@V08Z$BdI?fpoCS$)t4mg4rT8Q_I}h`0d-vYZ^|dOB*Q^S|xqTV*<bTMtKO;_Z)R
zRSOJrd5TFO0aO%#%-sNa{`SiQ^$$1^@oUd&^lB{MKZ>vIg?@fVFSmMpaw0qtTRbx}
z({Pg?#{2`sc9)M5N$*N|4;^t$+Q<Wh^yJ?FzJ*$wiB{j;CMy+?m1MbsJo0ubR==f9
z>P?#mo<zt1E6=Ilm*nbmYmpxfAj7*m*Wh@=7|;!%(-pviW`nuCktLveet9^$*y^>v
zGVC@I*lBVrOU-%2y!7%)fAKjpEFsgQc4{amtiHb95KQEwvf<(3T<9-Zm$xIew#P22
zc2Ix|App^>v6(3L_MCU0d3W##AB<Fxl*nmny6_RsCu%1a)so}}uCXSzwY70Y@uTxK
z=5nu(N*2HDbrIcL)t_*{*84mvmV_Y9<vxHJJ;0gUdNjyW)jDb|@|j*qR8;gsMa5Ir
z02fHv=%xyN9VLv_ZLL4y|F*p$S^^T47m{aok5{rmM{$s7^Xu2!_fo1$IG6kkG_Tgx
zFfjPm3;g>0M~3D00EWoKZqsJYT(#@w$Y_H7G22M~ApVFTRHMI_3be)Lkn#0F*V8Pq
zc}`Cjy$bE;FJ6H7p=0y#R>`}-m4(0F>%@P|?7fx{=R^uFdISRnZ2W_xQhD{YuR3t<
z{6yxu=4~JkeA;|(J6_nv#>Nvs&FuLA&PW^he@t(UwFFE8)|a!R{`E`K`i^ZnyE4$k
z;(749Ix|oi$c3QbEJ3b~D_kQsPz~fIUKym($a_7dJ?o+40*OLl^{=&oq$<#Q(yyrp
z{J-FAniyAw9tPbe&IhQ|a`DqFTVQGQ&Gq3!C2==4x{6EJwiPZ8zub-iXoUtkJiG{}
zPaR&}_fn8_z~(=;5lD-aPWD3z8PZS@AaUiomF!G8I}Mf>e~0g#BelA-5#`cj;O5>N
Xviia!U7SGha1wx#SCgwmn*{w2TRX*I

literal 0
HcmV?d00001

diff --git a/document_ocr/tests/__init__.py b/document_ocr/tests/__init__.py
new file mode 100644
index 00000000..7bdf742c
--- /dev/null
+++ b/document_ocr/tests/__init__.py
@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+# © 2016 Therp BV <http://therp.nl>
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+from . import test_document_ocr
diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py
new file mode 100644
index 00000000..b1695da8
--- /dev/null
+++ b/document_ocr/tests/test_document_ocr.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+# © 2016 Therp BV <http://therp.nl>
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+from StringIO import StringIO
+
+from PIL import Image, ImageDraw, ImageFont
+from openerp.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE
+from openerp.tests.common import TransactionCase
+
+
+class TestDocumentOcr(TransactionCase):
+    def test_document_ocr(self):
+        self.env['ir.config_parameter'].set_param(
+            'document_ocr.synchronous', 'True')
+        test_image = Image.new('RGB', (200, 30))
+        draw = ImageDraw.Draw(test_image)
+        draw.text((3, 3), "Hello world", font=ImageFont.truetype(
+            '/usr/share/fonts/truetype/inconsolata/Inconsolata.otf', 24))
+        # test a plain image
+        data = StringIO()
+        test_image.save(data, 'png')
+        result = self.env['ir.attachment']._index(
+            data.getvalue(), 'test.png', None)
+        self.assertEqual(result[1].strip(), 'Hello world')
+        # should also work for pdfs
+        data = StringIO()
+        test_image.save(data, 'pdf', resolution=300)
+        result = self.env['ir.attachment']._index(
+            data.getvalue(), 'test.pdf', None)
+        self.assertEqual(result[1].strip(), 'Hello world')
+        # check cron
+        self.env['ir.config_parameter'].set_param(
+            'document_ocr.synchronous', 'False')
+        attachment = self.env['ir.attachment'].create({
+            'name': 'testattachment',
+            'datas': data.getvalue().encode('base64'),
+        })
+        self.assertEqual(attachment.index_content, _MARKER_PHRASE)
+        attachment._ocr_cron()
+        self.assertEqual(attachment.index_content.strip(), 'Hello world')
+        # and for an unreadable image, we expect an error
+        self.env['ir.config_parameter'].set_param(
+            'document_ocr.synchronous', 'True')
+        data = StringIO()
+        test_image = Image.new('1', (200, 30))
+        test_image.save(data, 'Palm')
+        result = self.env['ir.attachment']._index(
+            data.getvalue(), 'test.palm', None)
+        self.assertEqual(result[1], None)

From a58c40621cec20937b68cc943218d3a563f51228 Mon Sep 17 00:00:00 2001
From: Carlos Almeida <carlos.almeida@tkobr.com>
Date: Thu, 1 Jun 2017 20:03:58 +0100
Subject: [PATCH 02/16] [MIG] Migration of document_ocr module to 10.0

---
 document_ocr/README.rst                   |  15 ++
 document_ocr/__init__.py                  |   1 +
 document_ocr/__manifest__.py              |   6 +-
 document_ocr/data/ir_config_parameter.xml |   8 +
 document_ocr/models/__init__.py           |   1 +
 document_ocr/models/ir_attachment.py      | 263 ++++++++++++++++++----
 document_ocr/tests/__init__.py            |   1 +
 document_ocr/tests/test_document_ocr.py   |   5 +-
 document_ocr/views/ir_attachment_view.xml |  43 ++++
 9 files changed, 293 insertions(+), 50 deletions(-)
 create mode 100644 document_ocr/views/ir_attachment_view.xml

diff --git a/document_ocr/README.rst b/document_ocr/README.rst
index 7f9c3b28..c500f0b1 100644
--- a/document_ocr/README.rst
+++ b/document_ocr/README.rst
@@ -39,6 +39,21 @@ This is because the recognition process takes a while and you don't want to make
 The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ` `Settings``.
 In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``.
 
+
+By default, the recognition language is set to English.
+In case you want to use a different default, set the configuration parameter ``document_ocr.language`` to the respective language code, e.g. ``por`` for Portuguese.
+
+
+For PDFs, OCR runs after the document has been converted to an image. Note that OCR is applied to all PDFs.
+
+
+System parameters used:
+
+* ``document_ocr.synchronous``: bool
+* ``document_ocr.language``: string
+* ``document_ocr.dpi``: integer
+* ``document_ocr.quality``: integer
+
 .. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas
     :alt: Try me on Runbot
     :target: https://runbot.odoo-community.org/runbot/118/10.0
diff --git a/document_ocr/__init__.py b/document_ocr/__init__.py
index 7eda98a2..472456b6 100644
--- a/document_ocr/__init__.py
+++ b/document_ocr/__init__.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
 # © 2016 Therp BV <http://therp.nl>
+# © 2017 ThinkOpen Solutions <https://tkobr.com>
 # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
 from . import models
diff --git a/document_ocr/__manifest__.py b/document_ocr/__manifest__.py
index 382e77d6..39d783d1 100644
--- a/document_ocr/__manifest__.py
+++ b/document_ocr/__manifest__.py
@@ -2,9 +2,9 @@
 # © 2016 Therp BV <http://therp.nl>
 # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
 {
-    "name": "OCR for documents",
+    "name": "OCR for Documents",
     "version": "10.0.1.0.0",
-    "author": "Therp BV,Odoo Community Association (OCA), TKO Brasil",
+    "author": "Therp BV, Odoo Community Association (OCA), ThinkOpen Solutions Brasil",
     "license": "AGPL-3",
     "category": "Knowledge Management",
     "summary": "Run character recognition on uploaded files",
@@ -14,10 +14,12 @@
     "data": [
         "data/ir_cron.xml",
         "data/ir_config_parameter.xml",
+        "views/ir_attachment_view.xml",
     ],
     "external_dependencies": {
         'bin': [
             'tesseract',
+            'convert',
         ],
     },
 }
diff --git a/document_ocr/data/ir_config_parameter.xml b/document_ocr/data/ir_config_parameter.xml
index e46db18a..721a0740 100644
--- a/document_ocr/data/ir_config_parameter.xml
+++ b/document_ocr/data/ir_config_parameter.xml
@@ -9,5 +9,13 @@
             <field name="key">document_ocr.dpi</field>
             <field name="value">300</field>
         </record>
+        <record id="param_quality" model="ir.config_parameter">
+            <field name="key">document_ocr.quality</field>
+            <field name="value">100</field>
+        </record>
+        <record id="param_language" model="ir.config_parameter">
+            <field name="key">document_ocr.language</field>
+            <field name="value">eng</field>
+        </record>
     </data>
 </openerp>
diff --git a/document_ocr/models/__init__.py b/document_ocr/models/__init__.py
index a15f1b21..051b3ddf 100644
--- a/document_ocr/models/__init__.py
+++ b/document_ocr/models/__init__.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
 # © 2016 Therp BV <http://therp.nl>
+# © 2017 ThinkOpen Solutions <https://tkobr.com>
 # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
 from . import ir_attachment
diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py
index b27992c8..f28e1fc9 100644
--- a/document_ocr/models/ir_attachment.py
+++ b/document_ocr/models/ir_attachment.py
@@ -1,85 +1,256 @@
 # -*- coding: utf-8 -*-
 # © 2016 Therp BV <http://therp.nl>
+# © 2017 ThinkOpen Solutions <https://tkobr.com>
 # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+import io
 import logging
 import subprocess
 from StringIO import StringIO
 
-from PIL import Image
-from openerp import api, models
+import pyPdf
+from odoo import api, fields, models
+from odoo.exceptions import UserError
 
 _logger = logging.getLogger(__name__)
 _MARKER_PHRASE = '[[waiting for OCR]]'
+OCR_LANGUAGE = [('afr', 'Afrikaans'),
+                ('amh', 'Amharic'),
+                ('ara', 'Arabic'),
+                ('asm', 'Assamese'),
+                ('aze', 'Azerbaijani'),
+                ('aze_cyrl', 'Azerbaijani - Cyrilic'),
+                ('bel', 'Belarusian'),
+                ('ben', 'Bengali'),
+                ('bod', 'Tibetan'),
+                ('bos', 'Bosnian'),
+                ('bul', 'Bulgarian'),
+                ('cat', 'Catalan; Valencian'),
+                ('ceb', 'Cebuano'),
+                ('ces', 'Czech'),
+                ('chi_sim', 'Chinese - Simplified'),
+                ('chi_tra', 'Chinese - Traditional'),
+                ('chr', 'Cherokee'),
+                ('cym', 'Welsh'),
+                ('dan', 'Danish'),
+                ('dan_frak', 'Danish - Fraktur'),
+                ('deu', 'German'),
+                ('deu_frak', 'German - Fraktur'),
+                ('dzo', 'Dzongkha'),
+                ('ell', 'Greek, Modern (1453-)'),
+                ('eng', 'English'),
+                ('enm', 'English, Middle (1100-1500)'),
+                ('epo', 'Esperanto'),
+                ('equ', 'Math / equation detection module'),
+                ('est', 'Estonian'),
+                ('eus', 'Basque'),
+                ('fas', 'Persian'),
+                ('fin', 'Finnish'),
+                ('fra', 'French'),
+                ('frk', 'Frankish'),
+                ('frm', 'French, Middle (ca.1400-1600)'),
+                ('gle', 'Irish'),
+                ('glg', 'Galician'),
+                ('grc', 'Greek, Ancient (to 1453)'),
+                ('guj', 'Gujarati'),
+                ('hat', 'Haitian; Haitian Creole'),
+                ('heb', 'Hebrew'),
+                ('hin', 'Hindi'),
+                ('hrv', 'Croatian'),
+                ('hun', 'Hungarian'),
+                ('iku', 'Inuktitut'),
+                ('ind', 'Indonesian'),
+                ('isl', 'Icelandic'),
+                ('ita', 'Italian'),
+                ('ita_old', 'Italian - Old'),
+                ('jav', 'Javanese'),
+                ('jpn', 'Japanese'),
+                ('kan', 'Kannada'),
+                ('kat', 'Georgian'),
+                ('kat_old', 'Georgian - Old'),
+                ('kaz', 'Kazakh'),
+                ('khm', 'Central Khmer'),
+                ('kir', 'Kirghiz; Kyrgyz'),
+                ('kor', 'Korean'),
+                ('kur', 'Kurdish'),
+                ('lao', 'Lao'),
+                ('lat', 'Latin'),
+                ('lav', 'Latvian'),
+                ('lit', 'Lithuanian'),
+                ('mal', 'Malayalam'),
+                ('mar', 'Marathi'),
+                ('mkd', 'Macedonian'),
+                ('mlt', 'Maltese'),
+                ('msa', 'Malay'),
+                ('mya', 'Burmese'),
+                ('nep', 'Nepali'),
+                ('nld', 'Dutch; Flemish'),
+                ('nor', 'Norwegian'),
+                ('ori', 'Oriya'),
+                ('osd', 'Orientation and script detection module'),
+                ('pan', 'Panjabi; Punjabi'),
+                ('pol', 'Polish'),
+                ('por', 'Portuguese'),
+                ('pus', 'Pushto; Pashto'),
+                ('ron', 'Romanian; Moldavian; Moldovan'),
+                ('rus', 'Russian'),
+                ('san', 'Sanskrit'),
+                ('sin', 'Sinhala; Sinhalese'),
+                ('slk', 'Slovak'),
+                ('slk_frak', 'Slovak - Fraktur'),
+                ('slv', 'Slovenian'),
+                ('spa', 'Spanish; Castilian'),
+                ('spa_old', 'Spanish; Castilian - Old'),
+                ('sqi', 'Albanian'),
+                ('srp', 'Serbian'),
+                ('srp_latn', 'Serbian - Latin'),
+                ('swa', 'Swahili'),
+                ('swe', 'Swedish'),
+                ('syr', 'Syriac'),
+                ('tam', 'Tamil'),
+                ('tel', 'Telugu'),
+                ('tgk', 'Tajik'),
+                ('tgl', 'Tagalog'),
+                ('tha', 'Thai'),
+                ('tir', 'Tigrinya'),
+                ('tur', 'Turkish'),
+                ('uig', 'Uighur; Uyghur'),
+                ('ukr', 'Ukrainian'),
+                ('urd', 'Urdu'),
+                ('uzb', 'Uzbek'),
+                ('uzb_cyrl', 'Uzbek - Cyrilic'),
+                ('vie', 'Vietnamese'),
+                ('yid', 'Yiddish'), ]
 
 
 class IrAttachment(models.Model):
     _inherit = 'ir.attachment'
 
+    language = fields.Selection(OCR_LANGUAGE, 'Language')
+    # We need to redefine index_content field to be able to update it
+    # on the onchange_language()
+    index_content = fields.Text('Indexed Content', readonly=False, prefetch=False)
+    index_content_rel = fields.Text(related='index_content', string='Indexed Content Rel')
+
+    @api.onchange('language')
+    def onchange_language(self):
+        process = subprocess.Popen(['tesseract', '--list-langs'],
+                                   stdout=subprocess.PIPE,
+                                   stderr=subprocess.PIPE)
+        stdout, stderr = process.communicate()
+        if self.language not in stderr.split('\n'):
+            raise UserError(
+                "Language not installed."
+                " Please ask your system administrator to"
+                " install tesseract '%s' language." %
+                self.language)
+        if self.store_fname:
+            bin_data = self._file_read(self.store_fname)
+        else:
+            bin_data = self.db_datas
+        index_content = self._index(
+            bin_data.decode('base64'), self.datas_fname, self.mimetype)
+        return {'value': {
+            'index_content': index_content}}
+
     @api.model
-    def _index(self, data, datas_fname, file_type):
-        mimetype, content = super(IrAttachment, self)._index(
-            data, datas_fname, file_type)
+    def _index(self, bin_data, datas_fname, mimetype):
+        if not self.language:
+            # Set default language
+            self.language = self.env['ir.config_parameter'].get_param(
+                'document_ocr.language', 'eng')
+        content = super(IrAttachment, self)._index(
+            bin_data, datas_fname, mimetype)
         if not content or content == 'image':
             has_synchr_param = self.env['ir.config_parameter'].get_param(
                 'document_ocr.synchronous', 'False') == 'True'
             has_force_flag = self.env.context.get('document_ocr_force')
-            if has_synchr_param or has_force_flag:
-                content = self._index_ocr(mimetype, data, datas_fname,
-                                          file_type)
+            synchr = has_synchr_param or has_force_flag
+            if synchr:
+                content = self._index_ocr(bin_data)
             else:
                 content = _MARKER_PHRASE
+        return content
 
-        return mimetype, content
-
-    @api.model
-    def _index_ocr(self, mimetype, data, datas_fname, file_type):
-        dpi = int(
-            self.env['ir.config_parameter'].get_param(
-                'document_ocr.dpi', '500'))
-        top_type, sub_type = mimetype.split('/', 1)
-        if hasattr(self, '_index_ocr_get_data_%s' % sub_type):
-            image_data = getattr(self, '_index_ocr_get_data_%s' % sub_type)(
-                data, datas_fname, file_type, dpi)
-        else:
-            image_data = StringIO()
-            try:
-                Image.open(StringIO(data)).save(image_data, 'tiff',
-                                                dpi=(dpi, dpi))
-            except IOError:
-                _logger.exception('Failed to OCR image')
-                return None
+    def _index_ocr(self, bin_data):
         process = subprocess.Popen(
-            ['tesseract', 'stdin', 'stdout'],
+            ['tesseract', 'stdin', 'stdout', '-l', self.language],
             stdin=subprocess.PIPE, stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
         )
-        stdout, stderr = process.communicate(image_data.getvalue())
+        stdout, stderr = process.communicate(bin_data)
         if stderr:
             _logger.error('Error during OCR: %s', stderr)
         return stdout
 
-    @api.model
-    def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi):
-        process = subprocess.Popen(
-            ['convert', '-density', str(dpi), '-', '-append', 'png32:-'],
-            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-        )
-        stdout, stderr = process.communicate(data)
-        if stderr:
-            _logger.error('Error converting to PDF: %s', stderr)
-        return StringIO(stdout)
+    def _index_pdf(self, bin_data):
+
+        def convert_bin_to_image(self, bin_data):
+            dpi = int(self.env['ir.config_parameter'].get_param(
+                'document_ocr.dpi', '500'))
+            quality = int(self.env['ir.config_parameter'].get_param(
+                'document_ocr.quality', '100'))
+            process = subprocess.Popen(
+                ['convert', '-density', str(dpi),
+                 '-quality', str(quality),
+                 '-', '-append', 'png32:-'],
+                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE)
+            stdout, stderr = process.communicate(bin_data)
+            if stderr:
+                _logger.error('Error converting PDF to image: %s', stderr)
+            return stdout
+
+        def _convert_pdf_page_to_image(self, pdf, pagenum):
+            dst_pdf = pyPdf.PdfFileWriter()
+            dst_pdf.addPage(pdf.getPage(pagenum))
+            pdf_bytes = io.BytesIO()
+            dst_pdf.write(pdf_bytes)
+            pdf_bytes.seek(0)
+            return convert_bin_to_image(self, pdf_bytes.read())
+
+        has_synchr_param = self.env['ir.config_parameter'].get_param(
+            'document_ocr.synchronous', 'False') == 'True'
+        has_force_flag = self.env.context.get('document_ocr_force')
+        synchr = has_synchr_param or has_force_flag
+        if synchr:
+            buf = super(IrAttachment, self)._index_pdf(bin_data)
+            if len(buf.split('\n')) < 2 and bin_data.startswith('%PDF-'):
+                # If we got less than 2 lines, run OCR and append to existent text
+                try:
+                    f = StringIO(bin_data)
+                    pdf = pyPdf.PdfFileReader(f)
+                    if pdf.getNumPages() > 1:
+                        for pagenum in range(0, pdf.getNumPages()):
+                            _logger.info('OCR PDF "%s" page %d/%d...',
+                                         self.datas_fname,
+                                         pagenum + 1,
+                                         pdf.getNumPages())
+                            pdf_image = _convert_pdf_page_to_image(self, pdf,
+                                                                   pagenum)
+                            index_content = self._index_ocr(pdf_image)
+                            buf = u'%s\n-- %d --\n%s' % (
+                                buf, pagenum + 1, index_content.decode('utf8'))
+                    else:
+                        _logger.info('OCR PDF "%s"...', self.datas_fname)
+                        pdf_image = convert_bin_to_image(self, bin_data)
+                        index_content = self._index_ocr(pdf_image)
+                        buf = u'%s\n%s' % (buf, index_content.decode('utf8'))
+                except Exception as e:
+                    _logger.error('Error converting PDF to image: %s', e)
+                    pass
+        else:
+            buf = _MARKER_PHRASE
+        return buf
 
     @api.model
     def _ocr_cron(self):
-        for this in self.with_context(document_ocr_force=True).search([
-            ('index_content', '=', _MARKER_PHRASE),
-        ]):
+        for this in self.with_context(document_ocr_force=True).search(
+                [('index_content', '=', _MARKER_PHRASE)]):
             if not this.datas:
                 continue
-            file_type, index_content = this._index(
-                this.datas.decode('base64'), this.datas_fname, this.file_type)
+            index_content = this._index(
+                this.datas.decode('base64'), this.datas_fname, this.mimetype)
             this.write({
-                'file_type': file_type,
                 'index_content': index_content,
             })
diff --git a/document_ocr/tests/__init__.py b/document_ocr/tests/__init__.py
index 7bdf742c..7efb2857 100644
--- a/document_ocr/tests/__init__.py
+++ b/document_ocr/tests/__init__.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
 # © 2016 Therp BV <http://therp.nl>
+# © 2017 ThinkOpen Solutions <https://tkobr.com>
 # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
 from . import test_document_ocr
diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py
index b1695da8..e54a6ac0 100644
--- a/document_ocr/tests/test_document_ocr.py
+++ b/document_ocr/tests/test_document_ocr.py
@@ -1,11 +1,12 @@
 # -*- coding: utf-8 -*-
 # © 2016 Therp BV <http://therp.nl>
+# © 2017 ThinkOpen Solutions <https://tkobr.com>
 # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
 from StringIO import StringIO
 
 from PIL import Image, ImageDraw, ImageFont
-from openerp.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE
-from openerp.tests.common import TransactionCase
+from odoo.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE
+from odoo.tests.common import TransactionCase
 
 
 class TestDocumentOcr(TransactionCase):
diff --git a/document_ocr/views/ir_attachment_view.xml b/document_ocr/views/ir_attachment_view.xml
new file mode 100644
index 00000000..ed171d61
--- /dev/null
+++ b/document_ocr/views/ir_attachment_view.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="utf-8"?>
+<odoo>
+    <!-- Attachment -->
+    <record id="view_attachment_form" model="ir.ui.view">
+        <field name="model">ir.attachment</field>
+        <field name="inherit_id" ref="base.view_attachment_form"/>
+        <field name="arch" type="xml">
+            <xpath expr="(//sheet/group/group)[last()]" position="attributes">
+                <attribute name="invisible">1</attribute>
+            </xpath>
+            <xpath expr="(//sheet/group/group)[last()]" position="before">
+                <group groups="base.group_no_one" string="Indexed Content" colspan="4">
+                <field name="index_content_rel" readonly="1" nolabel="1"/>
+                </group>
+            </xpath>
+            <field name="mimetype" position="after">
+                <field name="store_fname" invisible="1"/>
+                <field name="language"/>
+            </field>
+        </field>
+    </record>
+    <record id="view_attachment_tree" model="ir.ui.view">
+        <field name="model">ir.attachment</field>
+        <field name="inherit_id" ref="base.view_attachment_tree"/>
+        <field name="arch" type="xml">
+            <field name="type" position="after">
+                <field name="language"/>
+            </field>
+        </field>
+    </record>
+    <record id="view_attachment_search" model="ir.ui.view">
+        <field name="model">ir.attachment</field>
+        <field name="inherit_id" ref="base.view_attachment_search"/>
+        <field name="arch" type="xml">
+            <field name="name" position="after">
+                <field name="language"/>
+            </field>
+            <filter name="owner" position="after">
+                <filter string="Language" domain="[]" context="{'group_by':'language'}" groups="base.group_no_one"/>
+            </filter>
+        </field>
+    </record>
+</odoo>

From caf585626671545005775e4a284af2408be8ef3c Mon Sep 17 00:00:00 2001
From: Carlos Almeida <carlos.almeida@tkobr.com>
Date: Fri, 2 Jun 2017 10:03:34 +0100
Subject: [PATCH 03/16] Fixes Flake8 errors

Add requirements.txt for tesseract
---
 document_ocr/__manifest__.py            |  4 +++-
 document_ocr/models/ir_attachment.py    | 18 +++++++++++-------
 document_ocr/tests/test_document_ocr.py |  2 +-
 requirements.txt                        |  1 +
 4 files changed, 16 insertions(+), 9 deletions(-)
 create mode 100644 requirements.txt

diff --git a/document_ocr/__manifest__.py b/document_ocr/__manifest__.py
index 39d783d1..ad012794 100644
--- a/document_ocr/__manifest__.py
+++ b/document_ocr/__manifest__.py
@@ -4,7 +4,9 @@
 {
     "name": "OCR for Documents",
     "version": "10.0.1.0.0",
-    "author": "Therp BV, Odoo Community Association (OCA), ThinkOpen Solutions Brasil",
+    "author": "Therp BV,"
+              " Odoo Community Association (OCA),"
+              " ThinkOpen Solutions Brasil",
     "license": "AGPL-3",
     "category": "Knowledge Management",
     "summary": "Run character recognition on uploaded files",
diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py
index f28e1fc9..ef683a37 100644
--- a/document_ocr/models/ir_attachment.py
+++ b/document_ocr/models/ir_attachment.py
@@ -9,7 +9,7 @@ import subprocess
 from StringIO import StringIO
 
 import pyPdf
-from odoo import api, fields, models
+from odoo import api, fields, models, _
 from odoo.exceptions import UserError
 
 _logger = logging.getLogger(__name__)
@@ -126,11 +126,14 @@ OCR_LANGUAGE = [('afr', 'Afrikaans'),
 class IrAttachment(models.Model):
     _inherit = 'ir.attachment'
 
-    language = fields.Selection(OCR_LANGUAGE, 'Language')
+    language = fields.Selection(OCR_LANGUAGE, _('Language'))
     # We need to redefine index_content field to be able to update it
     # on the onchange_language()
-    index_content = fields.Text('Indexed Content', readonly=False, prefetch=False)
-    index_content_rel = fields.Text(related='index_content', string='Indexed Content Rel')
+    index_content = fields.Text(_('Indexed Content'),
+                                readonly=False,
+                                prefetch=False)
+    index_content_rel = fields.Text(related='index_content',
+                                    string=_('Indexed Content Rel'))
 
     @api.onchange('language')
     def onchange_language(self):
@@ -139,11 +142,11 @@ class IrAttachment(models.Model):
                                    stderr=subprocess.PIPE)
         stdout, stderr = process.communicate()
         if self.language not in stderr.split('\n'):
-            raise UserError(
+            raise UserError(_(
                 "Language not installed."
                 " Please ask your system administrator to"
                 " install tesseract '%s' language." %
-                self.language)
+                self.language))
         if self.store_fname:
             bin_data = self._file_read(self.store_fname)
         else:
@@ -216,7 +219,8 @@ class IrAttachment(models.Model):
         if synchr:
             buf = super(IrAttachment, self)._index_pdf(bin_data)
             if len(buf.split('\n')) < 2 and bin_data.startswith('%PDF-'):
-                # If we got less than 2 lines, run OCR and append to existent text
+                # If we got less than 2 lines,
+                # run OCR anyway and append to existent text
                 try:
                     f = StringIO(bin_data)
                     pdf = pyPdf.PdfFileReader(f)
diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py
index e54a6ac0..10c253c6 100644
--- a/document_ocr/tests/test_document_ocr.py
+++ b/document_ocr/tests/test_document_ocr.py
@@ -5,7 +5,7 @@
 from StringIO import StringIO
 
 from PIL import Image, ImageDraw, ImageFont
-from odoo.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE
+from models.ir_attachment import _MARKER_PHRASE
 from odoo.tests.common import TransactionCase
 
 
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..943dea2b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+tesseract
\ No newline at end of file

From ca0eed717a31b5593db38159b1ba411c1b072ea1 Mon Sep 17 00:00:00 2001
From: Carlos Almeida <carlos.almeida@tkobr.com>
Date: Fri, 2 Jun 2017 10:14:40 +0100
Subject: [PATCH 04/16] Remove _() from fields

---
 document_ocr/models/ir_attachment.py    | 6 +++---
 document_ocr/tests/test_document_ocr.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py
index ef683a37..7247d3e4 100644
--- a/document_ocr/models/ir_attachment.py
+++ b/document_ocr/models/ir_attachment.py
@@ -126,14 +126,14 @@ OCR_LANGUAGE = [('afr', 'Afrikaans'),
 class IrAttachment(models.Model):
     _inherit = 'ir.attachment'
 
-    language = fields.Selection(OCR_LANGUAGE, _('Language'))
+    language = fields.Selection(OCR_LANGUAGE, 'Language')
     # We need to redefine index_content field to be able to update it
     # on the onchange_language()
-    index_content = fields.Text(_('Indexed Content'),
+    index_content = fields.Text('Indexed Content',
                                 readonly=False,
                                 prefetch=False)
     index_content_rel = fields.Text(related='index_content',
-                                    string=_('Indexed Content Rel'))
+                                    string='Indexed Content Rel')
 
     @api.onchange('language')
     def onchange_language(self):
diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py
index 10c253c6..8a72a9d9 100644
--- a/document_ocr/tests/test_document_ocr.py
+++ b/document_ocr/tests/test_document_ocr.py
@@ -5,7 +5,7 @@
 from StringIO import StringIO
 
 from PIL import Image, ImageDraw, ImageFont
-from models.ir_attachment import _MARKER_PHRASE
+from document_ocr.models.ir_attachment import _MARKER_PHRASE
 from odoo.tests.common import TransactionCase
 
 

From 324a53d4c144cc1be8aaa50a804764b80424825e Mon Sep 17 00:00:00 2001
From: Carlos Almeida <carlos.almeida@tkobr.com>
Date: Fri, 2 Jun 2017 10:18:10 +0100
Subject: [PATCH 05/16] Fix requirements tesseract dependency library name

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 943dea2b..3f0cdf37 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
-tesseract
\ No newline at end of file
+pytesseract
\ No newline at end of file

From 125d76d37d860271eac3b770a495d7fa40a1ce35 Mon Sep 17 00:00:00 2001
From: Carlos Almeida <carlos.almeida@tkobr.com>
Date: Fri, 2 Jun 2017 10:28:57 +0100
Subject: [PATCH 06/16] Fix attachment import

---
 document_ocr/tests/test_document_ocr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py
index 8a72a9d9..393dbfa4 100644
--- a/document_ocr/tests/test_document_ocr.py
+++ b/document_ocr/tests/test_document_ocr.py
@@ -5,7 +5,7 @@
 from StringIO import StringIO
 
 from PIL import Image, ImageDraw, ImageFont
-from document_ocr.models.ir_attachment import _MARKER_PHRASE
+from ir_attachment import _MARKER_PHRASE
 from odoo.tests.common import TransactionCase
 
 

From 87b07980544cfddf0780f1cc4c78604df78aed75 Mon Sep 17 00:00:00 2001
From: Carlos Almeida <carlos.almeida@tkobr.com>
Date: Fri, 2 Jun 2017 10:32:36 +0100
Subject: [PATCH 07/16] Fix requirements

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 3f0cdf37..3a0b698d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
-pytesseract
\ No newline at end of file
+ tesseract-ocr
+ tesseract-ocr-eng

From 5a3a2e8d60f85f7d9af59dcfb98839317e19ecef Mon Sep 17 00:00:00 2001
From: Carlos Almeida <carlos.almeida@tkobr.com>
Date: Fri, 2 Jun 2017 10:39:29 +0100
Subject: [PATCH 08/16] Fix import reference

---
 document_ocr/tests/test_document_ocr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py
index 393dbfa4..9765e291 100644
--- a/document_ocr/tests/test_document_ocr.py
+++ b/document_ocr/tests/test_document_ocr.py
@@ -5,7 +5,7 @@
 from StringIO import StringIO
 
 from PIL import Image, ImageDraw, ImageFont
-from ir_attachment import _MARKER_PHRASE
+from ..models.ir_attachment import _MARKER_PHRASE
 from odoo.tests.common import TransactionCase
 
 

From 94cfc53685cab12a9a6aca2e957b4ce060c9ddac Mon Sep 17 00:00:00 2001
From: Carlos Almeida <carlos.almeida@tkobr.com>
Date: Mon, 5 Jun 2017 15:45:23 +0100
Subject: [PATCH 09/16] Update travis.yml

---
 .travis.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index 7c091932..0c92807d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,6 +6,10 @@ addons:
     packages:
       - expect-dev  # provides unbuffer utility
       - python-lxml # because pip installation is slow
+      - tesseract-ocr # document_ocr
+      - tesseract-ocr-eng # document_ocr
+      - imagemagick # document_ocr
+      - fonts-inconsolata # document_ocr (for tests only)
 
 language: python
 

From 0c2740de22b56fc5e77dd9d837e8f0ebfc8afcee Mon Sep 17 00:00:00 2001
From: Carlos Almeida <carlos.almeida@tkobr.com>
Date: Mon, 5 Jun 2017 15:50:19 +0100
Subject: [PATCH 10/16] Remove unnecessary requirements.txt file, dependency
 added as package in travis.yml

---
 requirements.txt | 2 --
 1 file changed, 2 deletions(-)
 delete mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 3a0b698d..00000000
--- a/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
- tesseract-ocr
- tesseract-ocr-eng

From 21526bd236b7b4a3054aaee65ce8c3e7b28a3dbf Mon Sep 17 00:00:00 2001
From: Carlos Almeida <carlos.almeida@tkobr.com>
Date: Tue, 6 Jun 2017 11:48:55 +0100
Subject: [PATCH 11/16] Make tests work on 10.0

Fix small bugs
---
 document_ocr/models/ir_attachment.py    | 19 ++++++++++---------
 document_ocr/tests/test_document_ocr.py | 23 +++++++++++++----------
 2 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py
index 7247d3e4..efbd9b18 100644
--- a/document_ocr/models/ir_attachment.py
+++ b/document_ocr/models/ir_attachment.py
@@ -126,7 +126,10 @@ OCR_LANGUAGE = [('afr', 'Afrikaans'),
 class IrAttachment(models.Model):
     _inherit = 'ir.attachment'
 
-    language = fields.Selection(OCR_LANGUAGE, 'Language')
+    language = fields.Selection(OCR_LANGUAGE, 'Language',
+                                default=lambda self:
+                                self.env['ir.config_parameter'].get_param(
+                                    'document_ocr.language', 'eng'))
     # We need to redefine index_content field to be able to update it
     # on the onchange_language()
     index_content = fields.Text('Indexed Content',
@@ -151,17 +154,15 @@ class IrAttachment(models.Model):
             bin_data = self._file_read(self.store_fname)
         else:
             bin_data = self.db_datas
-        index_content = self._index(
-            bin_data.decode('base64'), self.datas_fname, self.mimetype)
-        return {'value': {
-            'index_content': index_content}}
+        if bin_data:
+            index_content = self._index(
+                bin_data.decode('base64'), self.datas_fname, self.mimetype)
+            return {'value': {
+                'index_content': index_content}}
+        return {'value': {}}
 
     @api.model
     def _index(self, bin_data, datas_fname, mimetype):
-        if not self.language:
-            # Set default language
-            self.language = self.env['ir.config_parameter'].get_param(
-                'document_ocr.language', 'eng')
         content = super(IrAttachment, self)._index(
             bin_data, datas_fname, mimetype)
         if not content or content == 'image':
diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py
index 9765e291..fa5c6137 100644
--- a/document_ocr/tests/test_document_ocr.py
+++ b/document_ocr/tests/test_document_ocr.py
@@ -4,10 +4,11 @@
 # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
 from StringIO import StringIO
 
-from PIL import Image, ImageDraw, ImageFont
-from ..models.ir_attachment import _MARKER_PHRASE
+from PIL import Image, ImageDraw, ImageFont, PdfImagePlugin, PalmImagePlugin
 from odoo.tests.common import TransactionCase
 
+from ..models.ir_attachment import _MARKER_PHRASE
+
 
 class TestDocumentOcr(TransactionCase):
     def test_document_ocr(self):
@@ -20,15 +21,17 @@ class TestDocumentOcr(TransactionCase):
         # test a plain image
         data = StringIO()
         test_image.save(data, 'png')
-        result = self.env['ir.attachment']._index(
+        attachment = self.env['ir.attachment'].create({
+            'name': 'testattachment'})
+        result = attachment._index(
             data.getvalue(), 'test.png', None)
-        self.assertEqual(result[1].strip(), 'Hello world')
+        self.assertEqual(result.strip(), 'Hello world')
         # should also work for pdfs
         data = StringIO()
         test_image.save(data, 'pdf', resolution=300)
-        result = self.env['ir.attachment']._index(
+        result = attachment._index(
             data.getvalue(), 'test.pdf', None)
-        self.assertEqual(result[1].strip(), 'Hello world')
+        self.assertEqual(result.strip(), 'Hello world')
         # check cron
         self.env['ir.config_parameter'].set_param(
             'document_ocr.synchronous', 'False')
@@ -39,12 +42,12 @@ class TestDocumentOcr(TransactionCase):
         self.assertEqual(attachment.index_content, _MARKER_PHRASE)
         attachment._ocr_cron()
         self.assertEqual(attachment.index_content.strip(), 'Hello world')
-        # and for an unreadable image, we expect an error
+        # and for an unreadable image, we expect an empty string
         self.env['ir.config_parameter'].set_param(
             'document_ocr.synchronous', 'True')
         data = StringIO()
         test_image = Image.new('1', (200, 30))
-        test_image.save(data, 'Palm')
-        result = self.env['ir.attachment']._index(
+        test_image.save(data, 'palm')
+        result = attachment._index(
             data.getvalue(), 'test.palm', None)
-        self.assertEqual(result[1], None)
+        self.assertEqual(result, '')

From f1c5c8238b54bae3afda69c512f150d0cb75ee4c Mon Sep 17 00:00:00 2001
From: Carlos Almeida <carlos.almeida@tkobr.com>
Date: Tue, 6 Jun 2017 11:59:56 +0100
Subject: [PATCH 12/16] Fix flake8 unused import error

---
 document_ocr/tests/test_document_ocr.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py
index fa5c6137..d2bff780 100644
--- a/document_ocr/tests/test_document_ocr.py
+++ b/document_ocr/tests/test_document_ocr.py
@@ -28,6 +28,7 @@ class TestDocumentOcr(TransactionCase):
         self.assertEqual(result.strip(), 'Hello world')
         # should also work for pdfs
         data = StringIO()
+        PdfImagePlugin  # to use import :/
         test_image.save(data, 'pdf', resolution=300)
         result = attachment._index(
             data.getvalue(), 'test.pdf', None)
@@ -47,6 +48,7 @@ class TestDocumentOcr(TransactionCase):
             'document_ocr.synchronous', 'True')
         data = StringIO()
         test_image = Image.new('1', (200, 30))
+        PalmImagePlugin  # to use import :/
         test_image.save(data, 'palm')
         result = attachment._index(
             data.getvalue(), 'test.palm', None)

From b22221023c75e0e464f02d531b4dfcc127062c63 Mon Sep 17 00:00:00 2001
From: Carlos Almeida <carlos.almeida@tkobr.com>
Date: Tue, 6 Jun 2017 12:53:08 +0100
Subject: [PATCH 13/16] Deal with unused import

---
 document_ocr/tests/test_document_ocr.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py
index d2bff780..bf9c9b36 100644
--- a/document_ocr/tests/test_document_ocr.py
+++ b/document_ocr/tests/test_document_ocr.py
@@ -4,7 +4,8 @@
 # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
 from StringIO import StringIO
 
-from PIL import Image, ImageDraw, ImageFont, PdfImagePlugin, PalmImagePlugin
+from PIL import Image, ImageDraw, ImageFont
+from PIL import PdfImagePlugin, PalmImagePlugin # pylint: disable=unused-import
 from odoo.tests.common import TransactionCase
 
 from ..models.ir_attachment import _MARKER_PHRASE
@@ -28,7 +29,6 @@ class TestDocumentOcr(TransactionCase):
         self.assertEqual(result.strip(), 'Hello world')
         # should also work for pdfs
         data = StringIO()
-        PdfImagePlugin  # to use import :/
         test_image.save(data, 'pdf', resolution=300)
         result = attachment._index(
             data.getvalue(), 'test.pdf', None)
@@ -48,7 +48,6 @@ class TestDocumentOcr(TransactionCase):
             'document_ocr.synchronous', 'True')
         data = StringIO()
         test_image = Image.new('1', (200, 30))
-        PalmImagePlugin  # to use import :/
         test_image.save(data, 'palm')
         result = attachment._index(
             data.getvalue(), 'test.palm', None)

From f7903b10da70bc6bbf030567754d30a736c558d3 Mon Sep 17 00:00:00 2001
From: Carlos Almeida <carlos.almeida@tkobr.com>
Date: Tue, 6 Jun 2017 12:55:35 +0100
Subject: [PATCH 14/16] Also ignore unused import for flake8

---
 document_ocr/tests/test_document_ocr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py
index bf9c9b36..3d4cf69f 100644
--- a/document_ocr/tests/test_document_ocr.py
+++ b/document_ocr/tests/test_document_ocr.py
@@ -5,7 +5,7 @@
 from StringIO import StringIO
 
 from PIL import Image, ImageDraw, ImageFont
-from PIL import PdfImagePlugin, PalmImagePlugin # pylint: disable=unused-import
+from PIL import PdfImagePlugin, PalmImagePlugin # noqa # pylint: disable=unused-import
 from odoo.tests.common import TransactionCase
 
 from ..models.ir_attachment import _MARKER_PHRASE

From f80491a589708cb41b3f8d67a498c0f0023b543e Mon Sep 17 00:00:00 2001
From: Carlos Almeida <carlos.almeida@tkobr.com>
Date: Tue, 6 Jun 2017 14:13:14 +0100
Subject: [PATCH 15/16] Improve logger messages

---
 document_ocr/models/ir_attachment.py    |  2 +-
 document_ocr/tests/test_document_ocr.py | 10 +++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py
index efbd9b18..18102d52 100644
--- a/document_ocr/models/ir_attachment.py
+++ b/document_ocr/models/ir_attachment.py
@@ -177,6 +177,7 @@ class IrAttachment(models.Model):
         return content
 
     def _index_ocr(self, bin_data):
+        _logger.info('OCR IMAGE "%s"...', self.datas_fname)
         process = subprocess.Popen(
             ['tesseract', 'stdin', 'stdout', '-l', self.language],
             stdin=subprocess.PIPE, stdout=subprocess.PIPE,
@@ -237,7 +238,6 @@ class IrAttachment(models.Model):
                             buf = u'%s\n-- %d --\n%s' % (
                                 buf, pagenum + 1, index_content.decode('utf8'))
                     else:
-                        _logger.info('OCR PDF "%s"...', self.datas_fname)
                         pdf_image = convert_bin_to_image(self, bin_data)
                         index_content = self._index_ocr(pdf_image)
                         buf = u'%s\n%s' % (buf, index_content.decode('utf8'))
diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py
index 3d4cf69f..1d1a5490 100644
--- a/document_ocr/tests/test_document_ocr.py
+++ b/document_ocr/tests/test_document_ocr.py
@@ -23,13 +23,17 @@ class TestDocumentOcr(TransactionCase):
         data = StringIO()
         test_image.save(data, 'png')
         attachment = self.env['ir.attachment'].create({
-            'name': 'testattachment'})
+            'name': 'testattachment',
+            'datas_fname': 'test_png.pdf'})
         result = attachment._index(
             data.getvalue(), 'test.png', None)
         self.assertEqual(result.strip(), 'Hello world')
         # should also work for pdfs
         data = StringIO()
         test_image.save(data, 'pdf', resolution=300)
+        attachment = self.env['ir.attachment'].create({
+            'name': 'testattachment',
+            'datas_fname': 'test_pdf.pdf'})
         result = attachment._index(
             data.getvalue(), 'test.pdf', None)
         self.assertEqual(result.strip(), 'Hello world')
@@ -38,6 +42,7 @@ class TestDocumentOcr(TransactionCase):
             'document_ocr.synchronous', 'False')
         attachment = self.env['ir.attachment'].create({
             'name': 'testattachment',
+            'datas_fname': 'test_cron.pdf',
             'datas': data.getvalue().encode('base64'),
         })
         self.assertEqual(attachment.index_content, _MARKER_PHRASE)
@@ -49,6 +54,9 @@ class TestDocumentOcr(TransactionCase):
         data = StringIO()
         test_image = Image.new('1', (200, 30))
         test_image.save(data, 'palm')
+        attachment = self.env['ir.attachment'].create({
+            'name': 'testattachment',
+            'datas_fname': 'test_err.palm'})
         result = attachment._index(
             data.getvalue(), 'test.palm', None)
         self.assertEqual(result, '')

From 3b3c60b348b44c390d048984c64e1fa444553df2 Mon Sep 17 00:00:00 2001
From: Carlos Almeida <carlos.almeida@tkobr.com>
Date: Tue, 6 Jun 2017 15:55:09 +0100
Subject: [PATCH 16/16] Remove migrated document_ocr module

---
 document_ocr/README.rst                   | 101 ---------
 document_ocr/__init__.py                  |   5 -
 document_ocr/__manifest__.py              |  27 ---
 document_ocr/data/ir_config_parameter.xml |  21 --
 document_ocr/data/ir_cron.xml             |  13 --
 document_ocr/models/__init__.py           |   5 -
 document_ocr/models/ir_attachment.py      | 261 ----------------------
 document_ocr/static/description/icon.png  | Bin 9455 -> 0 bytes
 document_ocr/tests/__init__.py            |   5 -
 document_ocr/tests/test_document_ocr.py   |  62 -----
 document_ocr/views/ir_attachment_view.xml |  43 ----
 11 files changed, 543 deletions(-)
 delete mode 100644 document_ocr/README.rst
 delete mode 100644 document_ocr/__init__.py
 delete mode 100644 document_ocr/__manifest__.py
 delete mode 100644 document_ocr/data/ir_config_parameter.xml
 delete mode 100644 document_ocr/data/ir_cron.xml
 delete mode 100644 document_ocr/models/__init__.py
 delete mode 100644 document_ocr/models/ir_attachment.py
 delete mode 100644 document_ocr/static/description/icon.png
 delete mode 100644 document_ocr/tests/__init__.py
 delete mode 100644 document_ocr/tests/test_document_ocr.py
 delete mode 100644 document_ocr/views/ir_attachment_view.xml

diff --git a/document_ocr/README.rst b/document_ocr/README.rst
deleted file mode 100644
index c500f0b1..00000000
--- a/document_ocr/README.rst
+++ /dev/null
@@ -1,101 +0,0 @@
-.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg
-    :target: http://www.gnu.org/licenses/agpl-3.0-standalone.html
-    :alt: License: AGPL-3
-
-=================
-OCR for documents
-=================
-
-This module was written to make uploaded documents, for example scans, searchable by running OCR on them.
-
-It supports all image formats `Pillow supports <http://pillow.readthedocs.io/en/3.2.x/handbook/image-file-formats.html>`_ for reading and PDFs.
-
-Installation
-============
-
-To install this module, you need to:
-
-#. install tesseract and the language(s) your documents use
-#. if you want to support OCR on PDFs, install imagemagick
-#. install the module itself
-
-On an Debian or Ubuntu system you would typically run::
-
-    $ sudo apt-get install tesseract-ocr imagemagick
-
-
-Configuration
-=============
-
-To configure this module, go to:
-
-#. Settings/Technical/Parameters/System parameters and review the parameters with names document_ocr.*
-
-Usage
-=====
-
-By default, character recognition is done asynchronously by a cronjob at night. 
-This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish.
-The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ` `Settings``.
-In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``.
-
-
-By default, recognition language is set to english.
-In case you want to use a different default, set configuration parameter ``document_ocr.language`` to value respective value ex:``por``, for Portuguese.
-
-
-In PDF case, OCR will run after it will be converted to an image. But OCR will be applied to all PDFs.
-
-
-System parameters used:
-#``document_ocr.synchronous``:  bool
-#``document_ocr.language``:  string
-#``document_ocr.dpi``:  integer
-#``document_ocr.quality``:  integer
-
-
-.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas
-    :alt: Try me on Runbot
-    :target: https://runbot.odoo-community.org/runbot/118/10.0
-
-Bug Tracker
-===========
-
-Bugs are tracked on `GitHub Issues <https://github.com/OCA/knowledge/issues>`_.
-In case of trouble, please check there if your issue has already been reported.
-If you spotted it first, help us smashing it by providing a detailed and welcomed feedback.
-
-Credits
-=======
-
-The actual work
----------------
-
-* `tesseract <https://github.com/tesseract-ocr>`_
-
-Images
-------
-
-* Odoo Community Association: `Icon <https://github.com/OCA/maintainer-tools/blob/master/template/module/static/description/icon.svg>`_.
-
-Contributors
-------------
-
-* Holger Brunn <hbrunn@therp.nl>  
-
-Do not contact contributors directly about help with questions or problems concerning this addon, but use the `community mailing list <mailto:community@mail.odoo.com>`_ or the `appropriate specialized mailinglist <https://odoo-community.org/groups>`_ for help, and the bug tracker linked in `Bug Tracker`_ above for technical issues.
-
-Maintainer
-----------
-
-.. image:: https://odoo-community.org/logo.png
-   :alt: Odoo Community Association
-   :target: https://odoo-community.org
-
-This module is maintained by the OCA.
-
-OCA, or the Odoo Community Association, is a nonprofit organization whose
-mission is to support the collaborative development of Odoo features and
-promote its widespread use.
-
-To contribute to this module, please visit https://odoo-community.org.
diff --git a/document_ocr/__init__.py b/document_ocr/__init__.py
deleted file mode 100644
index 472456b6..00000000
--- a/document_ocr/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# -*- coding: utf-8 -*-
-# © 2016 Therp BV <http://therp.nl>
-# © 2017 ThinkOpen Solutions <https://tkobr.com>
-# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
-from . import models
diff --git a/document_ocr/__manifest__.py b/document_ocr/__manifest__.py
deleted file mode 100644
index ad012794..00000000
--- a/document_ocr/__manifest__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# -*- coding: utf-8 -*-
-# © 2016 Therp BV <http://therp.nl>
-# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
-{
-    "name": "OCR for Documents",
-    "version": "10.0.1.0.0",
-    "author": "Therp BV,"
-              " Odoo Community Association (OCA),"
-              " ThinkOpen Solutions Brasil",
-    "license": "AGPL-3",
-    "category": "Knowledge Management",
-    "summary": "Run character recognition on uploaded files",
-    "depends": [
-        'document',
-    ],
-    "data": [
-        "data/ir_cron.xml",
-        "data/ir_config_parameter.xml",
-        "views/ir_attachment_view.xml",
-    ],
-    "external_dependencies": {
-        'bin': [
-            'tesseract',
-            'convert',
-        ],
-    },
-}
diff --git a/document_ocr/data/ir_config_parameter.xml b/document_ocr/data/ir_config_parameter.xml
deleted file mode 100644
index 721a0740..00000000
--- a/document_ocr/data/ir_config_parameter.xml
+++ /dev/null
@@ -1,21 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<openerp>
-    <data noupdate="1">
-        <record id="param_synchronous" model="ir.config_parameter">
-            <field name="key">document_ocr.synchronous</field>
-            <field name="value">False</field>
-        </record>
-        <record id="param_dpi" model="ir.config_parameter">
-            <field name="key">document_ocr.dpi</field>
-            <field name="value">300</field>
-        </record>
-        <record id="param_quality" model="ir.config_parameter">
-            <field name="key">document_ocr.quality</field>
-            <field name="value">100</field>
-        </record>
-        <record id="param_language" model="ir.config_parameter">
-            <field name="key">document_ocr.language</field>
-            <field name="value">eng</field>
-        </record>
-    </data>
-</openerp>
diff --git a/document_ocr/data/ir_cron.xml b/document_ocr/data/ir_cron.xml
deleted file mode 100644
index f69d151a..00000000
--- a/document_ocr/data/ir_cron.xml
+++ /dev/null
@@ -1,13 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<openerp>
-    <data noupdate="1">
-        <record id="cron" model="ir.cron">
-            <field name="name">Run OCR on uploaded documents</field>
-            <field name="interval_type">days</field>
-            <field name="interval_number">1</field>
-            <field name="model">ir.attachment</field>
-            <field name="function">_ocr_cron</field>
-            <field name="numbercall">-1</field>
-        </record>
-    </data>
-</openerp>
diff --git a/document_ocr/models/__init__.py b/document_ocr/models/__init__.py
deleted file mode 100644
index 051b3ddf..00000000
--- a/document_ocr/models/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# -*- coding: utf-8 -*-
-# © 2016 Therp BV <http://therp.nl>
-# © 2017 ThinkOpen Solutions <https://tkobr.com>
-# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
-from . import ir_attachment
diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py
deleted file mode 100644
index 18102d52..00000000
--- a/document_ocr/models/ir_attachment.py
+++ /dev/null
@@ -1,261 +0,0 @@
-# -*- coding: utf-8 -*-
-# © 2016 Therp BV <http://therp.nl>
-# © 2017 ThinkOpen Solutions <https://tkobr.com>
-# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
-
-import io
-import logging
-import subprocess
-from StringIO import StringIO
-
-import pyPdf
-from odoo import api, fields, models, _
-from odoo.exceptions import UserError
-
-_logger = logging.getLogger(__name__)
-_MARKER_PHRASE = '[[waiting for OCR]]'
-OCR_LANGUAGE = [('afr', 'Afrikaans'),
-                ('amh', 'Amharic'),
-                ('ara', 'Arabic'),
-                ('asm', 'Assamese'),
-                ('aze', 'Azerbaijani'),
-                ('aze_cyrl', 'Azerbaijani - Cyrilic'),
-                ('bel', 'Belarusian'),
-                ('ben', 'Bengali'),
-                ('bod', 'Tibetan'),
-                ('bos', 'Bosnian'),
-                ('bul', 'Bulgarian'),
-                ('cat', 'Catalan; Valencian'),
-                ('ceb', 'Cebuano'),
-                ('ces', 'Czech'),
-                ('chi_sim', 'Chinese - Simplified'),
-                ('chi_tra', 'Chinese - Traditional'),
-                ('chr', 'Cherokee'),
-                ('cym', 'Welsh'),
-                ('dan', 'Danish'),
-                ('dan_frak', 'Danish - Fraktur'),
-                ('deu', 'German'),
-                ('deu_frak', 'German - Fraktur'),
-                ('dzo', 'Dzongkha'),
-                ('ell', 'Greek, Modern (1453-)'),
-                ('eng', 'English'),
-                ('enm', 'English, Middle (1100-1500)'),
-                ('epo', 'Esperanto'),
-                ('equ', 'Math / equation detection module'),
-                ('est', 'Estonian'),
-                ('eus', 'Basque'),
-                ('fas', 'Persian'),
-                ('fin', 'Finnish'),
-                ('fra', 'French'),
-                ('frk', 'Frankish'),
-                ('frm', 'French, Middle (ca.1400-1600)'),
-                ('gle', 'Irish'),
-                ('glg', 'Galician'),
-                ('grc', 'Greek, Ancient (to 1453)'),
-                ('guj', 'Gujarati'),
-                ('hat', 'Haitian; Haitian Creole'),
-                ('heb', 'Hebrew'),
-                ('hin', 'Hindi'),
-                ('hrv', 'Croatian'),
-                ('hun', 'Hungarian'),
-                ('iku', 'Inuktitut'),
-                ('ind', 'Indonesian'),
-                ('isl', 'Icelandic'),
-                ('ita', 'Italian'),
-                ('ita_old', 'Italian - Old'),
-                ('jav', 'Javanese'),
-                ('jpn', 'Japanese'),
-                ('kan', 'Kannada'),
-                ('kat', 'Georgian'),
-                ('kat_old', 'Georgian - Old'),
-                ('kaz', 'Kazakh'),
-                ('khm', 'Central Khmer'),
-                ('kir', 'Kirghiz; Kyrgyz'),
-                ('kor', 'Korean'),
-                ('kur', 'Kurdish'),
-                ('lao', 'Lao'),
-                ('lat', 'Latin'),
-                ('lav', 'Latvian'),
-                ('lit', 'Lithuanian'),
-                ('mal', 'Malayalam'),
-                ('mar', 'Marathi'),
-                ('mkd', 'Macedonian'),
-                ('mlt', 'Maltese'),
-                ('msa', 'Malay'),
-                ('mya', 'Burmese'),
-                ('nep', 'Nepali'),
-                ('nld', 'Dutch; Flemish'),
-                ('nor', 'Norwegian'),
-                ('ori', 'Oriya'),
-                ('osd', 'Orientation and script detection module'),
-                ('pan', 'Panjabi; Punjabi'),
-                ('pol', 'Polish'),
-                ('por', 'Portuguese'),
-                ('pus', 'Pushto; Pashto'),
-                ('ron', 'Romanian; Moldavian; Moldovan'),
-                ('rus', 'Russian'),
-                ('san', 'Sanskrit'),
-                ('sin', 'Sinhala; Sinhalese'),
-                ('slk', 'Slovak'),
-                ('slk_frak', 'Slovak - Fraktur'),
-                ('slv', 'Slovenian'),
-                ('spa', 'Spanish; Castilian'),
-                ('spa_old', 'Spanish; Castilian - Old'),
-                ('sqi', 'Albanian'),
-                ('srp', 'Serbian'),
-                ('srp_latn', 'Serbian - Latin'),
-                ('swa', 'Swahili'),
-                ('swe', 'Swedish'),
-                ('syr', 'Syriac'),
-                ('tam', 'Tamil'),
-                ('tel', 'Telugu'),
-                ('tgk', 'Tajik'),
-                ('tgl', 'Tagalog'),
-                ('tha', 'Thai'),
-                ('tir', 'Tigrinya'),
-                ('tur', 'Turkish'),
-                ('uig', 'Uighur; Uyghur'),
-                ('ukr', 'Ukrainian'),
-                ('urd', 'Urdu'),
-                ('uzb', 'Uzbek'),
-                ('uzb_cyrl', 'Uzbek - Cyrilic'),
-                ('vie', 'Vietnamese'),
-                ('yid', 'Yiddish'), ]
-
-
-class IrAttachment(models.Model):
-    _inherit = 'ir.attachment'
-
-    language = fields.Selection(OCR_LANGUAGE, 'Language',
-                                default=lambda self:
-                                self.env['ir.config_parameter'].get_param(
-                                    'document_ocr.language', 'eng'))
-    # We need to redefine index_content field to be able to update it
-    # on the onchange_language()
-    index_content = fields.Text('Indexed Content',
-                                readonly=False,
-                                prefetch=False)
-    index_content_rel = fields.Text(related='index_content',
-                                    string='Indexed Content Rel')
-
-    @api.onchange('language')
-    def onchange_language(self):
-        process = subprocess.Popen(['tesseract', '--list-langs'],
-                                   stdout=subprocess.PIPE,
-                                   stderr=subprocess.PIPE)
-        stdout, stderr = process.communicate()
-        if self.language not in stderr.split('\n'):
-            raise UserError(_(
-                "Language not installed."
-                " Please ask your system administrator to"
-                " install tesseract '%s' language." %
-                self.language))
-        if self.store_fname:
-            bin_data = self._file_read(self.store_fname)
-        else:
-            bin_data = self.db_datas
-        if bin_data:
-            index_content = self._index(
-                bin_data.decode('base64'), self.datas_fname, self.mimetype)
-            return {'value': {
-                'index_content': index_content}}
-        return {'value': {}}
-
-    @api.model
-    def _index(self, bin_data, datas_fname, mimetype):
-        content = super(IrAttachment, self)._index(
-            bin_data, datas_fname, mimetype)
-        if not content or content == 'image':
-            has_synchr_param = self.env['ir.config_parameter'].get_param(
-                'document_ocr.synchronous', 'False') == 'True'
-            has_force_flag = self.env.context.get('document_ocr_force')
-            synchr = has_synchr_param or has_force_flag
-            if synchr:
-                content = self._index_ocr(bin_data)
-            else:
-                content = _MARKER_PHRASE
-        return content
-
-    def _index_ocr(self, bin_data):
-        _logger.info('OCR IMAGE "%s"...', self.datas_fname)
-        process = subprocess.Popen(
-            ['tesseract', 'stdin', 'stdout', '-l', self.language],
-            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-        )
-        stdout, stderr = process.communicate(bin_data)
-        if stderr:
-            _logger.error('Error during OCR: %s', stderr)
-        return stdout
-
-    def _index_pdf(self, bin_data):
-
-        def convert_bin_to_image(self, bin_data):
-            dpi = int(self.env['ir.config_parameter'].get_param(
-                'document_ocr.dpi', '500'))
-            quality = int(self.env['ir.config_parameter'].get_param(
-                'document_ocr.quality', '100'))
-            process = subprocess.Popen(
-                ['convert', '-density', str(dpi),
-                 '-quality', str(quality),
-                 '-', '-append', 'png32:-'],
-                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE)
-            stdout, stderr = process.communicate(bin_data)
-            if stderr:
-                _logger.error('Error converting PDF to image: %s', stderr)
-            return stdout
-
-        def _convert_pdf_page_to_image(self, pdf, pagenum):
-            dst_pdf = pyPdf.PdfFileWriter()
-            dst_pdf.addPage(pdf.getPage(pagenum))
-            pdf_bytes = io.BytesIO()
-            dst_pdf.write(pdf_bytes)
-            pdf_bytes.seek(0)
-            return convert_bin_to_image(self, pdf_bytes.read())
-
-        has_synchr_param = self.env['ir.config_parameter'].get_param(
-            'document_ocr.synchronous', 'False') == 'True'
-        has_force_flag = self.env.context.get('document_ocr_force')
-        synchr = has_synchr_param or has_force_flag
-        if synchr:
-            buf = super(IrAttachment, self)._index_pdf(bin_data)
-            if len(buf.split('\n')) < 2 and bin_data.startswith('%PDF-'):
-                # If we got less than 2 lines,
-                # run OCR anyway and append to existent text
-                try:
-                    f = StringIO(bin_data)
-                    pdf = pyPdf.PdfFileReader(f)
-                    if pdf.getNumPages() > 1:
-                        for pagenum in range(0, pdf.getNumPages()):
-                            _logger.info('OCR PDF "%s" page %d/%d...',
-                                         self.datas_fname,
-                                         pagenum + 1,
-                                         pdf.getNumPages())
-                            pdf_image = _convert_pdf_page_to_image(self, pdf,
-                                                                   pagenum)
-                            index_content = self._index_ocr(pdf_image)
-                            buf = u'%s\n-- %d --\n%s' % (
-                                buf, pagenum + 1, index_content.decode('utf8'))
-                    else:
-                        pdf_image = convert_bin_to_image(self, bin_data)
-                        index_content = self._index_ocr(pdf_image)
-                        buf = u'%s\n%s' % (buf, index_content.decode('utf8'))
-                except Exception as e:
-                    _logger.error('Error converting PDF to image: %s', e)
-                    pass
-        else:
-            buf = _MARKER_PHRASE
-        return buf
-
-    @api.model
-    def _ocr_cron(self):
-        for this in self.with_context(document_ocr_force=True).search(
-                [('index_content', '=', _MARKER_PHRASE)]):
-            if not this.datas:
-                continue
-            index_content = this._index(
-                this.datas.decode('base64'), this.datas_fname, this.mimetype)
-            this.write({
-                'index_content': index_content,
-            })
diff --git a/document_ocr/static/description/icon.png b/document_ocr/static/description/icon.png
deleted file mode 100644
index 3a0328b516c4980e8e44cdb63fd945757ddd132d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 9455
zcmW++2RxMjAAjx~&dlBk9S+%}OXg)AGE&Cb*&}<C%<R2Kc9faym6aW`f0Dh5$js*d
z_}}Z!;XIG;_cPz`_vag-p{7Ve$Uq1H00~A(?iu(VaQlMefnU3&OozZXm@69d91cGG
z;O61r&je0NdamH#&)mKsXk?Zb_)B^>d0jUxM@u(PQx^-s)6<jB#=*|j%+$$(&(Xyy
zYgd8+09XKwoa}S2?48%%ZU#L~n^g{fE40h%YEx5by;GuJ(K|vI1uO;0m}=J7R4@Bs
z>97TX<v{QF=s?r`z`m$a0ZvrhBU7}{15RN9M`j!kv_Mp+3~{||YNuFzHNvhMYm3=Q
zo!q+OJ5(%-oNM^I^M%*luKMiVL`lo`bcKGymX7i3MIFWj1i|9+CGNvn`cu-RsK0Qh
zoYlwB>`ehR4?GS^qbkof1cslKgk<Uw6DeIxZyT_?=OwX|lwC*A?ac*gHF7jmS080$
z>U)h65qZ9Oc=ml_0temigYLJfnz{IDzUf>bGs4N!v3=Z3jMq&A#7%rM5eQ#dc?k~!
zVpnB`o+K7|Al`Q_U<UrcmRnh6jExs}l(hZs0%T~Dnpu-NENdhioRv(T{Qmv>;eD$B
zfJtP*jH`siUq~{KE)`jP2|#TUEFGRryE2`i0**z#*^6~AI|YzIWy$Cu#CSLW3q=GA
z6`?GZymC;dCPk~rBS%eCb`5OLr;RUZ;D`}um=H)BfVIq%7VhiMr)_#G0N#zrNH|__
zc+blN2UAB0=617@>_<D4$IPL<k1zq(*VmlDPer&h1n6`AG;9A!_W=N$JthhQWXZ<i
zGURc6f<i*joK3xSWaOOXxAc8ES>u;MPHN;P;N#YoE=)R#i$k_`UAA>WWCcEVMh~L_
zj--gtp&|K1#58Yz*AHCTMziU1Jzt_jG0I@qAOHsk$2}yTmVkBp_eHuY$A9)>P6o~I
z%aQ?!(GqeQ-Y+b0I(m9pwgi(IIZZzsbMv+9w{PFtd_<_(LA~0H(xz<Z(Qt1jC2cC|
z6WbMo9YgON{L#ZDl$sV4*<CP(>{=FhLB@(1&qHA5EJw1>>=%q2f&^X>IQ{!GJ4e9U
z&KlB)z(84HmNgm2hg2C0>WM{E(DdPr+EeU_N@57;PC2&DmGFW_9kP&%?X4}+xWi)(
z;)z%wI5>D4a*5XwD)P--sPkoY(a~WBw;E~AW`Yue4kFa^LM3X`8x|}ZUeMnqr}>kH
zG%WWW>3ml$Yez?i%)2pbKPI7?5o?hydokgQyZsNEr{a|mLdt;X2TX(#B1j35xPnPW
z*bMSSOauW>o;*=kO8ojw91VX!qoOQb)zHJ!odWB}d+*K?#sY_jqPdg{Sm2HdYzdEx
zOGVPhVRTGPtv0o}RfVP;Nd(|CB)<HrC1(%ZOEd8PI?r9b_$Cp-${bh59Z_R7n&YCp
zl8lfMpes*8{FVm;oJH@0LLoWmxD59u?JwHz2iRrAaE0`Hc&g;t5~$P*kdeOZn9OJ3
zRczo@ZkWU)RG+hcfH~{49Vvb3D(W0wRX#|$cA0Iqy^VR~(4hqA2AFI-rK!FM!#fJ_
zGFI@iqJOPXZ!=VjTS3dMuu~9xU3K1&?th;$)cqM#WGxbDEyB$y`#9j%>I;*t&QO8h
zFfekr30S!-LHmV_Su-W+rEwYXJ^;6&3|L$mMC8*bQptyOo9;>Qb9Q9`ySe3%V$A*9
zeKEe+b0{#KWGp$F+tga)0RtI)nhMa-K@JS}2krK~n8vJ=Ngm?R!9G<~RyuU0d?nz#
z-5EK$o(!F?hmX*2Yt6+coY`6jGbb7t<dboft~zeDZwK=+l2bFWs5^+|r{J6GO9Cwl
z&SW58BRs?XdFLuhtzl7WLbQ#~z#`jAB3AbSUg6jW6@qVd2Q~9qN(izT1y*>F#6nHA
zuKk=GGJ;ZwON1iAfG$E#Y7MnZVmrY|j0eVI(DN_MNFJmyZ|;w4tf@=CCDZ#5N_0K=
z$;R~bbk?}TpfDjfB&aiQ$VA}s?P}xPERJG{kxk5~R`iRS(SK5d+Xs9swCozZISbnS
zk!)I0>t=A<-^z(cmSFz3=jZ23u13X><0b)P)^1T_))Kr`e!-pb#q&J*Q`p+B6la%C
zuVl&0duN<;uOsB3%T9Fp8t{ED108<?)`y_~Hnd9AUX7h-H?jVuU|}My+C=TjH(jKz
zqMVr0re3S$H@t{zI95qa)+Crz*5Zj}Ao%4Z><+W(nOZd?gDnfNBC3>M8WE61$So|P
zVvqH0SNtDTcs<xiU1;=a39$d&&l5EwoIH1db#`S9Kw!YC1jhR)VbCWMptlUUkpfif
zMzi-i8)WL?|C$*Q?iqbS{w<lA9XN(rD)VS<zn>UdzaMDpT=Ty0pDHHNL@Z0w$Y`XO
z2M-_r1S+GaH%pz#Uy0*w$Vdl=X=rQXEzO}d6J^R6zjM1u&c9vYLvLp?W7w(?np9x1
zE_0JSAJCPB%i7p*Wvg)pn5T`8k3-uR?*NT|J`eS#_#54p>!p(mLDvmc-3o0mX*mp_
zN*AeS<>#^-{S%W<*mz^!X$w_2dHWpcJ6^j64qFBft-o}o_Vx80o0>}Du;>kLts;$8
zC`7q$QI(dKYG`Wa8#wl@V4jVWBRGQ@1dr-hstpQL)Tl+aqVpGpbSfN>5i&QMXfiZ>
zaA?T1VGe?rpQ@;+pkrVdd{klI&jVS@I5_iz!=UMpTsa~mBga?1r}a<Xwa$pLotZk<
zsykXwQO37I=aXew7x=}YWiW)^p)|C#g+)an!@j?EcY8C0t)BlK#y{YLtkJ6=D6H-5
zp2*ANf@>RBm1WS;TT*s0f0lY=JBl66Upy)-k4J}lh=P^8(SXk~0xW=T9v*B|gzIhN
z>qsO7dFd~mgxAy4V?&)=5ieYq?zi?ZEoj)&2o)RLy=<bK!vY7J*RP!&i_xU}i*o6o
z?xN*1<rEe1L2HBkCSgiYM3a|4w?Bo7CJL7?Eh;9An1m$1t?h1v92<Xg2(#)bjbL|o
zH_H0}!Og=1dOBynXHu>@hbCRcfT5ji<pmK_U*~T(A$I-*rM$8-BCv{=@=7F)2pdtU
z5==tp!}A*&Xgf{F>gwtQGE{L*8<@Yd{zg;CsL5mvzfDY}P-wos_6PfprFVaeqNE%h
zKZhLtcQld;ZD+>=nqN~>GvROfueSzJD&<KAVm#0W>BE*}XfU|H&(FssBqY=hPCt`d
zH?@s2>I(|;fcW&YM6#V<T#ysvE$@4oRO>#!kUIP8$Nkdh0A(bEVj``-AAyYgwY~jB
zT|I7Bf@%;7aL7Wf4dZ%VqF$eiaC38OV6oy3Z#TER2G+fOCd9Iaoy6aLYbPTN{XRPz
z;U!V|vBf%H!}52L2gH_+j;`bTcQRXB+y9onc^wLm5wi3-Be}U>k_u>2Eg$=k!(l@I
zcCg+flakT2Nej3i0yn+g+}%NYb?ta;R<sv-KjYb}UVFjITQ)VAEWu*)Lz7*-f^A*H
z<25#Ynt~-Qh?$wWx4$1=T2`j@bKp!)3IXh(LC@)%WGW$+j(tGz(6#bGw&K0j=Np@B
zt}@t$R`z(>?(g5SnwsQ49U8Wng8d|{B+lyRcEDvR3+`O{zfmrmvFrL6acVP%yG98X
zo&+VBg@px@i)%o?dG(`T;n*$S5*rnyiR#=wW}}GsAcfyQpE|>a{=$Hjg=-*_K;UtD
z<sX7(ZJc+Q;#xP8uk`jnF@a^|TWw*55if6@@}%7lOBaGo9Ia-e?`RPQc^w^EWfhe}
zHWHTvDA+r}Xf5Yqpr;QU-88%QC9F_>#z-)AXwSRY?<M%E84!g*1GC?E>OPefw^iI+
z)AXz#PfEjlwTes|_{sB?4(O@fg0AJ^g8gP}ex9Ucf*@_^J(s_5jJV}c)s$`Myn|Kd
z$<h)Fm>6>}#q^n{4vN@+Os$m7KV+`}c%4)4pv@06af4-x5#wj!KKb%caK{A&Y#Rfs
z-po?Dcb1({W=6FKIUirH&(yg=*6aLCekcKwyfK^JN5{wcA3nhO(o}SK#!CINhI`-I
z1)6&n7O&ZmyFMuNwvEic#IiOAwNkR=u5it{B9n2sAJV5pNhar=j5`*N!Na;c7g!l$
z3aYBqUkqqTJ=Re-;)s!EOeij=7SQZ3Hq}ZRds%IM*PtM$wV<GYik+VfZene%bm(n6
z`r@rc^TVw7EN{|O7rORMlw)zt(Z#enc3joE#IIk!M)L8gk^go9E9AxiP9o#OqmvV>
z@;rlc*NRK7i3y5BETSKuumEN`Xu_8<BQ)z0!-JuC8x}?$qo8SEko}oUWNI(4h*VHO
zAi!Egy!f(o9aHs2dhSE6ki(be*&w&+WLOB<(b3T_Hide(NvVvLQV{BN|2?UJFaY*@
z9CXA5B_*8i5Bf6sn~d`R>GP1Ri=OK<SGIb#BCTo3qI#UBUg+dkR+2ilUwIg%XFV;l
z+>Q$@I^ko8>H6)4rjiG5{VBM>B|%`&&s^)jS|-_95&yc=GqjNo{zFkw%%HHhS~e=s
zD#sfS+-?*t|J!+ozP6KvtOl!R)@@-z24}`9{QaVLD^9VCSR2b`b!KC#o;Ki<+wXB6
zx3&O0LOWcg4&rv4QG0)4yb}7BFSEg~=IR5<g6yi=XozU}zReXL{rA%c`$IQAs<ObZ
zep*ITZ0C(~W*`?XH^l1t7&+qigAfY9tVJ5DH!~jY>#ZRj8kg}dS7_V&^%#Do==#`u
zpy6{ox?jWuR(;pg+f@mT>#HGWHAJRRDDDv~@(IDw&R>9643kK<aUZ^OhBu;1e6t#{
zxKehVgz`HT;A`FMivG<tq0OcrXwHUX_<$j<uk%m>#HN`!1vBJHnC+RM&yIh8{gG2q
zA%e*U3|N0XSRa~oX-3EAneep)@{h2vvd3Xvy$7og(sayr@95+e6~Xvi1tUqnIxoIH
zVWo*OwYElb#uyW{Imam6f2<eMTQj#)z8+N&ZY?tSPo%&2eVNU=4=?rlO<*9Twaxco
zEVE=Jh?_x>rGbjR!Y3`#gPqkv57dB6K^wRGxc9B(t|aYDGS=m$&S!NmCtrMMaUg(c
zc2qC=2Z`EEFMW-me5B)24AqF*bV5Dr-M5ig(l-WPS%CgaPzs6p_gnCIvTJ=Y<6!gT
zVt@AfYCzjjsMEGi=rDQHo0yc;HqoRNnNFeWZgcm?f;cp(6CNylj36DoL(?TS7eU#+
z7&mfr#y))+CJOXQKUMZ7QIdS9@#-}7y2K1{8)cCt0~-X0O!O?Qx#E4Og+;A2SjalQ
zs7r?qn0H044=sDN$SRG$arw~n=+T_DNdSrarmu)V6@|?1-ZB#hRn`uilTGPJ@fqEy
zGt(f0B+^JDP&f=r{#Y_wi#AVDf-y!RIXU^0jXsFpf>=Ji*TeqSY!H~AMbJdCGLhC)
zn7Rx+sXw6uYj;WRYrLd^5IZq@6JI1C^YkgnedZEYy<&4(z%Q$5yv#B<pkvWZft^C&
z;+zNo+VJNluxaDF!`gW+F0=MxsCQ~~#CYKa;ULfj5hTa8a6;d*(<eeQ7-ZQgz2et^
zf|7URfj;x*#HiF0wuBCOTEpbeD&kkENqolCm2#cQGHCeEC<&I3j+P6?sX?BPp4~46
zx-S~EJ>oo{AH8n$<d4loB?vL3RqXwK_COrQ6xRoWGOdFnWy(hhzrG8k;2k}8Zj&>a
zhb4Y3PWdr269&?V%uI$xMcUrMzl=;w<_nm*qr=c3Rl@i5wWB;e-`t7D&c-mcQl7x!
zZWB`UGcw=Y2=}~wzrfLx=uet<;m3~=8I~ZRuzvMQUQdr+yTV|ATf1Uuomr__nDf=X
zZ3WYJtHp_ri(}SQAPjv+Y+0=<GD??Na(2AG`s>fH4krOP@S&=zZ-t1jW1o@}z;xk8
z(Nz1co&El^HK^NrhVHa-_;&88vTU>_J33=%{if;BEY*J#1n59=07jrGQ#IP>@u#3A
z;!q+E1Rj3ZJ+!4bq9F8PXJ@yMgZL;>&gYA0%_Kbi8?S=XGM~dnQZQ!yBSgcZhY96H
zrWnU;k)qy`rX&&xlDyA%(a1Hhi5CWkmg(`Gb%m(HKi-7Z!LKGRP_B8@`7&hdDy5n=
z`OIxqxiVfX@OX1p(mQu>0Ai*v_cTMiw4qRt3~N<X{IYiJ3k=4u-u*nJEBnJ<OdI8P
zXmK`OYkO5a{YDhI3PL|H4m=p>Bvr9oBy0)r>w3p~V0SCm=An6@3n)>@z!|o-$HvDK
z|3D2ZMJkLE5loMKl6R^ez@Zz%S$&mbeoqH5`Bb){Ei21q&VP)hWS2tjShfFtGE+$z
zzCR$P#uktu+#!w)cX!<osOkId_HzAT-HD2N`M+v2l+zwdW%GgZbQMuhfE*hHjKXKg
z45hphqVETPDbX6wpTlZqza0gFaRGh`3L~0id)F6#%@9;w(e%PjzuD7@inuToA!B?F
zCMLJc!u{3N)s?lB-!11!GxXsCak8zQomO)gmn02f-5~OkMYnm~#jVwwYNw@LAv!KN
z-n`q1|AXw*TW>lWN1XU%K-r=s{|j?)Akf@q#3b#{6cZCuJ~gCxuMXRmI$nGtnH+-h
z+GEi!*X=AP<|fG`1>MBdTb?28JYc=fGvAi2I<$B(rs$;eoJCyR6_bc~p!XR@O-+sD
z=eH`-ye})I5ic1eL~TDmtfJ|8`0VJ*Yr=hNCd)G1p2MMz4C3^Mj?7;!w|Ly%JqmuW
zlIEW^Ft%z?*|fpXda>Jr^1noFZEwFgVV%|*XhH@acv8rdGxeEX{M$(vG{Zw+x(ei@
zmfXb22}8-?Fi`vo-YVrTH*C?a8%M=Hv9MqVH7H^J$KsD?>!SFZ;ZsvnHr_gn=7acz
z#W?0eCdVhVMWN12VV^$>WlQ?f;P^{(&pYTops|btm6aj>_Uz+hqpGwB)vWp0Cf5y<
zft8-je~nn?W11plq}N)4A{l8I7$!ks_x$PXW-2XaRFswX_BnF{R#6YIwMhAgd5F9X
zGmwdadS6(a^fjHtXg8=l?Rc0Sm%hk6E9!5cLVloEy4eh(=FwgP`)~I^5~pBEWo+F6
zSf2ncyMurJN91#cJTy_u8Y}@%!bq1RkGC~-bV@SXRd4F{R-*V<h953|jk-C&|3)z%
ze&_30-6q1)a0(!xXqA+_t(WC`HA_yYaW@>`bS+6;W5vZ(&+I<9$;-V|eNfLa5n-6%
z2(}&uGRF;p9<Q%jdNy1%e7XTzyu7EEQT(5LrnvX~T%K!IuDyHY`ZneVJu#k_1T)xA
z*yxB?y4!pOJ$DTZzBm|8OIQq+AtV25aJ+X5EJjAu><tfuiNClI2BSoMgqLnU-5t95
zEnZ(^g~1RgD=UMB({c;$HujG&%DqGF;5jI~roRT+(rOoS$6f6SsURIuAVkAeIVc~{
z5ZxLN%m%Z*Sg;qYCf3<$dE6~&w_!EBx%yl4YC~LH{FCFNRBc_I>2eS*sE*o<YSPrx
z{M(reV{~jKue#Y6ZOnqJia{fQc!UxV4m)l389OmzR6A3|2!i<P{r82jKw+zq4%@ny
znopi6g!1Vx9Bml#a~KdL2lp2C)qSTKIh43vgycQHfQb_I!kQY&p;X@Bz|{^SXsW1K
zP&Ag1CW_r6u5-4=s(W>R$@pexaqr*meB)VhmIg@h{uzkk$9~qh#cHhw#>O%)b@+(|
z^IQgqzuj~Sk(J;swEM-3TrJAPCq9k^^^`q{IItKBRXYe}e0Tdr=Huf7da3$l4<V=<
zfr<*%iOB1EY}w3tF2Cwl7a2OS&~Y-lu<s)T^9=OFU8(CeN|63BiMxf*)5gesGU<eZ
z9DigzL3+quZ1o2T<KAZbCGJx&lrl*e#}Dpv24bZO$B<Yoc5hbeP0y?@P%|Tvw||e%
zV#e^q0D2R_Oq_c+=x5vP&hg4k+df{o7$aNZCM>PdpwWDop%^}n;dD#K4s#DYA8SHZ
z&1!riV4W4R7R#C))JH1~axJ)RYnM$$lIR%6fIVA@zV{XVyx}C+a-Dt8Y9M)^KU0+H
zR4IUb2CJ{Hg>CuaXtD50jB(_Tcx=Z$^W<wsNa}nUDpNk$q{>Yu2u5kubqmwp%drJ6
z?Fo40g!Qd<-l=TQxqHEOuPX0;^z7iX?Ke^a%XT<13TA^5`4Xcw6D@Ur&VT&CUe0d}
z1GjOVF1^L@>O)l@?bD~$wzgf(nxX1OGD8fEV?TdJcZc2KoUe|oP1#=$$7ee|xbY)A
zDZq+cuTpc(fFdj^=!;{k03C69lMQ(|>uhRfRu%+!k&<F<Z7l=VnwB^RA&^APMi=Tn
zNc}%IJL~xB4OP6bJNwAw)~Ma2=UP0dhr%X=kco(it+@F<#$xrIJ8@|{Buh<)h`xg?
z_U}pe=3#zmDfdqE4`Hyn<!?&CfCp#GTeXo8;AYd1Opd&cXER8cBSOF3iFJ#XPgQVp
zZSiQf7V^Dt?ITs8aMF~e1hq0P@^oB)#byh|6q_7F1B%ST%WL~J?ySn>YOi-3|1QKB
z<?_cUy)V3go%%^l)lRa=dmY_XQG3Z{Q?52d$qG}vIe|EZbYEtrR$rh(iS)O#dU-(>
z?n?eq1XP>p-IM$Z^C;2L3itnbJZAip*Zo0aw2bs8@(s^~*8T9go!%dHcAz2lM;`yp
zD=7&xjFV$S&5uDaiScyD?B-i1ze`+CoRtz`Wn+Zl&#<i<Zx(`=7k~{%`w&<EbG}U<
z0%E^8lyw>s4&}MO{@N!ufrzjG$B79)Y2d3tBk&)TxUTw@<TSeYlROCt(gU?O((-qu
zs>QS0TEL_?njX|<LXnSC4TlId(D4W|GQ?L@$3Ue@N8p=l9-sC<=z(lPk;^sT(v2*k
zc~vp#Hp`mXjzhml2NMCh^h5)sqsan+jJ_l<a4w|G&ac02hrz8#6|kFraA~rta0}!q
zB4BE{QWY7MtxHn_xB);Avf#Lf<GG<A?Q3JVyc1p8^H}$8(Gn=_&F1!m8$sKyeue+L
z5&392wm+wO;_oXoTEJ=wxVXk}dt<d~CgUUMW_PPTe(c<7n18#mVaX)vK}-PzZq7<b
zNJbVTFaq(8ZLx|A*G$H3ugT2aVziDEGa66FVt@hr1|D{RWN6yuqlglM!rp^YG;UpG
zrnm?ek079l3VoOU^p%f^A9Yzn8IVX?^rB4LbgJ|PONhzM_0{P?S(V$OI=u5IBjfT#
z0ntLLnhg5$0fAHJaG6HCx4X7;RmvMtE}8C>@vq?Uz(nBFK5Pq7*xj#u*R&i|?7+6#
z+|r_n#SW&LXhtheZdah{ZVoqwyT{D>MC3nkFF#N)xLi{p7J1jXlmVeb;cP5?e(=f#
zuT7fv<Q3o5LspCx-RPkzLw{VsDE>jSbjS781v?7{)-X3*?>tq?)Yd)~|1{BDS(pqC
zC}~H#WXlkUW*H5CDOo<)#x7%RY)A;ShGhI5s*#cRDA8YgqG(HeKDx+#(ZQ?386dv!
zlXCO)w91~Vw4AmOcATuV653fa9R$fyK8<JV*@W33SMHM;w!OmUXar{O;_8j>ul%rG
z-<zwGD)!Y{+(T{%ob~79zpXX%zulyiy1_UHWvu@YqxKAK3e9GO6Os4R0Naujn>wfS
zihugoZyr38Im?Zuh6@RcF~t1anQu7>#lPpb#}4cOA!EM11`%f*07RqOVkmX{p~KJ9
z^zP;K#|)$`^Rb{rnH<AdQ^(;gQMcF>GH{~>1(fawV0*Z#)}M`m8-?ZJV<+e}s9wE#
z)l&az?w^5{)`S(%MRzxdNqrs1n*-=jS^_jqE*5XDrA0+VE`5^*p3CuM<&dZEeCjoz
zR;uu_H9ZPZV|fQq`Cyw4nscrVwi!fE6ciMmX$!_hN7uF;jjKG)d2@aC4ropY)8<!P
zAOHj?oPbjQ%hh|vE_1IMB)43eQpeLi&)R>etW=xJvni)8eHi`H$%#zn^WJ<U7gogJ
z?A^!2J~rI78JH|Ml2EldmKY5K8;Zx(FGcBdM<5oe#G+nd6dObqz@Af%%M*aBE_pn8
zXQo2`Bw*IwvP3#1Ev<%YocpBodO9+Rw~`5*CY3{L;qf1PxV!r%NI+#Q1f8GU7$~#U
zAA9$4&g-k=8BYi*3i@0^UX}nTQVyOf)8T(}G^Ti?Vqvk~bJRR#EAQ?u`dClPxk@{;
zxvO?%jSX`2{8|^z0*C5DR1Z^>5NLc-rqk|u&&4Z6fD_m&JfSI1Bvb?b<*n&sfl0^t
z=HnmRl`XrFvMKB%9}>PaA`m-fK6a0(8=qPkWS5bb4=v?XcWi&hRY?O5HdulRi4?fN
zlsJ*N-0Qw+Yic@s0(2uy%F@ib;GjXt01F<S%GT3P{Ck&Y*^gWuie`o_g+ZNDNIV|F
z)zEe|Ij*4$H1(<RLz0($;AD3VsUJwI@liw^?fh(V?VC`Sz7h`*Q<VYlhbHJ?&h+!W
zAJC)U;Lx_QRaSNFYbyi|7+MeNTgA+Znh}qFzf>mx5XbRo6+n|pP(&nodMoap^z{~q
ziEeaUT@Mxe3vJSfI6?uLND(CNr=#^W<1b}jzW58bIfyWTDle$mmS(|x-0|2UlX+9k
zQ<WB5+u#`K#~J$81)#?BuTOKLk|+S>^EX7Nw}?EzVoBfT(-LT|=9N@^hcn-_p&sqG
z&*oVs2JSU+N4ZD`FhCAWaS;>|wH2G*Id|?pa#@>tyxX`+4HyIArWDvVrX)2WAOQff
z0qyHu&-S@i^MS-+j--!pr4fPBj~_8({~e1bfcl0wI1kaoN>mJL6KUPQm5N7lB(ui1
zE-o%kq)&djzWJ}ob<-GfDlkB;F31j-VHKvQUGQ3sp`CwyGJk_i!y^sD0fqC@$9|jO
zOqN!r!8-p==F@ZVP=U$qSpY(gQ0)59P1&t@y?5rvg<}E+GB}2<F#{*|k0E{$&_`~)
zkzDcsi#!7r<a8lPUCMj?{CK;e|9#-xj>6NYPp4f2YFQrQtot5mn3wu_qprZ=>Ig-$
zbW26Ws~IgY>}^5w`vTB(G`PTZaDiGBo5o(tp)qli|NeV(<Rzgqw(Ze!7hEGKKx((>
z@H_=R8V39rt5J5YB2Ky?4eJJ#b`_iBe2ot~6%7mLt5t8Vwi^Jy7|jWXqa3amOIoRb
zOr}WVFP--DsS`1WpN%~)t3R!arKF^Q$e12KEqU36AWwnCBICpH4XCsfnyrHr>$I$4
z!DpKX$OKLWarN7nv@!uIA+~RNO)l$$w}p(;b>mx8pwYvu;dD_unryX_N<mojSrG!`
zgkqx4t<XLL6ZUppvjeV9PJ6dG>hT8*Tj>BTrTTL&!?O+%Rv;b?B??gSzdp?6Uug9{
zd@V08Z$BdI?fpoCS$)t4mg4rT8Q_I}h`0d-vYZ^|dOB*Q^S|xqTV*<bTMtKO;_Z)R
zRSOJrd5TFO0aO%#%-sNa{`SiQ^$$1^@oUd&^lB{MKZ>vIg?@fVFSmMpaw0qtTRbx}
z({Pg?#{2`sc9)M5N$*N|4;^t$+Q<Wh^yJ?FzJ*$wiB{j;CMy+?m1MbsJo0ubR==f9
z>P?#mo<zt1E6=Ilm*nbmYmpxfAj7*m*Wh@=7|;!%(-pviW`nuCktLveet9^$*y^>v
zGVC@I*lBVrOU-%2y!7%)fAKjpEFsgQc4{amtiHb95KQEwvf<(3T<9-Zm$xIew#P22
zc2Ix|App^>v6(3L_MCU0d3W##AB<Fxl*nmny6_RsCu%1a)so}}uCXSzwY70Y@uTxK
z=5nu(N*2HDbrIcL)t_*{*84mvmV_Y9<vxHJJ;0gUdNjyW)jDb|@|j*qR8;gsMa5Ir
z02fHv=%xyN9VLv_ZLL4y|F*p$S^^T47m{aok5{rmM{$s7^Xu2!_fo1$IG6kkG_Tgx
zFfjPm3;g>0M~3D00EWoKZqsJYT(#@w$Y_H7G22M~ApVFTRHMI_3be)Lkn#0F*V8Pq
zc}`Cjy$bE;FJ6H7p=0y#R>`}-m4(0F>%@P|?7fx{=R^uFdISRnZ2W_xQhD{YuR3t<
z{6yxu=4~JkeA;|(J6_nv#>Nvs&FuLA&PW^he@t(UwFFE8)|a!R{`E`K`i^ZnyE4$k
z;(749Ix|oi$c3QbEJ3b~D_kQsPz~fIUKym($a_7dJ?o+40*OLl^{=&oq$<#Q(yyrp
z{J-FAniyAw9tPbe&IhQ|a`DqFTVQGQ&Gq3!C2==4x{6EJwiPZ8zub-iXoUtkJiG{}
zPaR&}_fn8_z~(=;5lD-aPWD3z8PZS@AaUiomF!G8I}Mf>e~0g#BelA-5#`cj;O5>N
Xviia!U7SGha1wx#SCgwmn*{w2TRX*I

diff --git a/document_ocr/tests/__init__.py b/document_ocr/tests/__init__.py
deleted file mode 100644
index 7efb2857..00000000
--- a/document_ocr/tests/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# -*- coding: utf-8 -*-
-# © 2016 Therp BV <http://therp.nl>
-# © 2017 ThinkOpen Solutions <https://tkobr.com>
-# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
-from . import test_document_ocr
diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py
deleted file mode 100644
index 1d1a5490..00000000
--- a/document_ocr/tests/test_document_ocr.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# -*- coding: utf-8 -*-
-# © 2016 Therp BV <http://therp.nl>
-# © 2017 ThinkOpen Solutions <https://tkobr.com>
-# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
-from StringIO import StringIO
-
-from PIL import Image, ImageDraw, ImageFont
-from PIL import PdfImagePlugin, PalmImagePlugin # noqa # pylint: disable=unused-import
-from odoo.tests.common import TransactionCase
-
-from ..models.ir_attachment import _MARKER_PHRASE
-
-
-class TestDocumentOcr(TransactionCase):
-    def test_document_ocr(self):
-        self.env['ir.config_parameter'].set_param(
-            'document_ocr.synchronous', 'True')
-        test_image = Image.new('RGB', (200, 30))
-        draw = ImageDraw.Draw(test_image)
-        draw.text((3, 3), "Hello world", font=ImageFont.truetype(
-            '/usr/share/fonts/truetype/inconsolata/Inconsolata.otf', 24))
-        # test a plain image
-        data = StringIO()
-        test_image.save(data, 'png')
-        attachment = self.env['ir.attachment'].create({
-            'name': 'testattachment',
-            'datas_fname': 'test_png.pdf'})
-        result = attachment._index(
-            data.getvalue(), 'test.png', None)
-        self.assertEqual(result.strip(), 'Hello world')
-        # should also work for pdfs
-        data = StringIO()
-        test_image.save(data, 'pdf', resolution=300)
-        attachment = self.env['ir.attachment'].create({
-            'name': 'testattachment',
-            'datas_fname': 'test_pdf.pdf'})
-        result = attachment._index(
-            data.getvalue(), 'test.pdf', None)
-        self.assertEqual(result.strip(), 'Hello world')
-        # check cron
-        self.env['ir.config_parameter'].set_param(
-            'document_ocr.synchronous', 'False')
-        attachment = self.env['ir.attachment'].create({
-            'name': 'testattachment',
-            'datas_fname': 'test_cron.pdf',
-            'datas': data.getvalue().encode('base64'),
-        })
-        self.assertEqual(attachment.index_content, _MARKER_PHRASE)
-        attachment._ocr_cron()
-        self.assertEqual(attachment.index_content.strip(), 'Hello world')
-        # and for an unreadable image, we expect an empty string
-        self.env['ir.config_parameter'].set_param(
-            'document_ocr.synchronous', 'True')
-        data = StringIO()
-        test_image = Image.new('1', (200, 30))
-        test_image.save(data, 'palm')
-        attachment = self.env['ir.attachment'].create({
-            'name': 'testattachment',
-            'datas_fname': 'test_err.palm'})
-        result = attachment._index(
-            data.getvalue(), 'test.palm', None)
-        self.assertEqual(result, '')
diff --git a/document_ocr/views/ir_attachment_view.xml b/document_ocr/views/ir_attachment_view.xml
deleted file mode 100644
index ed171d61..00000000
--- a/document_ocr/views/ir_attachment_view.xml
+++ /dev/null
@@ -1,43 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<odoo>
-    <!-- Attachment -->
-    <record id="view_attachment_form" model="ir.ui.view">
-        <field name="model">ir.attachment</field>
-        <field name="inherit_id" ref="base.view_attachment_form"/>
-        <field name="arch" type="xml">
-            <xpath expr="(//sheet/group/group)[last()]" position="attributes">
-                <attribute name="invisible">1</attribute>
-            </xpath>
-            <xpath expr="(//sheet/group/group)[last()]" position="before">
-                <group groups="base.group_no_one" string="Indexed Content" colspan="4">
-                <field name="index_content_rel" readonly="1" nolabel="1"/>
-                </group>
-            </xpath>
-            <field name="mimetype" position="after">
-                <field name="store_fname" invisible="1"/>
-                <field name="language"/>
-            </field>
-        </field>
-    </record>
-    <record id="view_attachment_tree" model="ir.ui.view">
-        <field name="model">ir.attachment</field>
-        <field name="inherit_id" ref="base.view_attachment_tree"/>
-        <field name="arch" type="xml">
-            <field name="type" position="after">
-                <field name="language"/>
-            </field>
-        </field>
-    </record>
-    <record id="view_attachment_search" model="ir.ui.view">
-        <field name="model">ir.attachment</field>
-        <field name="inherit_id" ref="base.view_attachment_search"/>
-        <field name="arch" type="xml">
-            <field name="name" position="after">
-                <field name="language"/>
-            </field>
-            <filter name="owner" position="after">
-                <filter string="Language" domain="[]" context="{'group_by':'language'}" groups="base.group_no_one"/>
-            </filter>
-        </field>
-    </record>
-</odoo>