[MIG] Migration of document_ocr module to 10.0

2025-07-26 18:38:41 -06:00 · 2017-06-01 20:03:58 +01:00 · 2017-06-01 20:03:58 +01:00 · a58c40621c
commit a58c40621c
parent 1705cefe6b
9 changed files with 293 additions and 50 deletions
--- a/document_ocr/README.rst
+++ b/document_ocr/README.rst
@ -39,6 +39,21 @@ This is because the recognition process takes a while and you don't want to make
 The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ``Settings``.
 In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``.
 By default, the recognition language is set to English.
 In case you want to use a different default, set configuration parameter ``document_ocr.language`` to the respective value, e.g. ``por`` for Portuguese.
 For PDFs, OCR runs after the document has been converted to an image. Note that OCR will be applied to all PDFs.
 System parameters used:
 #``document_ocr.synchronous``:  bool
 #``document_ocr.language``:  string
 #``document_ocr.dpi``:  integer
 #``document_ocr.quality``:  integer
 .. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas
    :alt: Try me on Runbot
    :target: https://runbot.odoo-community.org/runbot/118/10.0
--- a/document_ocr/init.py
+++ b/document_ocr/init.py
@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
 # © 2016 Therp BV <http://therp.nl>
 # © 2017 ThinkOpen Solutions <https://tkobr.com>
 # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
 from . import models
--- a/document_ocr/manifest.py
+++ b/document_ocr/manifest.py
@ -2,9 +2,9 @@
 # © 2016 Therp BV <http://therp.nl>
 # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
 {
-    "name": "OCR for documents",
+    "name": "OCR for Documents",
    "version": "10.0.1.0.0",
-    "author": "Therp BV,Odoo Community Association (OCA), TKO Brasil",
+    "author": "Therp BV, Odoo Community Association (OCA), ThinkOpen Solutions Brasil",
    "license": "AGPL-3",
    "category": "Knowledge Management",
    "summary": "Run character recognition on uploaded files",
@ -14,10 +14,12 @@
    "data": [
        "data/ir_cron.xml",
        "data/ir_config_parameter.xml",
        "views/ir_attachment_view.xml",
    ],
    "external_dependencies": {
        'bin': [
            'tesseract',
            'convert',
        ],
    },
 }
--- a/document_ocr/data/ir_config_parameter.xml
+++ b/document_ocr/data/ir_config_parameter.xml
@ -9,5 +9,13 @@
            <field name="key">document_ocr.dpi</field>
            <field name="value">300</field>
        </record>
        <record id="param_quality" model="ir.config_parameter">
            <field name="key">document_ocr.quality</field>
            <field name="value">100</field>
        </record>
        <record id="param_language" model="ir.config_parameter">
            <field name="key">document_ocr.language</field>
            <field name="value">eng</field>
        </record>
    </data>
 </openerp>
--- a/document_ocr/models/init.py
+++ b/document_ocr/models/init.py
@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
 # © 2016 Therp BV <http://therp.nl>
 # © 2017 ThinkOpen Solutions <https://tkobr.com>
 # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
 from . import ir_attachment
--- a/document_ocr/models/ir_attachment.py
+++ b/document_ocr/models/ir_attachment.py
@ -1,85 +1,256 @@
 # -*- coding: utf-8 -*-
 # © 2016 Therp BV <http://therp.nl>
 # © 2017 ThinkOpen Solutions <https://tkobr.com>
 # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
 import io
 import logging
 import subprocess
 from StringIO import StringIO
-from PIL import Image
+import pyPdf
-from openerp import api, models
+from odoo import api, fields, models
 from odoo.exceptions import UserError
 _logger = logging.getLogger(__name__)
 _MARKER_PHRASE = '[[waiting for OCR]]'
 # (code, label) pairs offered by the ``language`` Selection field on
 # ir.attachment.  The code is the value passed to ``tesseract -l`` and
 # compared against the output of ``tesseract --list-langs``, so it has to
 # match tesseract's language identifier exactly; the label is only shown
 # to the user.
 OCR_LANGUAGE = [('afr', 'Afrikaans'),
                ('amh', 'Amharic'),
                ('ara', 'Arabic'),
                ('asm', 'Assamese'),
                ('aze', 'Azerbaijani'),
                ('aze_cyrl', 'Azerbaijani - Cyrilic'),
                ('bel', 'Belarusian'),
                ('ben', 'Bengali'),
                ('bod', 'Tibetan'),
                ('bos', 'Bosnian'),
                ('bul', 'Bulgarian'),
                ('cat', 'Catalan; Valencian'),
                ('ceb', 'Cebuano'),
                ('ces', 'Czech'),
                ('chi_sim', 'Chinese - Simplified'),
                ('chi_tra', 'Chinese - Traditional'),
                ('chr', 'Cherokee'),
                ('cym', 'Welsh'),
                ('dan', 'Danish'),
                ('dan_frak', 'Danish - Fraktur'),
                ('deu', 'German'),
                ('deu_frak', 'German - Fraktur'),
                ('dzo', 'Dzongkha'),
                ('ell', 'Greek, Modern (1453-)'),
                ('eng', 'English'),
                ('enm', 'English, Middle (1100-1500)'),
                ('epo', 'Esperanto'),
                ('equ', 'Math / equation detection module'),
                ('est', 'Estonian'),
                ('eus', 'Basque'),
                ('fas', 'Persian'),
                ('fin', 'Finnish'),
                ('fra', 'French'),
                ('frk', 'Frankish'),
                ('frm', 'French, Middle (ca.1400-1600)'),
                ('gle', 'Irish'),
                ('glg', 'Galician'),
                ('grc', 'Greek, Ancient (to 1453)'),
                ('guj', 'Gujarati'),
                ('hat', 'Haitian; Haitian Creole'),
                ('heb', 'Hebrew'),
                ('hin', 'Hindi'),
                ('hrv', 'Croatian'),
                ('hun', 'Hungarian'),
                ('iku', 'Inuktitut'),
                ('ind', 'Indonesian'),
                ('isl', 'Icelandic'),
                ('ita', 'Italian'),
                ('ita_old', 'Italian - Old'),
                ('jav', 'Javanese'),
                ('jpn', 'Japanese'),
                ('kan', 'Kannada'),
                ('kat', 'Georgian'),
                ('kat_old', 'Georgian - Old'),
                ('kaz', 'Kazakh'),
                ('khm', 'Central Khmer'),
                ('kir', 'Kirghiz; Kyrgyz'),
                ('kor', 'Korean'),
                ('kur', 'Kurdish'),
                ('lao', 'Lao'),
                ('lat', 'Latin'),
                ('lav', 'Latvian'),
                ('lit', 'Lithuanian'),
                ('mal', 'Malayalam'),
                ('mar', 'Marathi'),
                ('mkd', 'Macedonian'),
                ('mlt', 'Maltese'),
                ('msa', 'Malay'),
                ('mya', 'Burmese'),
                ('nep', 'Nepali'),
                ('nld', 'Dutch; Flemish'),
                ('nor', 'Norwegian'),
                ('ori', 'Oriya'),
                ('osd', 'Orientation and script detection module'),
                ('pan', 'Panjabi; Punjabi'),
                ('pol', 'Polish'),
                ('por', 'Portuguese'),
                ('pus', 'Pushto; Pashto'),
                ('ron', 'Romanian; Moldavian; Moldovan'),
                ('rus', 'Russian'),
                ('san', 'Sanskrit'),
                ('sin', 'Sinhala; Sinhalese'),
                ('slk', 'Slovak'),
                ('slk_frak', 'Slovak - Fraktur'),
                ('slv', 'Slovenian'),
                ('spa', 'Spanish; Castilian'),
                ('spa_old', 'Spanish; Castilian - Old'),
                ('sqi', 'Albanian'),
                ('srp', 'Serbian'),
                ('srp_latn', 'Serbian - Latin'),
                ('swa', 'Swahili'),
                ('swe', 'Swedish'),
                ('syr', 'Syriac'),
                ('tam', 'Tamil'),
                ('tel', 'Telugu'),
                ('tgk', 'Tajik'),
                ('tgl', 'Tagalog'),
                ('tha', 'Thai'),
                ('tir', 'Tigrinya'),
                ('tur', 'Turkish'),
                ('uig', 'Uighur; Uyghur'),
                ('ukr', 'Ukrainian'),
                ('urd', 'Urdu'),
                ('uzb', 'Uzbek'),
                ('uzb_cyrl', 'Uzbek - Cyrilic'),
                ('vie', 'Vietnamese'),
                ('yid', 'Yiddish'), ]
 class IrAttachment(models.Model):
    _inherit = 'ir.attachment'
    language = fields.Selection(OCR_LANGUAGE, 'Language')
    # We need to redefine index_content field to be able to update it
    # on the onchange_language()
    index_content = fields.Text('Indexed Content', readonly=False, prefetch=False)
    index_content_rel = fields.Text(related='index_content', string='Indexed Content Rel')
    @api.onchange('language')
    def onchange_language(self):
        """Re-index the attachment when its OCR language is changed.

        Verifies that the selected language is listed by the local
        ``tesseract`` binary and then re-runs ``_index`` on the stored file
        so that ``index_content`` reflects the new language.

        :returns: an onchange dict ``{'value': {'index_content': ...}}``, or
            ``None`` when no language is selected or the attachment has no
            stored data to index.
        :raises UserError: if tesseract does not list the selected language.
        """
        if not self.language:
            # The onchange also fires with an empty value (new record or
            # cleared field); there is nothing to validate or re-index then.
            return
        process = subprocess.Popen(['tesseract', '--list-langs'],
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()
        # Depending on the tesseract version the language list is printed
        # on stderr or on stdout, so both streams are inspected.
        installed = set()
        for output in (stdout, stderr):
            for line in (output or '').splitlines():
                installed.add(line.strip())
        if self.language not in installed:
            raise UserError(
                "Language not installed."
                " Please ask your system administrator to"
                " install tesseract '%s' language." %
                self.language)
        if self.store_fname:
            bin_data = self._file_read(self.store_fname)
        else:
            bin_data = self.db_datas
        if not bin_data:
            # No file uploaded yet: nothing to run OCR on.
            return
        index_content = self._index(
            bin_data.decode('base64'), self.datas_fname, self.mimetype)
        return {'value': {
            'index_content': index_content}}
    @api.model
-    def _index(self, data, datas_fname, file_type):
+    def _index(self, bin_data, datas_fname, mimetype):
-        mimetype, content = super(IrAttachment, self)._index(
+        if not self.language:
-            data, datas_fname, file_type)
+            # Set default language
            self.language = self.env['ir.config_parameter'].get_param(
                'document_ocr.language', 'eng')
        content = super(IrAttachment, self)._index(
            bin_data, datas_fname, mimetype)
        if not content or content == 'image':
            has_synchr_param = self.env['ir.config_parameter'].get_param(
                'document_ocr.synchronous', 'False') == 'True'
            has_force_flag = self.env.context.get('document_ocr_force')
-            if has_synchr_param or has_force_flag:
+            synchr = has_synchr_param or has_force_flag
-                content = self._index_ocr(mimetype, data, datas_fname,
+            if synchr:
-                                          file_type)
+                content = self._index_ocr(bin_data)
            else:
                content = _MARKER_PHRASE
        return content
-        return mimetype, content
+    def _index_ocr(self, bin_data):
    @api.model
    def _index_ocr(self, mimetype, data, datas_fname, file_type):
        dpi = int(
            self.env['ir.config_parameter'].get_param(
                'document_ocr.dpi', '500'))
        top_type, sub_type = mimetype.split('/', 1)
        if hasattr(self, '_index_ocr_get_data_%s' % sub_type):
            image_data = getattr(self, '_index_ocr_get_data_%s' % sub_type)(
                data, datas_fname, file_type, dpi)
        else:
            image_data = StringIO()
            try:
                Image.open(StringIO(data)).save(image_data, 'tiff',
                                                dpi=(dpi, dpi))
            except IOError:
                _logger.exception('Failed to OCR image')
                return None
        process = subprocess.Popen(
-            ['tesseract', 'stdin', 'stdout'],
+            ['tesseract', 'stdin', 'stdout', '-l', self.language],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
-        stdout, stderr = process.communicate(image_data.getvalue())
+        stdout, stderr = process.communicate(bin_data)
        if stderr:
            _logger.error('Error during OCR: %s', stderr)
        return stdout
-    @api.model
+    def _index_pdf(self, bin_data):
-    def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi):
+
-        process = subprocess.Popen(
+        def convert_bin_to_image(self, bin_data):
-            ['convert', '-density', str(dpi), '-', '-append', 'png32:-'],
+            dpi = int(self.env['ir.config_parameter'].get_param(
-            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
+                'document_ocr.dpi', '500'))
-            stderr=subprocess.PIPE,
+            quality = int(self.env['ir.config_parameter'].get_param(
-        )
+                'document_ocr.quality', '100'))
-        stdout, stderr = process.communicate(data)
+            process = subprocess.Popen(
-        if stderr:
+                ['convert', '-density', str(dpi),
-            _logger.error('Error converting to PDF: %s', stderr)
+                 '-quality', str(quality),
-        return StringIO(stdout)
+                 '-', '-append', 'png32:-'],
                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
            stdout, stderr = process.communicate(bin_data)
            if stderr:
                _logger.error('Error converting PDF to image: %s', stderr)
            return stdout
        def _convert_pdf_page_to_image(self, pdf, pagenum):
            dst_pdf = pyPdf.PdfFileWriter()
            dst_pdf.addPage(pdf.getPage(pagenum))
            pdf_bytes = io.BytesIO()
            dst_pdf.write(pdf_bytes)
            pdf_bytes.seek(0)
            return convert_bin_to_image(self, pdf_bytes.read())
        has_synchr_param = self.env['ir.config_parameter'].get_param(
            'document_ocr.synchronous', 'False') == 'True'
        has_force_flag = self.env.context.get('document_ocr_force')
        synchr = has_synchr_param or has_force_flag
        if synchr:
            buf = super(IrAttachment, self)._index_pdf(bin_data)
            if len(buf.split('\n')) < 2 and bin_data.startswith('%PDF-'):
                # If we got less than 2 lines, run OCR and append to existent text
                try:
                    f = StringIO(bin_data)
                    pdf = pyPdf.PdfFileReader(f)
                    if pdf.getNumPages() > 1:
                        for pagenum in range(0, pdf.getNumPages()):
                            _logger.info('OCR PDF "%s" page %d/%d...',
                                         self.datas_fname,
                                         pagenum + 1,
                                         pdf.getNumPages())
                            pdf_image = _convert_pdf_page_to_image(self, pdf,
                                                                   pagenum)
                            index_content = self._index_ocr(pdf_image)
                            buf = u'%s\n-- %d --\n%s' % (
                                buf, pagenum + 1, index_content.decode('utf8'))
                    else:
                        _logger.info('OCR PDF "%s"...', self.datas_fname)
                        pdf_image = convert_bin_to_image(self, bin_data)
                        index_content = self._index_ocr(pdf_image)
                        buf = u'%s\n%s' % (buf, index_content.decode('utf8'))
                except Exception as e:
                    _logger.error('Error converting PDF to image: %s', e)
                    pass
        else:
            buf = _MARKER_PHRASE
        return buf
    @api.model
    def _ocr_cron(self):
-        for this in self.with_context(document_ocr_force=True).search([
+        for this in self.with_context(document_ocr_force=True).search(
-            ('index_content', '=', _MARKER_PHRASE),
+                [('index_content', '=', _MARKER_PHRASE)]):
        ]):
            if not this.datas:
                continue
-            file_type, index_content = this._index(
+            index_content = this._index(
-                this.datas.decode('base64'), this.datas_fname, this.file_type)
+                this.datas.decode('base64'), this.datas_fname, this.mimetype)
            this.write({
                'file_type': file_type,
                'index_content': index_content,
            })
--- a/document_ocr/tests/init.py
+++ b/document_ocr/tests/init.py
@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
 # © 2016 Therp BV <http://therp.nl>
 # © 2017 ThinkOpen Solutions <https://tkobr.com>
 # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
 from . import test_document_ocr
--- a/document_ocr/tests/test_document_ocr.py
+++ b/document_ocr/tests/test_document_ocr.py
@ -1,11 +1,12 @@
 # -*- coding: utf-8 -*-
 # © 2016 Therp BV <http://therp.nl>
 # © 2017 ThinkOpen Solutions <https://tkobr.com>
 # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
 from StringIO import StringIO
 from PIL import Image, ImageDraw, ImageFont
-from openerp.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE
+from odoo.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE
-from openerp.tests.common import TransactionCase
+from odoo.tests.common import TransactionCase
 class TestDocumentOcr(TransactionCase):
--- a/document_ocr/views/ir_attachment_view.xml
+++ b/document_ocr/views/ir_attachment_view.xml
@ -0,0 +1,43 @@
 <?xml version="1.0" encoding="utf-8"?>
 <odoo>
    <!-- Attachment -->
    <record id="view_attachment_form" model="ir.ui.view">
        <field name="model">ir.attachment</field>
        <field name="inherit_id" ref="base.view_attachment_form"/>
        <field name="arch" type="xml">
            <xpath expr="(//sheet/group/group)[last()]" position="attributes">
                <attribute name="invisible">1</attribute>
            </xpath>
            <xpath expr="(//sheet/group/group)[last()]" position="before">
                <group groups="base.group_no_one" string="Indexed Content" colspan="4">
                <field name="index_content_rel" readonly="1" nolabel="1"/>
                </group>
            </xpath>
            <field name="mimetype" position="after">
                <field name="store_fname" invisible="1"/>
                <field name="language"/>
            </field>
        </field>
    </record>
    <record id="view_attachment_tree" model="ir.ui.view">
        <field name="model">ir.attachment</field>
        <field name="inherit_id" ref="base.view_attachment_tree"/>
        <field name="arch" type="xml">
            <field name="type" position="after">
                <field name="language"/>
            </field>
        </field>
    </record>
    <record id="view_attachment_search" model="ir.ui.view">
        <field name="model">ir.attachment</field>
        <field name="inherit_id" ref="base.view_attachment_search"/>
        <field name="arch" type="xml">
            <field name="name" position="after">
                <field name="language"/>
            </field>
            <filter name="owner" position="after">
                <filter string="Language" domain="[]" context="{'group_by':'language'}" groups="base.group_no_one"/>
            </filter>
        </field>
    </record>
 </odoo>