[MIG] Migration of document_ocr module to 10.0

This commit is contained in:
Carlos Almeida 2017-06-01 20:03:58 +01:00
parent 1705cefe6b
commit a58c40621c
9 changed files with 293 additions and 50 deletions

View File

@ -39,6 +39,21 @@ This is because the recognition process takes a while and you don't want to make
The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ` `Settings``. The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ` `Settings``.
In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``. In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``.
By default, the recognition language is set to English.
To use a different default, set the configuration parameter ``document_ocr.language`` to the corresponding language code, e.g. ``por`` for Portuguese.
For PDF files, OCR runs after the document has been converted to an image. Note that OCR is applied to all PDFs.
System parameters used:
#``document_ocr.synchronous``: bool
#``document_ocr.language``: string
#``document_ocr.dpi``: integer
#``document_ocr.quality``: integer
.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas .. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas
:alt: Try me on Runbot :alt: Try me on Runbot
:target: https://runbot.odoo-community.org/runbot/118/10.0 :target: https://runbot.odoo-community.org/runbot/118/10.0

View File

@ -1,4 +1,5 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl> # © 2016 Therp BV <http://therp.nl>
# © 2017 ThinkOpen Solutions <https://tkobr.com>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import models from . import models

View File

@ -2,9 +2,9 @@
# © 2016 Therp BV <http://therp.nl> # © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
{ {
"name": "OCR for documents", "name": "OCR for Documents",
"version": "10.0.1.0.0", "version": "10.0.1.0.0",
"author": "Therp BV,Odoo Community Association (OCA), TKO Brasil", "author": "Therp BV, Odoo Community Association (OCA), ThinkOpen Solutions Brasil",
"license": "AGPL-3", "license": "AGPL-3",
"category": "Knowledge Management", "category": "Knowledge Management",
"summary": "Run character recognition on uploaded files", "summary": "Run character recognition on uploaded files",
@ -14,10 +14,12 @@
"data": [ "data": [
"data/ir_cron.xml", "data/ir_cron.xml",
"data/ir_config_parameter.xml", "data/ir_config_parameter.xml",
"views/ir_attachment_view.xml",
], ],
"external_dependencies": { "external_dependencies": {
'bin': [ 'bin': [
'tesseract', 'tesseract',
'convert',
], ],
}, },
} }

View File

@ -9,5 +9,13 @@
<field name="key">document_ocr.dpi</field> <field name="key">document_ocr.dpi</field>
<field name="value">300</field> <field name="value">300</field>
</record> </record>
<!-- Default for the "document_ocr.quality" system parameter; the model reads
     it as an integer and passes it to the "convert" command as "-quality". -->
<record id="param_quality" model="ir.config_parameter">
<field name="key">document_ocr.quality</field>
<field name="value">100</field>
</record>
<!-- Default for the "document_ocr.language" system parameter; used as the
     OCR language when an attachment has no language set. -->
<record id="param_language" model="ir.config_parameter">
<field name="key">document_ocr.language</field>
<field name="value">eng</field>
</record>
</data> </data>
</openerp> </openerp>

View File

@ -1,4 +1,5 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl> # © 2016 Therp BV <http://therp.nl>
# © 2017 ThinkOpen Solutions <https://tkobr.com>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import ir_attachment from . import ir_attachment

View File

@ -1,85 +1,256 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl> # © 2016 Therp BV <http://therp.nl>
# © 2017 ThinkOpen Solutions <https://tkobr.com>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
import io
import logging import logging
import subprocess import subprocess
from StringIO import StringIO from StringIO import StringIO
from PIL import Image import pyPdf
from openerp import api, models from odoo import api, fields, models
from odoo.exceptions import UserError
_logger = logging.getLogger(__name__) _logger = logging.getLogger(__name__)
_MARKER_PHRASE = '[[waiting for OCR]]' _MARKER_PHRASE = '[[waiting for OCR]]'
# Selection values for the attachment ``language`` field.
# Each entry is (language code passed to ``tesseract -l``, human-readable label).
OCR_LANGUAGE = [
    ('afr', 'Afrikaans'),
    ('amh', 'Amharic'),
    ('ara', 'Arabic'),
    ('asm', 'Assamese'),
    ('aze', 'Azerbaijani'),
    ('aze_cyrl', 'Azerbaijani - Cyrillic'),
    ('bel', 'Belarusian'),
    ('ben', 'Bengali'),
    ('bod', 'Tibetan'),
    ('bos', 'Bosnian'),
    ('bul', 'Bulgarian'),
    ('cat', 'Catalan; Valencian'),
    ('ceb', 'Cebuano'),
    ('ces', 'Czech'),
    ('chi_sim', 'Chinese - Simplified'),
    ('chi_tra', 'Chinese - Traditional'),
    ('chr', 'Cherokee'),
    ('cym', 'Welsh'),
    ('dan', 'Danish'),
    ('dan_frak', 'Danish - Fraktur'),
    ('deu', 'German'),
    ('deu_frak', 'German - Fraktur'),
    ('dzo', 'Dzongkha'),
    ('ell', 'Greek, Modern (1453-)'),
    ('eng', 'English'),
    ('enm', 'English, Middle (1100-1500)'),
    ('epo', 'Esperanto'),
    ('equ', 'Math / equation detection module'),
    ('est', 'Estonian'),
    ('eus', 'Basque'),
    ('fas', 'Persian'),
    ('fin', 'Finnish'),
    ('fra', 'French'),
    ('frk', 'Frankish'),
    ('frm', 'French, Middle (ca.1400-1600)'),
    ('gle', 'Irish'),
    ('glg', 'Galician'),
    ('grc', 'Greek, Ancient (to 1453)'),
    ('guj', 'Gujarati'),
    ('hat', 'Haitian; Haitian Creole'),
    ('heb', 'Hebrew'),
    ('hin', 'Hindi'),
    ('hrv', 'Croatian'),
    ('hun', 'Hungarian'),
    ('iku', 'Inuktitut'),
    ('ind', 'Indonesian'),
    ('isl', 'Icelandic'),
    ('ita', 'Italian'),
    ('ita_old', 'Italian - Old'),
    ('jav', 'Javanese'),
    ('jpn', 'Japanese'),
    ('kan', 'Kannada'),
    ('kat', 'Georgian'),
    ('kat_old', 'Georgian - Old'),
    ('kaz', 'Kazakh'),
    ('khm', 'Central Khmer'),
    ('kir', 'Kirghiz; Kyrgyz'),
    ('kor', 'Korean'),
    ('kur', 'Kurdish'),
    ('lao', 'Lao'),
    ('lat', 'Latin'),
    ('lav', 'Latvian'),
    ('lit', 'Lithuanian'),
    ('mal', 'Malayalam'),
    ('mar', 'Marathi'),
    ('mkd', 'Macedonian'),
    ('mlt', 'Maltese'),
    ('msa', 'Malay'),
    ('mya', 'Burmese'),
    ('nep', 'Nepali'),
    ('nld', 'Dutch; Flemish'),
    ('nor', 'Norwegian'),
    ('ori', 'Oriya'),
    ('osd', 'Orientation and script detection module'),
    ('pan', 'Panjabi; Punjabi'),
    ('pol', 'Polish'),
    ('por', 'Portuguese'),
    ('pus', 'Pushto; Pashto'),
    ('ron', 'Romanian; Moldavian; Moldovan'),
    ('rus', 'Russian'),
    ('san', 'Sanskrit'),
    ('sin', 'Sinhala; Sinhalese'),
    ('slk', 'Slovak'),
    ('slk_frak', 'Slovak - Fraktur'),
    ('slv', 'Slovenian'),
    ('spa', 'Spanish; Castilian'),
    ('spa_old', 'Spanish; Castilian - Old'),
    ('sqi', 'Albanian'),
    ('srp', 'Serbian'),
    ('srp_latn', 'Serbian - Latin'),
    ('swa', 'Swahili'),
    ('swe', 'Swedish'),
    ('syr', 'Syriac'),
    ('tam', 'Tamil'),
    ('tel', 'Telugu'),
    ('tgk', 'Tajik'),
    ('tgl', 'Tagalog'),
    ('tha', 'Thai'),
    ('tir', 'Tigrinya'),
    ('tur', 'Turkish'),
    ('uig', 'Uighur; Uyghur'),
    ('ukr', 'Ukrainian'),
    ('urd', 'Urdu'),
    ('uzb', 'Uzbek'),
    ('uzb_cyrl', 'Uzbek - Cyrillic'),
    ('vie', 'Vietnamese'),
    ('yid', 'Yiddish'),
]
class IrAttachment(models.Model): class IrAttachment(models.Model):
_inherit = 'ir.attachment' _inherit = 'ir.attachment'
# OCR language of this attachment; the selected code is handed to
# tesseract through its ``-l`` option when OCR runs.
language = fields.Selection(OCR_LANGUAGE, 'Language')
# index_content is redefined here as writable (readonly=False) so that
# onchange_language() is able to push the re-indexed text into it.
index_content = fields.Text('Indexed Content', readonly=False, prefetch=False)
# Related copy of index_content; this is the field the attachment form
# view displays (read-only).
index_content_rel = fields.Text(related='index_content', string='Indexed Content Rel')
@api.onchange('language')
def onchange_language(self):
    """Re-index the attachment when its OCR language is changed.

    Checks that tesseract reports the selected language as installed and
    then runs ``_index`` again on the stored file content.

    :returns: an onchange dict carrying the new ``index_content`` value,
        or ``None`` when no language is selected or no data is stored.
    :raises UserError: if the selected language is not reported by
        ``tesseract --list-langs``.
    """
    if not self.language:
        # Language was cleared: there is nothing to validate or re-index,
        # and we must not complain about a language named 'False'.
        return
    process = subprocess.Popen(['tesseract', '--list-langs'],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    # Some tesseract versions print the language list on stdout instead
    # of stderr, so collect the lines of both streams.
    installed = set()
    for stream in (stdout, stderr):
        installed.update(line.strip() for line in stream.splitlines())
    if self.language not in installed:
        raise UserError(
            "Language not installed."
            " Please ask your system administrator to"
            " install tesseract '%s' language." %
            self.language)
    if self.store_fname:
        bin_data = self._file_read(self.store_fname)
    else:
        bin_data = self.db_datas
    if not bin_data:
        # No file content available (yet): nothing to index.
        return
    index_content = self._index(
        bin_data.decode('base64'), self.datas_fname, self.mimetype)
    return {'value': {
        'index_content': index_content}}
@api.model @api.model
def _index(self, data, datas_fname, file_type): def _index(self, bin_data, datas_fname, mimetype):
mimetype, content = super(IrAttachment, self)._index( if not self.language:
data, datas_fname, file_type) # Set default language
self.language = self.env['ir.config_parameter'].get_param(
'document_ocr.language', 'eng')
content = super(IrAttachment, self)._index(
bin_data, datas_fname, mimetype)
if not content or content == 'image': if not content or content == 'image':
has_synchr_param = self.env['ir.config_parameter'].get_param( has_synchr_param = self.env['ir.config_parameter'].get_param(
'document_ocr.synchronous', 'False') == 'True' 'document_ocr.synchronous', 'False') == 'True'
has_force_flag = self.env.context.get('document_ocr_force') has_force_flag = self.env.context.get('document_ocr_force')
if has_synchr_param or has_force_flag: synchr = has_synchr_param or has_force_flag
content = self._index_ocr(mimetype, data, datas_fname, if synchr:
file_type) content = self._index_ocr(bin_data)
else: else:
content = _MARKER_PHRASE content = _MARKER_PHRASE
return content
return mimetype, content def _index_ocr(self, bin_data):
@api.model
def _index_ocr(self, mimetype, data, datas_fname, file_type):
dpi = int(
self.env['ir.config_parameter'].get_param(
'document_ocr.dpi', '500'))
top_type, sub_type = mimetype.split('/', 1)
if hasattr(self, '_index_ocr_get_data_%s' % sub_type):
image_data = getattr(self, '_index_ocr_get_data_%s' % sub_type)(
data, datas_fname, file_type, dpi)
else:
image_data = StringIO()
try:
Image.open(StringIO(data)).save(image_data, 'tiff',
dpi=(dpi, dpi))
except IOError:
_logger.exception('Failed to OCR image')
return None
process = subprocess.Popen( process = subprocess.Popen(
['tesseract', 'stdin', 'stdout'], ['tesseract', 'stdin', 'stdout', '-l', self.language],
stdin=subprocess.PIPE, stdout=subprocess.PIPE, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
) )
stdout, stderr = process.communicate(image_data.getvalue()) stdout, stderr = process.communicate(bin_data)
if stderr: if stderr:
_logger.error('Error during OCR: %s', stderr) _logger.error('Error during OCR: %s', stderr)
return stdout return stdout
@api.model def _index_pdf(self, bin_data):
def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi):
process = subprocess.Popen( def convert_bin_to_image(self, bin_data):
['convert', '-density', str(dpi), '-', '-append', 'png32:-'], dpi = int(self.env['ir.config_parameter'].get_param(
stdin=subprocess.PIPE, stdout=subprocess.PIPE, 'document_ocr.dpi', '500'))
stderr=subprocess.PIPE, quality = int(self.env['ir.config_parameter'].get_param(
) 'document_ocr.quality', '100'))
stdout, stderr = process.communicate(data) process = subprocess.Popen(
if stderr: ['convert', '-density', str(dpi),
_logger.error('Error converting to PDF: %s', stderr) '-quality', str(quality),
return StringIO(stdout) '-', '-append', 'png32:-'],
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stdout, stderr = process.communicate(bin_data)
if stderr:
_logger.error('Error converting PDF to image: %s', stderr)
return stdout
def _convert_pdf_page_to_image(self, pdf, pagenum):
    """Render one page of ``pdf`` as an image.

    The page at index ``pagenum`` is copied into a new in-memory PDF,
    whose bytes are then handed to ``convert_bin_to_image``.
    """
    single_page = pyPdf.PdfFileWriter()
    page = pdf.getPage(pagenum)
    single_page.addPage(page)
    buf = io.BytesIO()
    single_page.write(buf)
    # getvalue() yields the whole buffer regardless of the stream position.
    page_bytes = buf.getvalue()
    return convert_bin_to_image(self, page_bytes)
has_synchr_param = self.env['ir.config_parameter'].get_param(
'document_ocr.synchronous', 'False') == 'True'
has_force_flag = self.env.context.get('document_ocr_force')
synchr = has_synchr_param or has_force_flag
if synchr:
buf = super(IrAttachment, self)._index_pdf(bin_data)
if len(buf.split('\n')) < 2 and bin_data.startswith('%PDF-'):
# If we got less than 2 lines, run OCR and append to existent text
try:
f = StringIO(bin_data)
pdf = pyPdf.PdfFileReader(f)
if pdf.getNumPages() > 1:
for pagenum in range(0, pdf.getNumPages()):
_logger.info('OCR PDF "%s" page %d/%d...',
self.datas_fname,
pagenum + 1,
pdf.getNumPages())
pdf_image = _convert_pdf_page_to_image(self, pdf,
pagenum)
index_content = self._index_ocr(pdf_image)
buf = u'%s\n-- %d --\n%s' % (
buf, pagenum + 1, index_content.decode('utf8'))
else:
_logger.info('OCR PDF "%s"...', self.datas_fname)
pdf_image = convert_bin_to_image(self, bin_data)
index_content = self._index_ocr(pdf_image)
buf = u'%s\n%s' % (buf, index_content.decode('utf8'))
except Exception as e:
_logger.error('Error converting PDF to image: %s', e)
pass
else:
buf = _MARKER_PHRASE
return buf
@api.model @api.model
def _ocr_cron(self): def _ocr_cron(self):
for this in self.with_context(document_ocr_force=True).search([ for this in self.with_context(document_ocr_force=True).search(
('index_content', '=', _MARKER_PHRASE), [('index_content', '=', _MARKER_PHRASE)]):
]):
if not this.datas: if not this.datas:
continue continue
file_type, index_content = this._index( index_content = this._index(
this.datas.decode('base64'), this.datas_fname, this.file_type) this.datas.decode('base64'), this.datas_fname, this.mimetype)
this.write({ this.write({
'file_type': file_type,
'index_content': index_content, 'index_content': index_content,
}) })

View File

@ -1,4 +1,5 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl> # © 2016 Therp BV <http://therp.nl>
# © 2017 ThinkOpen Solutions <https://tkobr.com>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import test_document_ocr from . import test_document_ocr

View File

@ -1,11 +1,12 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl> # © 2016 Therp BV <http://therp.nl>
# © 2017 ThinkOpen Solutions <https://tkobr.com>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from StringIO import StringIO from StringIO import StringIO
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw, ImageFont
from openerp.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE from odoo.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE
from openerp.tests.common import TransactionCase from odoo.tests.common import TransactionCase
class TestDocumentOcr(TransactionCase): class TestDocumentOcr(TransactionCase):

View File

@ -0,0 +1,43 @@
<?xml version="1.0" encoding="utf-8"?>
<odoo>
<!-- Attachment -->
<!-- Form view: extends base.view_attachment_form. -->
<record id="view_attachment_form" model="ir.ui.view">
<field name="model">ir.attachment</field>
<field name="inherit_id" ref="base.view_attachment_form"/>
<field name="arch" type="xml">
<!-- Hide the last group of the sheet. -->
<xpath expr="(//sheet/group/group)[last()]" position="attributes">
<attribute name="invisible">1</attribute>
</xpath>
<!-- Insert, before that hidden group, an "Indexed Content" group that is
     restricted to base.group_no_one and shows index_content_rel read-only. -->
<xpath expr="(//sheet/group/group)[last()]" position="before">
<group groups="base.group_no_one" string="Indexed Content" colspan="4">
<field name="index_content_rel" readonly="1" nolabel="1"/>
</group>
</xpath>
<!-- Add the OCR language after the mimetype; store_fname is included as an
     invisible field (it is read by onchange_language). -->
<field name="mimetype" position="after">
<field name="store_fname" invisible="1"/>
<field name="language"/>
</field>
</field>
</record>
<!-- Tree view: adds a language column after the type column. -->
<record id="view_attachment_tree" model="ir.ui.view">
<field name="model">ir.attachment</field>
<field name="inherit_id" ref="base.view_attachment_tree"/>
<field name="arch" type="xml">
<field name="type" position="after">
<field name="language"/>
</field>
</field>
</record>
<!-- Search view: makes language searchable and adds a group-by on it. -->
<record id="view_attachment_search" model="ir.ui.view">
<field name="model">ir.attachment</field>
<field name="inherit_id" ref="base.view_attachment_search"/>
<field name="arch" type="xml">
<field name="name" position="after">
<field name="language"/>
</field>
<!-- Group-by filter on language, restricted to base.group_no_one. -->
<filter name="owner" position="after">
<filter string="Language" domain="[]" context="{'group_by':'language'}" groups="base.group_no_one"/>
</filter>
</field>
</record>
</odoo>