diff --git a/document_ocr/README.rst b/document_ocr/README.rst index 7f9c3b28..c500f0b1 100644 --- a/document_ocr/README.rst +++ b/document_ocr/README.rst @@ -39,6 +39,21 @@ This is because the recognition process takes a while and you don't want to make The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ``Settings``. In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``. + +By default, the recognition language is set to English. +In case you want to use a different default, set configuration parameter ``document_ocr.language`` to the respective language code, e.g. ``por`` for Portuguese. + + +For PDF files, OCR is run after the document has been converted to an image. Note that OCR is applied to all PDFs. + + +System parameters used: +* ``document_ocr.synchronous``: bool +* ``document_ocr.language``: string +* ``document_ocr.dpi``: integer +* ``document_ocr.quality``: integer + + .. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas :alt: Try me on Runbot :target: https://runbot.odoo-community.org/runbot/118/10.0 diff --git a/document_ocr/__init__.py b/document_ocr/__init__.py index 7eda98a2..472456b6 100644 --- a/document_ocr/__init__.py +++ b/document_ocr/__init__.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- # © 2016 Therp BV +# © 2017 ThinkOpen Solutions # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). from . import models diff --git a/document_ocr/__manifest__.py b/document_ocr/__manifest__.py index 382e77d6..39d783d1 100644 --- a/document_ocr/__manifest__.py +++ b/document_ocr/__manifest__.py @@ -2,9 +2,9 @@ # © 2016 Therp BV # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). 
{ - "name": "OCR for documents", + "name": "OCR for Documents", "version": "10.0.1.0.0", - "author": "Therp BV,Odoo Community Association (OCA), TKO Brasil", + "author": "Therp BV, Odoo Community Association (OCA), ThinkOpen Solutions Brasil", "license": "AGPL-3", "category": "Knowledge Management", "summary": "Run character recognition on uploaded files", @@ -14,10 +14,12 @@ "data": [ "data/ir_cron.xml", "data/ir_config_parameter.xml", + "views/ir_attachment_view.xml", ], "external_dependencies": { 'bin': [ 'tesseract', + 'convert', ], }, } diff --git a/document_ocr/data/ir_config_parameter.xml b/document_ocr/data/ir_config_parameter.xml index e46db18a..721a0740 100644 --- a/document_ocr/data/ir_config_parameter.xml +++ b/document_ocr/data/ir_config_parameter.xml @@ -9,5 +9,13 @@ document_ocr.dpi 300 + + document_ocr.quality + 100 + + + document_ocr.language + eng + diff --git a/document_ocr/models/__init__.py b/document_ocr/models/__init__.py index a15f1b21..051b3ddf 100644 --- a/document_ocr/models/__init__.py +++ b/document_ocr/models/__init__.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- # © 2016 Therp BV +# © 2017 ThinkOpen Solutions # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). from . import ir_attachment diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py index b27992c8..f28e1fc9 100644 --- a/document_ocr/models/ir_attachment.py +++ b/document_ocr/models/ir_attachment.py @@ -1,85 +1,256 @@ # -*- coding: utf-8 -*- # © 2016 Therp BV +# © 2017 ThinkOpen Solutions # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). 
import io
import logging
import subprocess
from StringIO import StringIO

import pyPdf
from odoo import api, fields, models
from odoo.exceptions import UserError

_logger = logging.getLogger(__name__)

# Placeholder written to ``index_content`` while a document is waiting for
# the cron job (``_ocr_cron``) to run OCR on it.
_MARKER_PHRASE = '[[waiting for OCR]]'

# Selection values for the attachment language: (code, label). The code is
# passed verbatim to ``tesseract -l``.
OCR_LANGUAGE = [
    ('afr', 'Afrikaans'),
    ('amh', 'Amharic'),
    ('ara', 'Arabic'),
    ('asm', 'Assamese'),
    ('aze', 'Azerbaijani'),
    ('aze_cyrl', 'Azerbaijani - Cyrillic'),
    ('bel', 'Belarusian'),
    ('ben', 'Bengali'),
    ('bod', 'Tibetan'),
    ('bos', 'Bosnian'),
    ('bul', 'Bulgarian'),
    ('cat', 'Catalan; Valencian'),
    ('ceb', 'Cebuano'),
    ('ces', 'Czech'),
    ('chi_sim', 'Chinese - Simplified'),
    ('chi_tra', 'Chinese - Traditional'),
    ('chr', 'Cherokee'),
    ('cym', 'Welsh'),
    ('dan', 'Danish'),
    ('dan_frak', 'Danish - Fraktur'),
    ('deu', 'German'),
    ('deu_frak', 'German - Fraktur'),
    ('dzo', 'Dzongkha'),
    ('ell', 'Greek, Modern (1453-)'),
    ('eng', 'English'),
    ('enm', 'English, Middle (1100-1500)'),
    ('epo', 'Esperanto'),
    ('equ', 'Math / equation detection module'),
    ('est', 'Estonian'),
    ('eus', 'Basque'),
    ('fas', 'Persian'),
    ('fin', 'Finnish'),
    ('fra', 'French'),
    ('frk', 'Frankish'),
    ('frm', 'French, Middle (ca.1400-1600)'),
    ('gle', 'Irish'),
    ('glg', 'Galician'),
    ('grc', 'Greek, Ancient (to 1453)'),
    ('guj', 'Gujarati'),
    ('hat', 'Haitian; Haitian Creole'),
    ('heb', 'Hebrew'),
    ('hin', 'Hindi'),
    ('hrv', 'Croatian'),
    ('hun', 'Hungarian'),
    ('iku', 'Inuktitut'),
    ('ind', 'Indonesian'),
    ('isl', 'Icelandic'),
    ('ita', 'Italian'),
    ('ita_old', 'Italian - Old'),
    ('jav', 'Javanese'),
    ('jpn', 'Japanese'),
    ('kan', 'Kannada'),
    ('kat', 'Georgian'),
    ('kat_old', 'Georgian - Old'),
    ('kaz', 'Kazakh'),
    ('khm', 'Central Khmer'),
    ('kir', 'Kirghiz; Kyrgyz'),
    ('kor', 'Korean'),
    ('kur', 'Kurdish'),
    ('lao', 'Lao'),
    ('lat', 'Latin'),
    ('lav', 'Latvian'),
    ('lit', 'Lithuanian'),
    ('mal', 'Malayalam'),
    ('mar', 'Marathi'),
    ('mkd', 'Macedonian'),
    ('mlt', 'Maltese'),
    ('msa', 'Malay'),
    ('mya', 'Burmese'),
    ('nep', 'Nepali'),
    ('nld', 'Dutch; Flemish'),
    ('nor', 'Norwegian'),
    ('ori', 'Oriya'),
    ('osd', 'Orientation and script detection module'),
    ('pan', 'Panjabi; Punjabi'),
    ('pol', 'Polish'),
    ('por', 'Portuguese'),
    ('pus', 'Pushto; Pashto'),
    ('ron', 'Romanian; Moldavian; Moldovan'),
    ('rus', 'Russian'),
    ('san', 'Sanskrit'),
    ('sin', 'Sinhala; Sinhalese'),
    ('slk', 'Slovak'),
    ('slk_frak', 'Slovak - Fraktur'),
    ('slv', 'Slovenian'),
    ('spa', 'Spanish; Castilian'),
    ('spa_old', 'Spanish; Castilian - Old'),
    ('sqi', 'Albanian'),
    ('srp', 'Serbian'),
    ('srp_latn', 'Serbian - Latin'),
    ('swa', 'Swahili'),
    ('swe', 'Swedish'),
    ('syr', 'Syriac'),
    ('tam', 'Tamil'),
    ('tel', 'Telugu'),
    ('tgk', 'Tajik'),
    ('tgl', 'Tagalog'),
    ('tha', 'Thai'),
    ('tir', 'Tigrinya'),
    ('tur', 'Turkish'),
    ('uig', 'Uighur; Uyghur'),
    ('ukr', 'Ukrainian'),
    ('urd', 'Urdu'),
    ('uzb', 'Uzbek'),
    ('uzb_cyrl', 'Uzbek - Cyrillic'),
    ('vie', 'Vietnamese'),
    ('yid', 'Yiddish'),
]


class IrAttachment(models.Model):
    """Extend attachments with OCR based indexing of images and PDFs."""
    _inherit = 'ir.attachment'

    language = fields.Selection(OCR_LANGUAGE, 'Language')
    # We need to redefine index_content field to be able to update it
    # on the onchange_language()
    index_content = fields.Text(
        'Indexed Content', readonly=False, prefetch=False)
    index_content_rel = fields.Text(
        related='index_content', string='Indexed Content Rel')

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @api.model
    def _ocr_is_synchronous(self):
        """Tell whether OCR has to run right now instead of through cron.

        :return: True when parameter ``document_ocr.synchronous`` is
            ``'True'`` or the context carries ``document_ocr_force``.
        """
        param = self.env['ir.config_parameter'].get_param(
            'document_ocr.synchronous', 'False')
        return param == 'True' or bool(
            self.env.context.get('document_ocr_force'))

    @api.model
    def _ocr_default_language(self):
        """Return the language configured in ``document_ocr.language``."""
        return self.env['ir.config_parameter'].get_param(
            'document_ocr.language', 'eng')

    def _get_ocr_language(self):
        """Return the tesseract language code to use for ``self``.

        The record's own language is only read for a single record; for an
        empty or multi-record recordset the configured default is used, so
        a falsy value never reaches the ``tesseract`` command line.
        """
        if len(self) == 1 and self.language:
            return self.language
        return self._ocr_default_language()

    @api.model
    def _ocr_installed_languages(self):
        """Return the set of lines printed by ``tesseract --list-langs``.

        Both stdout and stderr are inspected, as the stream carrying the
        list is assumed to differ between tesseract versions.
        """
        process = subprocess.Popen(
            ['tesseract', '--list-langs'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()
        lines = stdout.splitlines() + stderr.splitlines()
        return set(line.strip() for line in lines)

    def _ocr_convert_to_image(self, bin_data):
        """Convert ``bin_data`` to a PNG using ImageMagick's ``convert``.

        Density and quality come from parameters ``document_ocr.dpi`` and
        ``document_ocr.quality``.

        :param bin_data: raw document content fed to ``convert`` on stdin
        :return: whatever ``convert`` wrote to stdout
        """
        get_param = self.env['ir.config_parameter'].get_param
        dpi = int(get_param('document_ocr.dpi', '500'))
        quality = int(get_param('document_ocr.quality', '100'))
        process = subprocess.Popen(
            ['convert', '-density', str(dpi),
             '-quality', str(quality),
             '-', '-append', 'png32:-'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        stdout, stderr = process.communicate(bin_data)
        if stderr:
            _logger.error('Error converting PDF to image: %s', stderr)
        return stdout

    def _ocr_convert_pdf_page_to_image(self, pdf, pagenum):
        """Extract page ``pagenum`` of ``pdf`` and convert it to an image.

        :param pdf: ``pyPdf.PdfFileReader`` of the whole document
        :param pagenum: zero based page index
        """
        single_page = pyPdf.PdfFileWriter()
        single_page.addPage(pdf.getPage(pagenum))
        pdf_bytes = io.BytesIO()
        single_page.write(pdf_bytes)
        return self._ocr_convert_to_image(pdf_bytes.getvalue())

    # ------------------------------------------------------------------
    # Indexing
    # ------------------------------------------------------------------
    @api.onchange('language')
    def onchange_language(self):
        """Re-index the attachment when its OCR language changes.

        :raises UserError: when the chosen language is not reported by
            ``tesseract --list-langs``
        """
        if not self.language:
            # Selection was cleared: nothing to validate or re-index.
            return {}
        if self.language not in self._ocr_installed_languages():
            raise UserError(
                "Language not installed."
                " Please ask your system administrator to"
                " install tesseract '%s' language." %
                self.language)
        if self.store_fname:
            bin_data = self._file_read(self.store_fname)
        else:
            bin_data = self.db_datas
        if not bin_data:
            # No content stored yet, so there is nothing to index.
            return {}
        index_content = self._index(
            bin_data.decode('base64'), self.datas_fname, self.mimetype)
        return {'value': {'index_content': index_content}}

    @api.model
    def _index(self, bin_data, datas_fname, mimetype):
        """Index ``bin_data``, falling back to OCR when nothing is found.

        When the parent implementation yields no content (or only
        ``'image'``), OCR is run immediately in synchronous/forced mode;
        otherwise the marker phrase is returned so that the cron job picks
        the document up later.
        """
        # Field assignment needs exactly one record, and this method may be
        # called on an empty recordset since it is an ``api.model`` method.
        if len(self) == 1 and not self.language:
            # Set default language
            self.language = self._ocr_default_language()
        content = super(IrAttachment, self)._index(
            bin_data, datas_fname, mimetype)
        if not content or content == 'image':
            if self._ocr_is_synchronous():
                content = self._index_ocr(bin_data)
            else:
                content = _MARKER_PHRASE
        return content

    def _index_ocr(self, bin_data):
        """Run tesseract on ``bin_data`` and return the recognised text.

        :param bin_data: image content passed to tesseract on stdin
        :return: tesseract's stdout
        """
        process = subprocess.Popen(
            ['tesseract', 'stdin', 'stdout', '-l', self._get_ocr_language()],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout, stderr = process.communicate(bin_data)
        if stderr:
            _logger.error('Error during OCR: %s', stderr)
        return stdout

    def _index_pdf(self, bin_data):
        """Index a PDF, running OCR when it holds (almost) no text.

        The parent's text extraction always runs first. OCR is only needed
        when the data really is a PDF and less than two lines of text were
        extracted. In that case the pages are converted to images and
        recognised in synchronous/forced mode, or the marker phrase is
        returned so that the cron job handles the document later.
        """
        buf = super(IrAttachment, self)._index_pdf(bin_data)
        needs_ocr = (
            bin_data.startswith('%PDF-') and
            len((buf or u'').split('\n')) < 2)
        if not needs_ocr:
            # Not a PDF, or the PDF already has text: never defer these.
            return buf
        if not self._ocr_is_synchronous():
            return _MARKER_PHRASE

        buf = buf or u''
        # ``self`` may be empty or hold several records; only use a name
        # for logging when it can be read safely.
        fname = self[:1].datas_fname
        try:
            pdf = pyPdf.PdfFileReader(StringIO(bin_data))
            page_count = pdf.getNumPages()
            if page_count > 1:
                for pagenum in range(page_count):
                    _logger.info('OCR PDF "%s" page %d/%d...',
                                 fname, pagenum + 1, page_count)
                    pdf_image = self._ocr_convert_pdf_page_to_image(
                        pdf, pagenum)
                    index_content = self._index_ocr(pdf_image)
                    buf = u'%s\n-- %d --\n%s' % (
                        buf, pagenum + 1, index_content.decode('utf8'))
            else:
                _logger.info('OCR PDF "%s"...', fname)
                pdf_image = self._ocr_convert_to_image(bin_data)
                index_content = self._index_ocr(pdf_image)
                buf = u'%s\n%s' % (buf, index_content.decode('utf8'))
        except Exception as e:
            # Best effort: a failing OCR must not block saving the
            # attachment, but keep the traceback for diagnosis.
            _logger.exception('Error converting PDF to image: %s', e)
        return buf

    @api.model
    def _ocr_cron(self):
        """Run OCR on every attachment still carrying the marker phrase."""
        pending = self.with_context(document_ocr_force=True).search(
            [('index_content', '=', _MARKER_PHRASE)])
        for this in pending:
            if not this.datas:
                continue
            index_content = this._index(
                this.datas.decode('base64'), this.datas_fname, this.mimetype)
            this.write({
                'index_content': index_content,
            })
from StringIO import StringIO from PIL import Image, ImageDraw, ImageFont -from openerp.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE -from openerp.tests.common import TransactionCase +from odoo.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE +from odoo.tests.common import TransactionCase class TestDocumentOcr(TransactionCase): diff --git a/document_ocr/views/ir_attachment_view.xml b/document_ocr/views/ir_attachment_view.xml new file mode 100644 index 00000000..ed171d61 --- /dev/null +++ b/document_ocr/views/ir_attachment_view.xml @@ -0,0 +1,43 @@ + + + + + ir.attachment + + + + 1 + + + + + + + + + + + + + + ir.attachment + + + + + + + + + ir.attachment + + + + + + + + + + +