diff --git a/.travis.yml b/.travis.yml index da6f1dbc..e42da917 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,5 @@ -sudo: false +sudo: required +dist: trusty cache: pip addons: @@ -6,6 +7,9 @@ addons: packages: - expect-dev # provides unbuffer utility - python-lxml # because pip installation is slow + - tesseract-ocr # document_ocr + - imagemagick # document_ocr + - fonts-inconsolata # document_ocr (for tests only) language: python diff --git a/document_ocr/README.rst b/document_ocr/README.rst new file mode 100644 index 00000000..c4d667f8 --- /dev/null +++ b/document_ocr/README.rst @@ -0,0 +1,86 @@ +.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg + :target: http://www.gnu.org/licenses/agpl-3.0-standalone.html + :alt: License: AGPL-3 + +================= +OCR for documents +================= + +This module was written to make uploaded documents, for example scans, searchable by running OCR on them. + +It supports all image formats `Pillow supports `_ for reading and PDFs. + +Installation +============ + +To install this module, you need to: + +#. install tesseract and the language(s) your documents use +#. if you want to support OCR on PDFs, install imagemagick +#. install the module itself + +On a Debian or Ubuntu system you would typically run:: + + $ sudo apt-get install tesseract-ocr imagemagick + + +Configuration +============= + +To configure this module, go to: + +#. Settings/Technical/Parameters/System parameters and review the parameters with names document_ocr.* + +Usage +===== + +By default, character recognition is done asynchronously by a cronjob at night. +This is because the recognition process takes a while and you don't want to make your users wait for the indexing to finish. +The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ``Settings``. 
+In case you want to force the OCR to be done immediately, set the configuration parameter ``document_ocr.synchronous`` to ``True``. + +.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas + :alt: Try me on Runbot + :target: https://runbot.odoo-community.org/runbot/118/8.0 + +Bug Tracker +=========== + +Bugs are tracked on `GitHub Issues `_. +In case of trouble, please check there if your issue has already been reported. +If you spotted it first, help us smash it by providing detailed and welcome feedback. + +Credits +======= + +The actual work +--------------- + +* `tesseract `_ + +Images +------ + +* Odoo Community Association: `Icon `_. + +Contributors +------------ + +* Holger Brunn + +Do not contact contributors directly about help with questions or problems concerning this addon, but use the `community mailing list `_ or the `appropriate specialized mailing list `_ for help, and the bug tracker linked in `Bug Tracker`_ above for technical issues. + +Maintainer +---------- + +.. image:: https://odoo-community.org/logo.png + :alt: Odoo Community Association + :target: https://odoo-community.org + +This module is maintained by the OCA. + +OCA, or the Odoo Community Association, is a nonprofit organization whose +mission is to support the collaborative development of Odoo features and +promote its widespread use. + +To contribute to this module, please visit https://odoo-community.org. diff --git a/document_ocr/__init__.py b/document_ocr/__init__.py new file mode 100644 index 00000000..7eda98a2 --- /dev/null +++ b/document_ocr/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). +from . 
import models diff --git a/document_ocr/__openerp__.py b/document_ocr/__openerp__.py new file mode 100644 index 00000000..2c1a2696 --- /dev/null +++ b/document_ocr/__openerp__.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). +{ + "name": "OCR for documents", + "version": "8.0.1.0.0", + "author": "Therp BV,Odoo Community Association (OCA)", + "license": "AGPL-3", + "category": "Knowledge Management", + "summary": "Run character recognition on uploaded files", + "depends": [ + 'document', + ], + "data": [ + "data/ir_cron.xml", + "data/ir_config_parameter.xml", + ], + "external_dependencies": { + 'bin': [ + 'tesseract', + ], + }, +} diff --git a/document_ocr/data/ir_config_parameter.xml b/document_ocr/data/ir_config_parameter.xml new file mode 100644 index 00000000..e46db18a --- /dev/null +++ b/document_ocr/data/ir_config_parameter.xml @@ -0,0 +1,13 @@ + + + + + document_ocr.synchronous + False + + + document_ocr.dpi + 300 + + + diff --git a/document_ocr/data/ir_cron.xml b/document_ocr/data/ir_cron.xml new file mode 100644 index 00000000..f69d151a --- /dev/null +++ b/document_ocr/data/ir_cron.xml @@ -0,0 +1,13 @@ + + + + + Run OCR on uploaded documents + days + 1 + ir.attachment + _ocr_cron + -1 + + + diff --git a/document_ocr/models/__init__.py b/document_ocr/models/__init__.py new file mode 100644 index 00000000..a15f1b21 --- /dev/null +++ b/document_ocr/models/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). +from . import ir_attachment diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py new file mode 100644 index 00000000..ec161712 --- /dev/null +++ b/document_ocr/models/ir_attachment.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). 
import logging
import subprocess
from PIL import Image
from StringIO import StringIO
from openerp import api, models

_logger = logging.getLogger(__name__)
# Placeholder stored as index content until the cronjob has run OCR
_MARKER_PHRASE = '[[waiting for OCR]]'


class IrAttachment(models.Model):
    """Attachment extension that fills the index content by running OCR.

    Attachments for which the inherited indexer finds no text are either
    recognized immediately or tagged with ``_MARKER_PHRASE`` so that
    ``_ocr_cron`` can process them later.
    """
    _inherit = 'ir.attachment'

    @api.model
    def _index(self, data, datas_fname, file_type):
        """Index a file, falling back to OCR if no text was extracted.

        :param data: decoded file content
        :param datas_fname: file name, passed through to the parent indexer
        :param file_type: file type, passed through to the parent indexer
        :returns: tuple ``(mimetype, content)``; ``content`` is the
            recognized text, ``_MARKER_PHRASE`` when OCR is deferred, or
            ``None`` when OCR failed
        """
        mimetype, content = super(IrAttachment, self)._index(
            data, datas_fname, file_type)
        if not content or content == 'image':
            has_synchr_param = self.env['ir.config_parameter'].get_param(
                'document_ocr.synchronous', 'False') == 'True'
            has_force_flag = self.env.context.get('document_ocr_force')
            if has_synchr_param or has_force_flag:
                content = self._index_ocr(mimetype, data, datas_fname,
                                          file_type)
            else:
                # recognition is slow, leave it to the cronjob
                content = _MARKER_PHRASE

        return mimetype, content

    @api.model
    def _index_ocr(self, mimetype, data, datas_fname, file_type):
        """Run tesseract on a file and return the recognized text.

        If a method named ``_index_ocr_get_data_<mime subtype>`` exists, it
        is called to turn the file into image data. Otherwise the file is
        opened with PIL and converted to TIFF at the configured resolution.

        :param mimetype: mimetype as determined by ``_index``
        :param data: decoded file content
        :param datas_fname: file name, handed to the converter method
        :param file_type: file type, handed to the converter method
        :returns: tesseract's standard output, or ``None`` if the file
            could not be turned into an image or tesseract could not be
            started
        """
        dpi = int(
            self.env['ir.config_parameter'].get_param(
                'document_ocr.dpi', '500'))
        # partition() copes with a missing mimetype or one without a slash,
        # where split() plus tuple unpacking would raise
        sub_type = (mimetype or '').partition('/')[2]
        converter = getattr(
            self, '_index_ocr_get_data_%s' % sub_type, None)
        if converter is not None:
            image_data = converter(data, datas_fname, file_type, dpi)
        else:
            image_data = StringIO()
            try:
                Image.open(StringIO(data)).save(image_data, 'tiff',
                                                dpi=(dpi, dpi))
            except IOError:
                _logger.exception('Failed to OCR image')
                return None
        if image_data is None:
            return None
        image_bytes = image_data.getvalue()
        if not image_bytes:
            # conversion yielded nothing, so there is nothing to recognize
            return None
        try:
            process = subprocess.Popen(
                ['tesseract', 'stdin', 'stdout'],
                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except OSError:
            # raised by Popen when the executable cannot be run
            _logger.exception('Failed to run tesseract')
            return None
        stdout, stderr = process.communicate(image_bytes)
        if stderr:
            _logger.error('Error during OCR: %s', stderr)
        return stdout

    @api.model
    def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi):
        """Render a PDF into a single PNG image using imagemagick.

        :param data: decoded PDF content
        :param datas_fname: file name (unused here)
        :param file_type: file type (unused here)
        :param dpi: resolution passed to ``convert -density``
        :returns: ``StringIO`` holding the PNG data; it is empty if
            ``convert`` could not be started
        """
        try:
            process = subprocess.Popen(
                ['convert', '-density', str(dpi), '-', '-append', 'png32:-'],
                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except OSError:
            # imagemagick is an optional dependency, so a missing binary
            # must not break the indexing of attachments
            _logger.exception('Failed to run convert, is imagemagick '
                              'installed?')
            return StringIO()
        stdout, stderr = process.communicate(data)
        if stderr:
            _logger.error('Error converting PDF to image: %s', stderr)
        return StringIO(stdout)

    @api.model
    def _ocr_cron(self):
        """Run OCR on all attachments still carrying ``_MARKER_PHRASE``."""
        for this in self.with_context(document_ocr_force=True).search([
                ('index_content', '=', _MARKER_PHRASE),
        ]):
            if not this.datas:
                continue
            file_type, index_content = this._index(
                this.datas.decode('base64'), this.datas_fname, this.file_type)
            this.write({
                'file_type': file_type,
                'index_content': index_content,
            })
from PIL import Image, ImageDraw, ImageFont
from StringIO import StringIO
from openerp.tests.common import TransactionCase
from openerp.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE


class TestDocumentOcr(TransactionCase):
    """Exercise OCR indexing for images, PDFs, the cronjob and bad input."""

    def test_document_ocr(self):
        config = self.env['ir.config_parameter']
        attachments = self.env['ir.attachment']
        config.set_param('document_ocr.synchronous', 'True')

        # render the text to recognize into a small picture
        font = ImageFont.truetype(
            '/usr/share/fonts/truetype/inconsolata/Inconsolata.otf', 24)
        picture = Image.new('RGB', (200, 30))
        ImageDraw.Draw(picture).text((3, 3), "Hello world", font=font)

        # a plain image is recognized directly
        png_buffer = StringIO()
        picture.save(png_buffer, 'png')
        content = attachments._index(
            png_buffer.getvalue(), 'test.png', None)[1]
        self.assertEqual(content.strip(), 'Hello world')

        # a pdf takes the dedicated conversion path
        pdf_buffer = StringIO()
        picture.save(pdf_buffer, 'pdf', resolution=300)
        content = attachments._index(
            pdf_buffer.getvalue(), 'test.pdf', None)[1]
        self.assertEqual(content.strip(), 'Hello world')

        # in asynchronous mode, the attachment is only marked on creation
        # and recognized once the cronjob runs
        config.set_param('document_ocr.synchronous', 'False')
        attachment = attachments.create({
            'name': 'testattachment',
            'datas': pdf_buffer.getvalue().encode('base64'),
        })
        self.assertEqual(attachment.index_content, _MARKER_PHRASE)
        attachment._ocr_cron()
        self.assertEqual(attachment.index_content.strip(), 'Hello world')

        # an image format that cannot be read yields no content at all
        config.set_param('document_ocr.synchronous', 'True')
        palm_buffer = StringIO()
        Image.new('1', (200, 30)).save(palm_buffer, 'Palm')
        content = attachments._index(
            palm_buffer.getvalue(), 'test.palm', None)[1]
        self.assertEqual(content, None)