[ADD] document_ocr

2025-07-20 10:16:45 -06:00 · 2016-06-21 18:30:19 +02:00 · 2016-06-21 18:30:19 +02:00 · d4a07e88c5
commit d4a07e88c5
parent 46811e3e5a
11 changed files with 284 additions and 1 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -1,4 +1,5 @@
-sudo: false
+sudo: required
+dist: trusty
 cache: pip

 addons:
@ -6,6 +7,9 @@ addons:
    packages:
      - expect-dev  # provides unbuffer utility
      - python-lxml # because pip installation is slow
+      - tesseract-ocr # document_ocr
+      - imagemagick # document_ocr
+      - fonts-inconsolata # document_ocr (for tests only)

 language: python

--- a/document_ocr/README.rst
+++ b/document_ocr/README.rst
@ -0,0 +1,86 @@
+.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg
+    :target: http://www.gnu.org/licenses/agpl-3.0-standalone.html
+    :alt: License: AGPL-3
+
+=================
+OCR for documents
+=================
+
+This module was written to make uploaded documents, for example scans, searchable by running OCR on them.
+
+It supports PDFs as well as all image formats `Pillow supports <http://pillow.readthedocs.io/en/3.2.x/handbook/image-file-formats.html>`_ for reading.
+
+Installation
+============
+
+To install this module, you need to:
+
+#. install tesseract and the language(s) your documents use
+#. if you want to support OCR on PDFs, install imagemagick
+#. install the module itself
+
+On a Debian or Ubuntu system you would typically run::
+
+    $ sudo apt-get install tesseract-ocr imagemagick
+
+
+Configuration
+=============
+
+To configure this module, go to:
+
+#. Settings/Technical/Parameters/System parameters and review the parameters with names document_ocr.*
+
+Usage
+=====
+
+By default, character recognition is done asynchronously by a cronjob at night. 
+This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish.
+The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ``Settings``.
+In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``.
+
+.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas
+    :alt: Try me on Runbot
+    :target: https://runbot.odoo-community.org/runbot/118/8.0
+
+Bug Tracker
+===========
+
+Bugs are tracked on `GitHub Issues <https://github.com/OCA/knowledge/issues>`_.
+In case of trouble, please check there if your issue has already been reported.
+If you spotted it first, help us smash it by providing detailed and welcome feedback.
+
+Credits
+=======
+
+The actual work
+---------------
+
+* `tesseract <https://github.com/tesseract-ocr>`_
+
+Images
+------
+
+* Odoo Community Association: `Icon <https://github.com/OCA/maintainer-tools/blob/master/template/module/static/description/icon.svg>`_.
+
+Contributors
+------------
+
+* Holger Brunn <hbrunn@therp.nl>  
+
+Do not contact contributors directly about help with questions or problems concerning this addon, but use the `community mailing list <mailto:community@mail.odoo.com>`_ or the `appropriate specialized mailing list <https://odoo-community.org/groups>`_ for help, and the bug tracker linked in `Bug Tracker`_ above for technical issues.
+
+Maintainer
+----------
+
+.. image:: https://odoo-community.org/logo.png
+   :alt: Odoo Community Association
+   :target: https://odoo-community.org
+
+This module is maintained by the OCA.
+
+OCA, or the Odoo Community Association, is a nonprofit organization whose
+mission is to support the collaborative development of Odoo features and
+promote its widespread use.
+
+To contribute to this module, please visit https://odoo-community.org.
--- a/document_ocr/init.py
+++ b/document_ocr/init.py
@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+# © 2016 Therp BV <http://therp.nl>
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+from . import models
--- a/document_ocr/openerp.py
+++ b/document_ocr/openerp.py
@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+# © 2016 Therp BV <http://therp.nl>
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+# Addon manifest: a single dict literal describing the module.
+{
+    "name": "OCR for documents",
+    "version": "8.0.1.0.0",
+    "author": "Therp BV,Odoo Community Association (OCA)",
+    "license": "AGPL-3",
+    "category": "Knowledge Management",
+    "summary": "Run character recognition on uploaded files",
+    # the module extends the attachment indexing of ``document``
+    "depends": [
+        'document',
+    ],
+    # data files: the OCR cron job and the document_ocr.* parameters
+    "data": [
+        "data/ir_cron.xml",
+        "data/ir_config_parameter.xml",
+    ],
+    # only tesseract is listed here; the README describes imagemagick as
+    # needed only for OCR on PDFs
+    "external_dependencies": {
+        'bin': [
+            'tesseract',
+        ],
+    },
+}
--- a/document_ocr/data/ir_config_parameter.xml
+++ b/document_ocr/data/ir_config_parameter.xml
@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<openerp>
+    <data noupdate="1">
+        <record id="param_synchronous" model="ir.config_parameter">
+            <field name="key">document_ocr.synchronous</field>
+            <field name="value">False</field>
+        </record>
+        <record id="param_dpi" model="ir.config_parameter">
+            <field name="key">document_ocr.dpi</field>
+            <field name="value">300</field>
+        </record>
+    </data>
+</openerp>
--- a/document_ocr/data/ir_cron.xml
+++ b/document_ocr/data/ir_cron.xml
@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<openerp>
+    <data noupdate="1">
+        <record id="cron" model="ir.cron">
+            <field name="name">Run OCR on uploaded documents</field>
+            <field name="interval_type">days</field>
+            <field name="interval_number">1</field>
+            <field name="model">ir.attachment</field>
+            <field name="function">_ocr_cron</field>
+            <field name="numbercall">-1</field>
+        </record>
+    </data>
+</openerp>
--- a/document_ocr/models/init.py
+++ b/document_ocr/models/init.py
@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+# © 2016 Therp BV <http://therp.nl>
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+from . import ir_attachment
--- a/document_ocr/models/ir_attachment.py
+++ b/document_ocr/models/ir_attachment.py
@ -0,0 +1,84 @@
+# -*- coding: utf-8 -*-
+# © 2016 Therp BV <http://therp.nl>
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+import logging
+import subprocess
+from PIL import Image
+from StringIO import StringIO
+from openerp import api, models
+
+_logger = logging.getLogger(__name__)
+_MARKER_PHRASE = '[[waiting for OCR]]'
+
+
+class IrAttachment(models.Model):
+    _inherit = 'ir.attachment'
+
+    @api.model
+    def _index(self, data, datas_fname, file_type):
+        mimetype, content = super(IrAttachment, self)._index(
+            data, datas_fname, file_type)
+        if not content or content == 'image':
+            has_synchr_param = self.env['ir.config_parameter'].get_param(
+                'document_ocr.synchronous', 'False') == 'True'
+            has_force_flag = self.env.context.get('document_ocr_force')
+            if has_synchr_param or has_force_flag:
+                content = self._index_ocr(mimetype, data, datas_fname,
+                                          file_type)
+            else:
+                content = _MARKER_PHRASE
+
+        return mimetype, content
+
+    @api.model
+    def _index_ocr(self, mimetype, data, datas_fname, file_type):
+        dpi = int(
+            self.env['ir.config_parameter'].get_param(
+                'document_ocr.dpi', '500'))
+        top_type, sub_type = mimetype.split('/', 1)
+        if hasattr(self, '_index_ocr_get_data_%s' % sub_type):
+            image_data = getattr(self, '_index_ocr_get_data_%s' % sub_type)(
+                data, datas_fname, file_type, dpi)
+        else:
+            image_data = StringIO()
+            try:
+                Image.open(StringIO(data)).save(image_data, 'tiff',
+                                                dpi=(dpi, dpi))
+            except IOError:
+                _logger.exception('Failed to OCR image')
+                return None
+        process = subprocess.Popen(
+            ['tesseract', 'stdin', 'stdout'],
+            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        stdout, stderr = process.communicate(image_data.getvalue())
+        if stderr:
+            _logger.error('Error during OCR: %s', stderr)
+        return stdout
+
+    @api.model
+    def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi):
+        process = subprocess.Popen(
+            ['convert', '-density', str(dpi), '-', '-append', 'png32:-'],
+            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        stdout, stderr = process.communicate(data)
+        if stderr:
+            _logger.error('Error converting to PDF: %s', stderr)
+        return StringIO(stdout)
+
+    @api.model
+    def _ocr_cron(self):
+        for this in self.with_context(document_ocr_force=True).search([
+            ('index_content', '=', _MARKER_PHRASE),
+        ]):
+            if not this.datas:
+                continue
+            file_type, index_content = this._index(
+                this.datas.decode('base64'), this.datas_fname, this.file_type)
+            this.write({
+                'file_type': file_type,
+                'index_content': index_content,
+            })
--- a/document_ocr/static/description/icon.png
+++ b/document_ocr/static/description/icon.png
--- a/document_ocr/tests/init.py
+++ b/document_ocr/tests/init.py
@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+# © 2016 Therp BV <http://therp.nl>
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+from . import test_document_ocr
--- a/document_ocr/tests/test_document_ocr.py
+++ b/document_ocr/tests/test_document_ocr.py
@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+# © 2016 Therp BV <http://therp.nl>
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+from PIL import Image, ImageDraw, ImageFont
+from StringIO import StringIO
+from openerp.tests.common import TransactionCase
+from openerp.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE
+
+
+class TestDocumentOcr(TransactionCase):
+    def test_document_ocr(self):
+        self.env['ir.config_parameter'].set_param(
+            'document_ocr.synchronous', 'True')
+        test_image = Image.new('RGB', (200, 30))
+        draw = ImageDraw.Draw(test_image)
+        draw.text((3, 3), "Hello world", font=ImageFont.truetype(
+            '/usr/share/fonts/truetype/inconsolata/Inconsolata.otf', 24))
+        # test a plain image
+        data = StringIO()
+        test_image.save(data, 'png')
+        result = self.env['ir.attachment']._index(
+            data.getvalue(), 'test.png', None)
+        self.assertEqual(result[1].strip(), 'Hello world')
+        # should also work for pdfs
+        data = StringIO()
+        test_image.save(data, 'pdf', resolution=300)
+        result = self.env['ir.attachment']._index(
+            data.getvalue(), 'test.pdf', None)
+        self.assertEqual(result[1].strip(), 'Hello world')
+        # check cron
+        self.env['ir.config_parameter'].set_param(
+            'document_ocr.synchronous', 'False')
+        attachment = self.env['ir.attachment'].create({
+            'name': 'testattachment',
+            'datas': data.getvalue().encode('base64'),
+        })
+        self.assertEqual(attachment.index_content, _MARKER_PHRASE)
+        attachment._ocr_cron()
+        self.assertEqual(attachment.index_content.strip(), 'Hello world')
+        # and for an unreadable image, we expect an error
+        self.env['ir.config_parameter'].set_param(
+            'document_ocr.synchronous', 'True')
+        data = StringIO()
+        test_image = Image.new('1', (200, 30))
+        test_image.save(data, 'Palm')
+        result = self.env['ir.attachment']._index(
+            data.getvalue(), 'test.palm', None)
+        self.assertEqual(result[1], None)