[ADD] document_ocr

This commit is contained in:
Holger Brunn 2016-06-21 18:30:19 +02:00
parent 46811e3e5a
commit d4a07e88c5
11 changed files with 284 additions and 1 deletions

View File

@ -1,4 +1,5 @@
sudo: false
sudo: required
dist: trusty
cache: pip
addons:
@ -6,6 +7,9 @@ addons:
packages:
- expect-dev # provides unbuffer utility
- python-lxml # because pip installation is slow
- tesseract-ocr # document_ocr
- imagemagick # document_ocr
- fonts-inconsolata # document_ocr (for tests only)
language: python

86
document_ocr/README.rst Normal file
View File

@ -0,0 +1,86 @@
.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg
:target: http://www.gnu.org/licenses/agpl-3.0-standalone.html
:alt: License: AGPL-3
=================
OCR for documents
=================
This module was written to make uploaded documents, for example scans, searchable by running OCR on them.
It supports PDFs and all image formats that `Pillow can read <http://pillow.readthedocs.io/en/3.2.x/handbook/image-file-formats.html>`_.
Installation
============
To install this module, you need to:
#. install tesseract and the language(s) your documents use
#. if you want to support OCR on PDFs, install imagemagick
#. install the module itself
On a Debian or Ubuntu system you would typically run::
$ sudo apt-get install tesseract-ocr imagemagick
Configuration
=============
To configure this module, go to:
#. Settings/Technical/Parameters/System parameters and review the parameters with names document_ocr.*
Usage
=====
By default, character recognition is done asynchronously by a cronjob at night.
This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish.
The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ``Settings``.
In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``.
.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas
:alt: Try me on Runbot
:target: https://runbot.odoo-community.org/runbot/118/8.0
Bug Tracker
===========
Bugs are tracked on `GitHub Issues <https://github.com/OCA/knowledge/issues>`_.
In case of trouble, please check there if your issue has already been reported.
If you spotted it first, help us smash it by providing detailed and welcome feedback.
Credits
=======
The actual work
---------------
* `tesseract <https://github.com/tesseract-ocr>`_
Images
------
* Odoo Community Association: `Icon <https://github.com/OCA/maintainer-tools/blob/master/template/module/static/description/icon.svg>`_.
Contributors
------------
* Holger Brunn <hbrunn@therp.nl>
Do not contact contributors directly about help with questions or problems concerning this addon, but use the `community mailing list <mailto:community@mail.odoo.com>`_ or the `appropriate specialized mailing list <https://odoo-community.org/groups>`_ for help, and the bug tracker linked in `Bug Tracker`_ above for technical issues.
Maintainer
----------
.. image:: https://odoo-community.org/logo.png
:alt: Odoo Community Association
:target: https://odoo-community.org
This module is maintained by the OCA.
OCA, or the Odoo Community Association, is a nonprofit organization whose
mission is to support the collaborative development of Odoo features and
promote its widespread use.
To contribute to this module, please visit https://odoo-community.org.

4
document_ocr/__init__.py Normal file
View File

@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import models

View File

@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
# Module manifest for the document_ocr addon: a single dict literal holding
# the module metadata, its dependencies and the data files it ships.
{
"name": "OCR for documents",
"version": "8.0.1.0.0",
"author": "Therp BV,Odoo Community Association (OCA)",
"license": "AGPL-3",
"category": "Knowledge Management",
"summary": "Run character recognition on uploaded files",
# The ir.attachment indexing hook this addon extends comes from 'document'.
"depends": [
'document',
],
# Data files: the OCR cron job and the document_ocr.* config parameters.
"data": [
"data/ir_cron.xml",
"data/ir_config_parameter.xml",
],
# tesseract is declared as an external binary dependency; imagemagick's
# `convert` (used only for PDF input) is not listed here.
"external_dependencies": {
'bin': [
'tesseract',
],
},
}

View File

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<openerp>
<data noupdate="1">
<!-- document_ocr.synchronous: when set to "True", ir.attachment._index runs
     OCR immediately; otherwise it stores a marker phrase and leaves the
     recognition to the scheduled _ocr_cron job. -->
<record id="param_synchronous" model="ir.config_parameter">
<field name="key">document_ocr.synchronous</field>
<field name="value">False</field>
</record>
<!-- document_ocr.dpi: resolution (dots per inch) used when converting the
     attachment into the image that is handed to tesseract. -->
<record id="param_dpi" model="ir.config_parameter">
<field name="key">document_ocr.dpi</field>
<field name="value">300</field>
</record>
</data>
</openerp>

View File

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<openerp>
<data noupdate="1">
<!-- Scheduled action calling ir.attachment._ocr_cron with an interval of
     one day; _ocr_cron processes the attachments whose index_content still
     holds the "waiting for OCR" marker. -->
<record id="cron" model="ir.cron">
<field name="name">Run OCR on uploaded documents</field>
<field name="interval_type">days</field>
<field name="interval_number">1</field>
<field name="model">ir.attachment</field>
<field name="function">_ocr_cron</field>
<!-- numbercall -1: presumably "repeat without limit"; confirm against the
     ir.cron documentation. -->
<field name="numbercall">-1</field>
</record>
</data>
</openerp>

View File

@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import ir_attachment

View File

@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
import logging
import subprocess
from PIL import Image
from StringIO import StringIO
from openerp import api, models
_logger = logging.getLogger(__name__)
_MARKER_PHRASE = '[[waiting for OCR]]'
class IrAttachment(models.Model):
    """Fill ``index_content`` of attachments by running OCR on them.

    Whenever the inherited indexer yields no usable text, the attachment is
    either OCR'ed right away (synchronous mode or forced through the context
    key ``document_ocr_force``) or tagged with ``_MARKER_PHRASE`` so that
    ``_ocr_cron`` can process it later.
    """
    _inherit = 'ir.attachment'

    @api.model
    def _index(self, data, datas_fname, file_type):
        """Index an attachment, falling back to OCR when no text was found.

        :param data: raw (already decoded) file content
        :param datas_fname: file name of the attachment
        :param file_type: mimetype hint handed on to the parent indexer
        :returns: tuple ``(mimetype, content)``; ``content`` is the parent's
            result, the recognized text, ``_MARKER_PHRASE`` when OCR is
            deferred, or ``None`` when OCR was not possible
        """
        mimetype, content = super(IrAttachment, self)._index(
            data, datas_fname, file_type)
        # 'image' is presumably the placeholder the parent indexer returns
        # for pictures -- TODO confirm against the document module
        if not content or content == 'image':
            has_synchr_param = self.env['ir.config_parameter'].get_param(
                'document_ocr.synchronous', 'False') == 'True'
            has_force_flag = self.env.context.get('document_ocr_force')
            if has_synchr_param or has_force_flag:
                content = self._index_ocr(mimetype, data, datas_fname,
                                          file_type)
            else:
                # recognition is slow, so leave it to the cron job
                content = _MARKER_PHRASE
        return mimetype, content

    @api.model
    def _index_ocr(self, mimetype, data, datas_fname, file_type):
        """Run tesseract on the file and return the recognized text.

        If a method ``_index_ocr_get_data_<subtype>`` exists for the
        mimetype's subtype, it is used to turn ``data`` into an image;
        otherwise the data is opened with PIL and converted to TIFF.

        :param mimetype: mimetype as determined by ``_index``
        :param data: raw file content
        :param datas_fname: file name of the attachment
        :param file_type: mimetype hint of the attachment
        :returns: tesseract's standard output, or ``None`` if the input
            could not be turned into an image
        """
        # the fallback mirrors the value shipped in
        # data/ir_config_parameter.xml
        dpi = int(
            self.env['ir.config_parameter'].get_param(
                'document_ocr.dpi', '300'))
        # a missing or slash-less mimetype yields an empty subtype, which
        # simply selects the generic PIL conversion below
        sub_type = (mimetype or '').partition('/')[2]
        converter_name = '_index_ocr_get_data_%s' % sub_type
        if hasattr(self, converter_name):
            image_data = getattr(self, converter_name)(
                data, datas_fname, file_type, dpi)
            if image_data is None:
                # the converter could not run and has logged the reason
                return None
        else:
            image_data = StringIO()
            try:
                Image.open(StringIO(data)).save(image_data, 'tiff',
                                                dpi=(dpi, dpi))
            except IOError:
                _logger.exception('Failed to OCR image')
                return None
        # tesseract is a declared dependency of the module, so a missing
        # binary is deliberately allowed to raise here
        process = subprocess.Popen(
            ['tesseract', 'stdin', 'stdout'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout, stderr = process.communicate(image_data.getvalue())
        if stderr:
            _logger.error('Error during OCR: %s', stderr)
        return stdout

    @api.model
    def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi):
        """Render a PDF into one PNG image using imagemagick's ``convert``.

        All pages are appended into a single image (``-append``).

        :param data: raw PDF content
        :param datas_fname: file name of the attachment (unused here)
        :param file_type: mimetype hint of the attachment (unused here)
        :param dpi: density to render the PDF with
        :returns: ``StringIO`` holding the PNG data, or ``None`` if
            ``convert`` could not be started
        """
        try:
            process = subprocess.Popen(
                ['convert', '-density', str(dpi), '-', '-append', 'png32:-'],
                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except OSError:
            # imagemagick is an optional dependency: without it PDFs are
            # treated like any other unreadable file instead of crashing
            _logger.exception(
                'Could not run imagemagick to convert PDF for OCR')
            return None
        stdout, stderr = process.communicate(data)
        if stderr:
            _logger.error('Error converting PDF to image: %s', stderr)
        return StringIO(stdout)

    @api.model
    def _ocr_cron(self):
        """Run OCR on every attachment still marked as waiting for it.

        Attachments without data are skipped and keep their marker.
        """
        for this in self.with_context(document_ocr_force=True).search([
                ('index_content', '=', _MARKER_PHRASE),
        ]):
            if not this.datas:
                continue
            file_type, index_content = this._index(
                this.datas.decode('base64'), this.datas_fname, this.file_type)
            this.write({
                'file_type': file_type,
                'index_content': index_content,
            })

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.2 KiB

View File

@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import test_document_ocr

View File

@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from PIL import Image, ImageDraw, ImageFont
from StringIO import StringIO
from openerp.tests.common import TransactionCase
from openerp.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE
class TestDocumentOcr(TransactionCase):
    """End-to-end check of the OCR indexing for images, PDFs and the cron."""

    def test_document_ocr(self):
        """Render "Hello world" and expect OCR to read it back.

        Covers a plain PNG, a PDF, the deferred cron path and an image
        format that cannot be read, for which no content is expected.
        """
        config = self.env['ir.config_parameter']
        attachment_model = self.env['ir.attachment']

        config.set_param('document_ocr.synchronous', 'True')
        font = ImageFont.truetype(
            '/usr/share/fonts/truetype/inconsolata/Inconsolata.otf', 24)
        picture = Image.new('RGB', (200, 30))
        ImageDraw.Draw(picture).text((3, 3), "Hello world", font=font)

        # test a plain image
        png_buffer = StringIO()
        picture.save(png_buffer, 'png')
        png_result = attachment_model._index(
            png_buffer.getvalue(), 'test.png', None)
        self.assertEqual(png_result[1].strip(), 'Hello world')

        # should also work for pdfs
        pdf_buffer = StringIO()
        picture.save(pdf_buffer, 'pdf', resolution=300)
        pdf_content = pdf_buffer.getvalue()
        pdf_result = attachment_model._index(pdf_content, 'test.pdf', None)
        self.assertEqual(pdf_result[1].strip(), 'Hello world')

        # check cron
        config.set_param('document_ocr.synchronous', 'False')
        values = {
            'name': 'testattachment',
            'datas': pdf_content.encode('base64'),
        }
        attachment = attachment_model.create(values)
        self.assertEqual(attachment.index_content, _MARKER_PHRASE)
        attachment._ocr_cron()
        self.assertEqual(attachment.index_content.strip(), 'Hello world')

        # and for an unreadable image, we expect an error
        config.set_param('document_ocr.synchronous', 'True')
        palm_buffer = StringIO()
        Image.new('1', (200, 30)).save(palm_buffer, 'Palm')
        palm_result = attachment_model._index(
            palm_buffer.getvalue(), 'test.palm', None)
        self.assertEqual(palm_result[1], None)