mirror of
https://github.com/OCA/knowledge.git
synced 2025-07-20 18:26:45 -06:00
[ADD] document_ocr
This commit is contained in:
parent
46811e3e5a
commit
d4a07e88c5
@ -1,4 +1,5 @@
|
|||||||
sudo: false
|
sudo: required
|
||||||
|
dist: trusty
|
||||||
cache: pip
|
cache: pip
|
||||||
|
|
||||||
addons:
|
addons:
|
||||||
@ -6,6 +7,9 @@ addons:
|
|||||||
packages:
|
packages:
|
||||||
- expect-dev # provides unbuffer utility
|
- expect-dev # provides unbuffer utility
|
||||||
- python-lxml # because pip installation is slow
|
- python-lxml # because pip installation is slow
|
||||||
|
- tesseract-ocr # document_ocr
|
||||||
|
- imagemagick # document_ocr
|
||||||
|
- fonts-inconsolata # document_ocr (for tests only)
|
||||||
|
|
||||||
language: python
|
language: python
|
||||||
|
|
||||||
|
86
document_ocr/README.rst
Normal file
86
document_ocr/README.rst
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg
|
||||||
|
:target: http://www.gnu.org/licenses/agpl-3.0-standalone.html
|
||||||
|
:alt: License: AGPL-3
|
||||||
|
|
||||||
|
=================
|
||||||
|
OCR for documents
|
||||||
|
=================
|
||||||
|
|
||||||
|
This module was written to make uploaded documents, for example scans, searchable by running OCR on them.
|
||||||
|
|
||||||
|
It supports PDFs and all image formats that `Pillow can read <http://pillow.readthedocs.io/en/3.2.x/handbook/image-file-formats.html>`_.
|
||||||
|
|
||||||
|
Installation
|
||||||
|
============
|
||||||
|
|
||||||
|
To install this module, you need to:
|
||||||
|
|
||||||
|
#. install tesseract and the language(s) your documents use
|
||||||
|
#. if you want to support OCR on PDFs, install imagemagick
|
||||||
|
#. install the module itself
|
||||||
|
|
||||||
|
On a Debian or Ubuntu system you would typically run::
|
||||||
|
|
||||||
|
$ sudo apt-get install tesseract-ocr imagemagick
|
||||||
|
|
||||||
|
|
||||||
|
Configuration
|
||||||
|
=============
|
||||||
|
|
||||||
|
To configure this module, go to:
|
||||||
|
|
||||||
|
#. Settings/Technical/Parameters/System parameters and review the parameters with names document_ocr.*
|
||||||
|
|
||||||
|
Usage
|
||||||
|
=====
|
||||||
|
|
||||||
|
By default, character recognition is done asynchronously by a cronjob at night.
|
||||||
|
This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish.
|
||||||
|
The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ``Settings``.
|
||||||
|
In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``.
|
||||||
|
|
||||||
|
.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas
|
||||||
|
:alt: Try me on Runbot
|
||||||
|
:target: https://runbot.odoo-community.org/runbot/118/8.0
|
||||||
|
|
||||||
|
Bug Tracker
|
||||||
|
===========
|
||||||
|
|
||||||
|
Bugs are tracked on `GitHub Issues <https://github.com/OCA/knowledge/issues>`_.
|
||||||
|
In case of trouble, please check there if your issue has already been reported.
|
||||||
|
If you spotted it first, help us smash it by providing detailed and welcome feedback.
|
||||||
|
|
||||||
|
Credits
|
||||||
|
=======
|
||||||
|
|
||||||
|
The actual work
|
||||||
|
---------------
|
||||||
|
|
||||||
|
* `tesseract <https://github.com/tesseract-ocr>`_
|
||||||
|
|
||||||
|
Images
|
||||||
|
------
|
||||||
|
|
||||||
|
* Odoo Community Association: `Icon <https://github.com/OCA/maintainer-tools/blob/master/template/module/static/description/icon.svg>`_.
|
||||||
|
|
||||||
|
Contributors
|
||||||
|
------------
|
||||||
|
|
||||||
|
* Holger Brunn <hbrunn@therp.nl>
|
||||||
|
|
||||||
|
Do not contact contributors directly about help with questions or problems concerning this addon, but use the `community mailing list <mailto:community@mail.odoo.com>`_ or the `appropriate specialized mailinglist <https://odoo-community.org/groups>`_ for help, and the bug tracker linked in `Bug Tracker`_ above for technical issues.
|
||||||
|
|
||||||
|
Maintainer
|
||||||
|
----------
|
||||||
|
|
||||||
|
.. image:: https://odoo-community.org/logo.png
|
||||||
|
:alt: Odoo Community Association
|
||||||
|
:target: https://odoo-community.org
|
||||||
|
|
||||||
|
This module is maintained by the OCA.
|
||||||
|
|
||||||
|
OCA, or the Odoo Community Association, is a nonprofit organization whose
|
||||||
|
mission is to support the collaborative development of Odoo features and
|
||||||
|
promote its widespread use.
|
||||||
|
|
||||||
|
To contribute to this module, please visit https://odoo-community.org.
|
4
document_ocr/__init__.py
Normal file
4
document_ocr/__init__.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# © 2016 Therp BV <http://therp.nl>
|
||||||
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||||
|
from . import models
|
23
document_ocr/__openerp__.py
Normal file
23
document_ocr/__openerp__.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# © 2016 Therp BV <http://therp.nl>
|
||||||
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||||
|
{
|
||||||
|
"name": "OCR for documents",
|
||||||
|
"version": "8.0.1.0.0",
|
||||||
|
"author": "Therp BV,Odoo Community Association (OCA)",
|
||||||
|
"license": "AGPL-3",
|
||||||
|
"category": "Knowledge Management",
|
||||||
|
"summary": "Run character recognition on uploaded files",
|
||||||
|
"depends": [
|
||||||
|
'document',
|
||||||
|
],
|
||||||
|
"data": [
|
||||||
|
"data/ir_cron.xml",
|
||||||
|
"data/ir_config_parameter.xml",
|
||||||
|
],
|
||||||
|
"external_dependencies": {
|
||||||
|
'bin': [
|
||||||
|
'tesseract',
|
||||||
|
],
|
||||||
|
},
|
||||||
|
}
|
13
document_ocr/data/ir_config_parameter.xml
Normal file
13
document_ocr/data/ir_config_parameter.xml
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<openerp>
|
||||||
|
<data noupdate="1">
|
||||||
|
<record id="param_synchronous" model="ir.config_parameter">
|
||||||
|
<field name="key">document_ocr.synchronous</field>
|
||||||
|
<field name="value">False</field>
|
||||||
|
</record>
|
||||||
|
<record id="param_dpi" model="ir.config_parameter">
|
||||||
|
<field name="key">document_ocr.dpi</field>
|
||||||
|
<field name="value">300</field>
|
||||||
|
</record>
|
||||||
|
</data>
|
||||||
|
</openerp>
|
13
document_ocr/data/ir_cron.xml
Normal file
13
document_ocr/data/ir_cron.xml
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<openerp>
|
||||||
|
<data noupdate="1">
|
||||||
|
<record id="cron" model="ir.cron">
|
||||||
|
<field name="name">Run OCR on uploaded documents</field>
|
||||||
|
<field name="interval_type">days</field>
|
||||||
|
<field name="interval_number">1</field>
|
||||||
|
<field name="model">ir.attachment</field>
|
||||||
|
<field name="function">_ocr_cron</field>
|
||||||
|
<field name="numbercall">-1</field>
|
||||||
|
</record>
|
||||||
|
</data>
|
||||||
|
</openerp>
|
4
document_ocr/models/__init__.py
Normal file
4
document_ocr/models/__init__.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# © 2016 Therp BV <http://therp.nl>
|
||||||
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||||
|
from . import ir_attachment
|
84
document_ocr/models/ir_attachment.py
Normal file
84
document_ocr/models/ir_attachment.py
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# © 2016 Therp BV <http://therp.nl>
|
||||||
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||||
|
import logging
|
||||||
|
import subprocess
|
||||||
|
from PIL import Image
|
||||||
|
from StringIO import StringIO
|
||||||
|
from openerp import api, models
|
||||||
|
|
||||||
|
_logger = logging.getLogger(__name__)
|
||||||
|
_MARKER_PHRASE = '[[waiting for OCR]]'
|
||||||
|
|
||||||
|
|
||||||
|
class IrAttachment(models.Model):
    """Add OCR based indexing to ``ir.attachment``.

    Attachments for which the standard indexer yields nothing useful are
    either run through tesseract right away or tagged with
    ``_MARKER_PHRASE`` so that ``_ocr_cron`` can process them later.
    """
    _inherit = 'ir.attachment'

    @api.model
    def _index(self, data, datas_fname, file_type):
        """Index an attachment, falling back to OCR when needed.

        The parent implementation is called first. When it returns no
        content, or the literal ``'image'``, OCR is run immediately if the
        config parameter ``document_ocr.synchronous`` equals ``'True'`` or
        the context key ``document_ocr_force`` is truthy; otherwise the
        content is set to ``_MARKER_PHRASE``.

        :param data: raw file content
        :param datas_fname: file name of the attachment
        :param file_type: file type as passed on to the parent ``_index``
        :returns: tuple ``(mimetype, content)``
        """
        mimetype, content = super(IrAttachment, self)._index(
            data, datas_fname, file_type)
        if not content or content == 'image':
            has_synchr_param = self.env['ir.config_parameter'].get_param(
                'document_ocr.synchronous', 'False') == 'True'
            has_force_flag = self.env.context.get('document_ocr_force')
            if has_synchr_param or has_force_flag:
                content = self._index_ocr(mimetype, data, datas_fname,
                                          file_type)
            else:
                content = _MARKER_PHRASE

        return mimetype, content

    @api.model
    def _index_ocr(self, mimetype, data, datas_fname, file_type):
        """Run tesseract on the given file content.

        If this model has a method named ``_index_ocr_get_data_<subtype>``
        (``<subtype>`` being the part of ``mimetype`` after the first
        slash), it is called to produce the image handed to tesseract.
        Otherwise the content is opened with PIL and saved as TIFF.

        :param mimetype: mimetype of the content; may lack a slash or be
            empty, in which case the PIL code path is used
        :param data: raw file content
        :param datas_fname: file name, passed on to the converter method
        :param file_type: file type, passed on to the converter method
        :returns: tesseract's standard output, or ``None`` when no image
            could be produced from ``data``
        """
        # NOTE(review): this fallback differs from the value 300 shipped in
        # data/ir_config_parameter.xml -- confirm which one is intended.
        dpi = int(
            self.env['ir.config_parameter'].get_param(
                'document_ocr.dpi', '500'))
        # partition never fails, unlike unpacking the result of split()
        # for a mimetype without a slash
        sub_type = (mimetype or '').partition('/')[2]
        converter = getattr(
            self, '_index_ocr_get_data_%s' % sub_type, None)
        if converter:
            image_data = converter(data, datas_fname, file_type, dpi)
            if image_data is None:
                # the converter could not run and has logged the reason
                return None
        else:
            image_data = StringIO()
            try:
                Image.open(StringIO(data)).save(image_data, 'tiff',
                                                dpi=(dpi, dpi))
            except IOError:
                _logger.exception('Failed to OCR image')
                return None
        process = subprocess.Popen(
            ['tesseract', 'stdin', 'stdout'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout, stderr = process.communicate(image_data.getvalue())
        if stderr:
            _logger.error('Error during OCR: %s', stderr)
        return stdout

    @api.model
    def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi):
        """Convert a PDF to a single PNG image using ``convert``.

        :param data: raw PDF content, fed to ``convert`` on stdin
        :param datas_fname: unused here
        :param file_type: unused here
        :param dpi: value passed to ``convert -density``
        :returns: ``StringIO`` holding the output of ``convert``, or
            ``None`` when the ``convert`` program could not be started
        """
        try:
            process = subprocess.Popen(
                ['convert', '-density', str(dpi), '-', '-append', 'png32:-'],
                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except OSError:
            # Popen raises OSError when the program cannot be executed,
            # e.g. because imagemagick is not installed
            _logger.exception('Failed to run convert for OCR on a PDF')
            return None
        stdout, stderr = process.communicate(data)
        if stderr:
            _logger.error('Error converting PDF to image: %s', stderr)
        return StringIO(stdout)

    @api.model
    def _ocr_cron(self):
        """Run OCR on all attachments still carrying the marker phrase.

        Attachments without ``datas`` are skipped. For the others,
        ``_index`` is called with ``document_ocr_force`` set in the context
        and its result is written to ``file_type`` and ``index_content``.
        """
        for this in self.with_context(document_ocr_force=True).search([
                ('index_content', '=', _MARKER_PHRASE),
        ]):
            if not this.datas:
                continue
            mimetype, index_content = this._index(
                this.datas.decode('base64'), this.datas_fname, this.file_type)
            this.write({
                'file_type': mimetype,
                'index_content': index_content,
            })
|
BIN
document_ocr/static/description/icon.png
Normal file
BIN
document_ocr/static/description/icon.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 9.2 KiB |
4
document_ocr/tests/__init__.py
Normal file
4
document_ocr/tests/__init__.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# © 2016 Therp BV <http://therp.nl>
|
||||||
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||||
|
from . import test_document_ocr
|
48
document_ocr/tests/test_document_ocr.py
Normal file
48
document_ocr/tests/test_document_ocr.py
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# © 2016 Therp BV <http://therp.nl>
|
||||||
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||||
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
from StringIO import StringIO
|
||||||
|
from openerp.tests.common import TransactionCase
|
||||||
|
from openerp.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE
|
||||||
|
|
||||||
|
|
||||||
|
class TestDocumentOcr(TransactionCase):
    """Check OCR on images and PDFs, the cron job and the failure path."""

    def _set_synchronous(self, value):
        """Store ``value`` in the ``document_ocr.synchronous`` parameter."""
        self.env['ir.config_parameter'].set_param(
            'document_ocr.synchronous', value)

    def _indexed_content(self, payload, filename):
        """Return the content part of ``ir.attachment._index``'s result."""
        outcome = self.env['ir.attachment']._index(payload, filename, None)
        return outcome[1]

    def test_document_ocr(self):
        """Walk through all supported scenarios in one transaction."""
        self._set_synchronous('True')
        picture = Image.new('RGB', (200, 30))
        font = ImageFont.truetype(
            '/usr/share/fonts/truetype/inconsolata/Inconsolata.otf', 24)
        ImageDraw.Draw(picture).text((3, 3), "Hello world", font=font)

        # a plain image is recognized directly
        png_buffer = StringIO()
        picture.save(png_buffer, 'png')
        recognized = self._indexed_content(png_buffer.getvalue(), 'test.png')
        self.assertEqual(recognized.strip(), 'Hello world')

        # the same picture wrapped in a pdf works as well
        pdf_buffer = StringIO()
        picture.save(pdf_buffer, 'pdf', resolution=300)
        recognized = self._indexed_content(pdf_buffer.getvalue(), 'test.pdf')
        self.assertEqual(recognized.strip(), 'Hello world')

        # in asynchronous mode, the attachment waits for the cron
        self._set_synchronous('False')
        attachment = self.env['ir.attachment'].create({
            'name': 'testattachment',
            'datas': pdf_buffer.getvalue().encode('base64'),
        })
        self.assertEqual(attachment.index_content, _MARKER_PHRASE)
        attachment._ocr_cron()
        self.assertEqual(attachment.index_content.strip(), 'Hello world')

        # an image that cannot be read yields no content
        self._set_synchronous('True')
        palm_buffer = StringIO()
        Image.new('1', (200, 30)).save(palm_buffer, 'Palm')
        recognized = self._indexed_content(
            palm_buffer.getvalue(), 'test.palm')
        self.assertEqual(recognized, None)
|
Loading…
Reference in New Issue
Block a user