mirror of
https://github.com/OCA/knowledge.git
synced 2025-07-20 10:16:45 -06:00
[ADD] document_ocr
This commit is contained in:
parent
46811e3e5a
commit
d4a07e88c5
@ -1,4 +1,5 @@
|
||||
sudo: false
|
||||
sudo: required
|
||||
dist: trusty
|
||||
cache: pip
|
||||
|
||||
addons:
|
||||
@ -6,6 +7,9 @@ addons:
|
||||
packages:
|
||||
- expect-dev # provides unbuffer utility
|
||||
- python-lxml # because pip installation is slow
|
||||
- tesseract-ocr # document_ocr
|
||||
- imagemagick # document_ocr
|
||||
- fonts-inconsolata # document_ocr (for tests only)
|
||||
|
||||
language: python
|
||||
|
||||
|
86
document_ocr/README.rst
Normal file
86
document_ocr/README.rst
Normal file
@ -0,0 +1,86 @@
|
||||
.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg
|
||||
:target: http://www.gnu.org/licenses/agpl-3.0-standalone.html
|
||||
:alt: License: AGPL-3
|
||||
|
||||
=================
|
||||
OCR for documents
|
||||
=================
|
||||
|
||||
This module was written to make uploaded documents, for example scans, searchable by running OCR on them.
|
||||
|
||||
It supports PDFs and all image formats that `Pillow can read <http://pillow.readthedocs.io/en/3.2.x/handbook/image-file-formats.html>`_.
|
||||
|
||||
Installation
|
||||
============
|
||||
|
||||
To install this module, you need to:
|
||||
|
||||
#. install tesseract and the language(s) your documents use
|
||||
#. if you want to support OCR on PDFs, install imagemagick
|
||||
#. install the module itself
|
||||
|
||||
On a Debian or Ubuntu system you would typically run::
|
||||
|
||||
$ sudo apt-get install tesseract-ocr imagemagick
|
||||
|
||||
|
||||
Configuration
|
||||
=============
|
||||
|
||||
To configure this module, go to:
|
||||
|
||||
#. Settings/Technical/Parameters/System parameters and review the parameters with names document_ocr.*
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
By default, character recognition is done asynchronously by a cronjob at night.
|
||||
This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish.
|
||||
The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ``Settings``.
|
||||
In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``.
|
||||
|
||||
.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas
|
||||
:alt: Try me on Runbot
|
||||
:target: https://runbot.odoo-community.org/runbot/118/8.0
|
||||
|
||||
Bug Tracker
|
||||
===========
|
||||
|
||||
Bugs are tracked on `GitHub Issues <https://github.com/OCA/knowledge/issues>`_.
|
||||
In case of trouble, please check there if your issue has already been reported.
|
||||
If you spotted it first, help us smash it by providing detailed and welcome feedback.
|
||||
|
||||
Credits
|
||||
=======
|
||||
|
||||
The actual work
|
||||
---------------
|
||||
|
||||
* `tesseract <https://github.com/tesseract-ocr>`_
|
||||
|
||||
Images
|
||||
------
|
||||
|
||||
* Odoo Community Association: `Icon <https://github.com/OCA/maintainer-tools/blob/master/template/module/static/description/icon.svg>`_.
|
||||
|
||||
Contributors
|
||||
------------
|
||||
|
||||
* Holger Brunn <hbrunn@therp.nl>
|
||||
|
||||
Do not contact contributors directly about help with questions or problems concerning this addon, but use the `community mailing list <mailto:community@mail.odoo.com>`_ or the `appropriate specialized mailing list <https://odoo-community.org/groups>`_ for help, and the bug tracker linked in `Bug Tracker`_ above for technical issues.
|
||||
|
||||
Maintainer
|
||||
----------
|
||||
|
||||
.. image:: https://odoo-community.org/logo.png
|
||||
:alt: Odoo Community Association
|
||||
:target: https://odoo-community.org
|
||||
|
||||
This module is maintained by the OCA.
|
||||
|
||||
OCA, or the Odoo Community Association, is a nonprofit organization whose
|
||||
mission is to support the collaborative development of Odoo features and
|
||||
promote its widespread use.
|
||||
|
||||
To contribute to this module, please visit https://odoo-community.org.
|
4
document_ocr/__init__.py
Normal file
4
document_ocr/__init__.py
Normal file
@ -0,0 +1,4 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# © 2016 Therp BV <http://therp.nl>
|
||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||
from . import models
|
23
document_ocr/__openerp__.py
Normal file
23
document_ocr/__openerp__.py
Normal file
@ -0,0 +1,23 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# © 2016 Therp BV <http://therp.nl>
|
||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||
{
|
||||
"name": "OCR for documents",
|
||||
"version": "8.0.1.0.0",
|
||||
"author": "Therp BV,Odoo Community Association (OCA)",
|
||||
"license": "AGPL-3",
|
||||
"category": "Knowledge Management",
|
||||
"summary": "Run character recognition on uploaded files",
|
||||
"depends": [
|
||||
'document',
|
||||
],
|
||||
"data": [
|
||||
"data/ir_cron.xml",
|
||||
"data/ir_config_parameter.xml",
|
||||
],
|
||||
"external_dependencies": {
|
||||
'bin': [
|
||||
'tesseract',
|
||||
],
|
||||
},
|
||||
}
|
13
document_ocr/data/ir_config_parameter.xml
Normal file
13
document_ocr/data/ir_config_parameter.xml
Normal file
@ -0,0 +1,13 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<openerp>
|
||||
<data noupdate="1">
|
||||
<record id="param_synchronous" model="ir.config_parameter">
|
||||
<field name="key">document_ocr.synchronous</field>
|
||||
<field name="value">False</field>
|
||||
</record>
|
||||
<record id="param_dpi" model="ir.config_parameter">
|
||||
<field name="key">document_ocr.dpi</field>
|
||||
<field name="value">300</field>
|
||||
</record>
|
||||
</data>
|
||||
</openerp>
|
13
document_ocr/data/ir_cron.xml
Normal file
13
document_ocr/data/ir_cron.xml
Normal file
@ -0,0 +1,13 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<openerp>
|
||||
<data noupdate="1">
|
||||
<record id="cron" model="ir.cron">
|
||||
<field name="name">Run OCR on uploaded documents</field>
|
||||
<field name="interval_type">days</field>
|
||||
<field name="interval_number">1</field>
|
||||
<field name="model">ir.attachment</field>
|
||||
<field name="function">_ocr_cron</field>
|
||||
<field name="numbercall">-1</field>
|
||||
</record>
|
||||
</data>
|
||||
</openerp>
|
4
document_ocr/models/__init__.py
Normal file
4
document_ocr/models/__init__.py
Normal file
@ -0,0 +1,4 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# © 2016 Therp BV <http://therp.nl>
|
||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||
from . import ir_attachment
|
84
document_ocr/models/ir_attachment.py
Normal file
84
document_ocr/models/ir_attachment.py
Normal file
@ -0,0 +1,84 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# © 2016 Therp BV <http://therp.nl>
|
||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||
import logging
|
||||
import subprocess
|
||||
from PIL import Image
|
||||
from StringIO import StringIO
|
||||
from openerp import api, models
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
_MARKER_PHRASE = '[[waiting for OCR]]'
|
||||
|
||||
|
||||
class IrAttachment(models.Model):
    """Make attachments searchable by running OCR on images and PDFs.

    Attachments for which the inherited indexer finds no text are either
    recognized right away or tagged with ``_MARKER_PHRASE`` so that
    :meth:`_ocr_cron` can pick them up later.
    """

    _inherit = 'ir.attachment'

    @api.model
    def _index(self, data, datas_fname, file_type):
        """Index an attachment, falling back to OCR if no text was found.

        OCR is run immediately when the ``document_ocr.synchronous``
        parameter equals ``'True'`` or when the context carries a truthy
        ``document_ocr_force``; otherwise the content is set to
        ``_MARKER_PHRASE``.

        :param data: raw file content
        :param datas_fname: file name of the attachment
        :param file_type: file type as passed on to the inherited indexer
        :returns: tuple ``(mimetype, content)``
        """
        mimetype, content = super(IrAttachment, self)._index(
            data, datas_fname, file_type)
        if not content or content == 'image':
            has_synchr_param = self.env['ir.config_parameter'].get_param(
                'document_ocr.synchronous', 'False') == 'True'
            has_force_flag = self.env.context.get('document_ocr_force')
            if has_synchr_param or has_force_flag:
                content = self._index_ocr(mimetype, data, datas_fname,
                                          file_type)
            else:
                content = _MARKER_PHRASE

        return mimetype, content

    @api.model
    def _index_ocr(self, mimetype, data, datas_fname, file_type):
        """Run tesseract on ``data`` and return the recognized text.

        If the model has a method ``_index_ocr_get_data_<subtype>`` for the
        mimetype's subtype, that method supplies the image to recognize.
        Otherwise ``data`` is opened with PIL and converted to TIFF.

        :returns: tesseract's standard output, or ``None`` if no image
            could be produced from ``data``
        """
        # NOTE(review): this fallback differs from the value of 300 shipped
        # in data/ir_config_parameter.xml - confirm which one is intended
        dpi = int(
            self.env['ir.config_parameter'].get_param(
                'document_ocr.dpi', '500'))
        # partition() also copes with an empty mimetype or one without a
        # slash, where unpacking the result of split('/', 1) would raise
        sub_type = (mimetype or '').partition('/')[2]
        converter = getattr(
            self, '_index_ocr_get_data_%s' % sub_type, None)
        if converter is not None:
            image_data = converter(data, datas_fname, file_type, dpi)
            if image_data is None:
                # the converter could not produce an image, nothing to OCR
                return None
        else:
            image_data = StringIO()
            try:
                Image.open(StringIO(data)).save(image_data, 'tiff',
                                                dpi=(dpi, dpi))
            except IOError:
                _logger.exception('Failed to OCR image')
                return None
        process = subprocess.Popen(
            ['tesseract', 'stdin', 'stdout'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout, stderr = process.communicate(image_data.getvalue())
        if stderr:
            _logger.error('Error during OCR: %s', stderr)
        return stdout

    @api.model
    def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi):
        """Render a PDF to a single PNG using ImageMagick's ``convert``.

        The PDF is rasterized at ``dpi`` (``-density``) and its pages are
        joined into one image (``-append``).

        :returns: a ``StringIO`` holding the PNG data, or ``None`` if
            ``convert`` could not be started
        """
        try:
            process = subprocess.Popen(
                ['convert', '-density', str(dpi), '-', '-append', 'png32:-'],
                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except OSError:
            # ImageMagick is only an optional dependency, so a missing
            # binary must not break uploads or abort the whole cron run
            _logger.exception(
                'Failed to run convert, is imagemagick installed?')
            return None
        stdout, stderr = process.communicate(data)
        if stderr:
            _logger.error('Error converting PDF to image: %s', stderr)
        return StringIO(stdout)

    @api.model
    def _ocr_cron(self):
        """Run OCR on all attachments still carrying ``_MARKER_PHRASE``.

        Attachments without data are skipped. The context flag
        ``document_ocr_force`` makes :meth:`_index` recognize immediately,
        whatever the ``document_ocr.synchronous`` parameter says.
        """
        for this in self.with_context(document_ocr_force=True).search([
                ('index_content', '=', _MARKER_PHRASE),
        ]):
            if not this.datas:
                continue
            file_type, index_content = this._index(
                this.datas.decode('base64'), this.datas_fname, this.file_type)
            this.write({
                'file_type': file_type,
                'index_content': index_content,
            })
|
BIN
document_ocr/static/description/icon.png
Normal file
BIN
document_ocr/static/description/icon.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 9.2 KiB |
4
document_ocr/tests/__init__.py
Normal file
4
document_ocr/tests/__init__.py
Normal file
@ -0,0 +1,4 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# © 2016 Therp BV <http://therp.nl>
|
||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||
from . import test_document_ocr
|
48
document_ocr/tests/test_document_ocr.py
Normal file
48
document_ocr/tests/test_document_ocr.py
Normal file
@ -0,0 +1,48 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# © 2016 Therp BV <http://therp.nl>
|
||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
from StringIO import StringIO
|
||||
from openerp.tests.common import TransactionCase
|
||||
from openerp.addons.document_ocr.models.ir_attachment import _MARKER_PHRASE
|
||||
|
||||
|
||||
class TestDocumentOcr(TransactionCase):
    """Check OCR indexing of images and PDFs, the cronjob and bad input."""

    def test_document_ocr(self):
        """Cover synchronous OCR, deferred OCR and an unreadable file."""
        config = self.env['ir.config_parameter']
        attachments = self.env['ir.attachment']
        config.set_param('document_ocr.synchronous', 'True')

        # render the reference text into a small picture
        font = ImageFont.truetype(
            '/usr/share/fonts/truetype/inconsolata/Inconsolata.otf', 24)
        picture = Image.new('RGB', (200, 30))
        ImageDraw.Draw(picture).text((3, 3), "Hello world", font=font)

        # a plain image is recognized directly
        png_buffer = StringIO()
        picture.save(png_buffer, 'png')
        content = attachments._index(
            png_buffer.getvalue(), 'test.png', None)[1]
        self.assertEqual(content.strip(), 'Hello world')

        # the same picture wrapped in a pdf has to work as well
        pdf_buffer = StringIO()
        picture.save(pdf_buffer, 'pdf', resolution=300)
        content = attachments._index(
            pdf_buffer.getvalue(), 'test.pdf', None)[1]
        self.assertEqual(content.strip(), 'Hello world')

        # in asynchronous mode, the attachment is only marked at first and
        # recognized once the cronjob has run
        config.set_param('document_ocr.synchronous', 'False')
        values = {
            'name': 'testattachment',
            'datas': pdf_buffer.getvalue().encode('base64'),
        }
        attachment = attachments.create(values)
        self.assertEqual(attachment.index_content, _MARKER_PHRASE)
        attachment._ocr_cron()
        self.assertEqual(attachment.index_content.strip(), 'Hello world')

        # an image format that cannot be read yields no content at all
        config.set_param('document_ocr.synchronous', 'True')
        palm_buffer = StringIO()
        Image.new('1', (200, 30)).save(palm_buffer, 'Palm')
        content = attachments._index(
            palm_buffer.getvalue(), 'test.palm', None)[1]
        self.assertEqual(content, None)
|
Loading…
Reference in New Issue
Block a user