mirror of
https://github.com/OCA/knowledge.git
synced 2025-07-27 19:08:42 -06:00
[IMP] document_ocr: pre-commit execution
This commit is contained in:
parent
a00735b210
commit
f1f13f1e8b
@ -1,4 +1,3 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# © 2016 Therp BV <http://therp.nl>
|
||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||
from . import models
|
||||
|
@ -1,23 +1,22 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# © 2016 Therp BV <http://therp.nl>
|
||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||
{
|
||||
"name": "OCR for documents",
|
||||
"version": "8.0.1.0.0",
|
||||
"version": "16.0.1.0.0",
|
||||
"author": "Therp BV,Odoo Community Association (OCA)",
|
||||
"license": "AGPL-3",
|
||||
"category": "Knowledge Management",
|
||||
"summary": "Run character recognition on uploaded files",
|
||||
"depends": [
|
||||
'document',
|
||||
"document",
|
||||
],
|
||||
"data": [
|
||||
"data/ir_cron.xml",
|
||||
"data/ir_config_parameter.xml",
|
||||
],
|
||||
"external_dependencies": {
|
||||
'bin': [
|
||||
'tesseract',
|
||||
"bin": [
|
||||
"tesseract",
|
||||
],
|
||||
},
|
||||
}
|
@ -1,6 +1,5 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<openerp>
|
||||
<data noupdate="1">
|
||||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<odoo noupdate="1">
|
||||
<record id="param_synchronous" model="ir.config_parameter">
|
||||
<field name="key">document_ocr.synchronous</field>
|
||||
<field name="value">False</field>
|
||||
@ -9,5 +8,4 @@
|
||||
<field name="key">document_ocr.dpi</field>
|
||||
<field name="value">300</field>
|
||||
</record>
|
||||
</data>
|
||||
</openerp>
|
||||
</odoo>
|
||||
|
@ -1,6 +1,5 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<openerp>
|
||||
<data noupdate="1">
|
||||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<odoo noupdate="1">
|
||||
<record id="cron" model="ir.cron">
|
||||
<field name="name">Run OCR on uploaded documents</field>
|
||||
<field name="interval_type">days</field>
|
||||
@ -10,5 +9,4 @@
|
||||
<field name="numbercall">-1</field>
|
||||
<field name="args">(100,)</field>
|
||||
</record>
|
||||
</data>
|
||||
</openerp>
|
||||
</odoo>
|
||||
|
@ -1,4 +1,3 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# © 2016 Therp BV <http://therp.nl>
|
||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||
from . import ir_attachment
|
||||
|
@ -1,30 +1,35 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# © 2016 Therp BV <http://therp.nl>
|
||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||
import logging
|
||||
import subprocess
|
||||
|
||||
from PIL import Image
|
||||
from StringIO import StringIO
|
||||
from openerp import api, models
|
||||
|
||||
from odoo import api, models
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
_MARKER_PHRASE = '[[waiting for OCR]]'
|
||||
_MARKER_PHRASE = "[[waiting for OCR]]"
|
||||
|
||||
|
||||
class IrAttachment(models.Model):
|
||||
_inherit = 'ir.attachment'
|
||||
_inherit = "ir.attachment"
|
||||
|
||||
@api.model
|
||||
def _index(self, data, datas_fname, file_type):
|
||||
mimetype, content = super(IrAttachment, self)._index(
|
||||
data, datas_fname, file_type)
|
||||
if data and mimetype and (not content or content == 'image'):
|
||||
has_synchr_param = self.env['ir.config_parameter'].get_param(
|
||||
'document_ocr.synchronous', 'False') == 'True'
|
||||
has_force_flag = self.env.context.get('document_ocr_force')
|
||||
data, datas_fname, file_type
|
||||
)
|
||||
if data and mimetype and (not content or content == "image"):
|
||||
has_synchr_param = (
|
||||
self.env["ir.config_parameter"].get_param(
|
||||
"document_ocr.synchronous", "False"
|
||||
)
|
||||
== "True"
|
||||
)
|
||||
has_force_flag = self.env.context.get("document_ocr_force")
|
||||
if has_synchr_param or has_force_flag:
|
||||
content = self._index_ocr(mimetype, data, datas_fname,
|
||||
file_type)
|
||||
content = self._index_ocr(mimetype, data, datas_fname, file_type)
|
||||
else:
|
||||
content = _MARKER_PHRASE
|
||||
|
||||
@ -32,56 +37,62 @@ class IrAttachment(models.Model):
|
||||
|
||||
@api.model
|
||||
def _index_ocr(self, mimetype, data, datas_fname, file_type):
|
||||
dpi = int(
|
||||
self.env['ir.config_parameter'].get_param(
|
||||
'document_ocr.dpi', '500'))
|
||||
if '/' not in mimetype:
|
||||
_logger.warning('Invalid mimetype %s', mimetype)
|
||||
dpi = int(self.env["ir.config_parameter"].get_param("document_ocr.dpi", "500"))
|
||||
if "/" not in mimetype:
|
||||
_logger.warning("Invalid mimetype %s", mimetype)
|
||||
return None
|
||||
top_type, sub_type = mimetype.split('/', 1)
|
||||
if hasattr(self, '_index_ocr_get_data_%s' % sub_type):
|
||||
image_data = getattr(self, '_index_ocr_get_data_%s' % sub_type)(
|
||||
data, datas_fname, file_type, dpi)
|
||||
top_type, sub_type = mimetype.split("/", 1)
|
||||
if hasattr(self, "_index_ocr_get_data_%s" % sub_type):
|
||||
image_data = getattr(self, "_index_ocr_get_data_%s" % sub_type)(
|
||||
data, datas_fname, file_type, dpi
|
||||
)
|
||||
else:
|
||||
image_data = StringIO()
|
||||
try:
|
||||
Image.open(StringIO(data)).save(image_data, 'png',
|
||||
dpi=(dpi, dpi))
|
||||
Image.open(StringIO(data)).save(image_data, "png", dpi=(dpi, dpi))
|
||||
except IOError:
|
||||
_logger.exception('Failed to OCR image')
|
||||
_logger.exception("Failed to OCR image")
|
||||
return None
|
||||
process = subprocess.Popen(
|
||||
['tesseract', 'stdin', 'stdout'],
|
||||
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
|
||||
["tesseract", "stdin", "stdout"],
|
||||
stdin=subprocess.PIPE,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
stdout, stderr = process.communicate(image_data.getvalue())
|
||||
if process.returncode:
|
||||
_logger.error('Error during OCR: %s', stderr)
|
||||
_logger.error("Error during OCR: %s", stderr)
|
||||
return stdout
|
||||
|
||||
@api.model
|
||||
def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi):
|
||||
process = subprocess.Popen(
|
||||
['convert', '-density', str(dpi), '-', '-append', 'png32:-'],
|
||||
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
|
||||
["convert", "-density", str(dpi), "-", "-append", "png32:-"],
|
||||
stdin=subprocess.PIPE,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
stdout, stderr = process.communicate(data)
|
||||
if stderr:
|
||||
_logger.error('Error converting to PDF: %s', stderr)
|
||||
_logger.error("Error converting to PDF: %s", stderr)
|
||||
return StringIO(stdout)
|
||||
|
||||
@api.model
|
||||
def _ocr_cron(self, limit=0):
|
||||
for this in self.with_context(document_ocr_force=True).search([
|
||||
('index_content', '=', _MARKER_PHRASE),
|
||||
], limit=limit):
|
||||
for this in self.with_context(document_ocr_force=True).search(
|
||||
[
|
||||
("index_content", "=", _MARKER_PHRASE),
|
||||
],
|
||||
limit=limit,
|
||||
):
|
||||
if not this.datas:
|
||||
continue
|
||||
file_type, index_content = this._index(
|
||||
this.datas.decode('base64'), this.datas_fname, this.file_type)
|
||||
this.write({
|
||||
'file_type': file_type,
|
||||
'index_content': index_content,
|
||||
})
|
||||
this.datas.decode("base64"), this.datas_fname, this.file_type
|
||||
)
|
||||
this.write(
|
||||
{
|
||||
"file_type": file_type,
|
||||
"index_content": index_content,
|
||||
}
|
||||
)
|
||||
|
@ -1,4 +1,3 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# © 2016 Therp BV <http://therp.nl>
|
||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||
from . import test_document_ocr
|
||||
|
@ -1,58 +1,65 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# © 2016 Therp BV <http://therp.nl>
|
||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
from StringIO import StringIO
|
||||
from openerp.tests.common import TransactionCase
|
||||
|
||||
from odoo.tests.common import TransactionCase
|
||||
from odoo.tools.misc import mute_logger
|
||||
|
||||
from ..models.ir_attachment import _MARKER_PHRASE
|
||||
from openerp.tools.misc import mute_logger
|
||||
|
||||
|
||||
class TestDocumentOcr(TransactionCase):
|
||||
def test_document_ocr(self):
|
||||
self.env['ir.config_parameter'].set_param(
|
||||
'document_ocr.synchronous', 'True')
|
||||
test_image = Image.new('RGB', (200, 30))
|
||||
self.env["ir.config_parameter"].set_param("document_ocr.synchronous", "True")
|
||||
test_image = Image.new("RGB", (200, 30))
|
||||
draw = ImageDraw.Draw(test_image)
|
||||
draw.text((3, 3), "Hello world", font=ImageFont.truetype(
|
||||
'/usr/share/fonts/truetype/inconsolata/Inconsolata.otf', 24))
|
||||
draw.text(
|
||||
(3, 3),
|
||||
"Hello world",
|
||||
font=ImageFont.truetype(
|
||||
"/usr/share/fonts/truetype/inconsolata/Inconsolata.otf", 24
|
||||
),
|
||||
)
|
||||
# test a plain image
|
||||
data = StringIO()
|
||||
test_image.save(data, 'png')
|
||||
result = self.env['ir.attachment']._index(
|
||||
data.getvalue(), 'test.png', None)
|
||||
self.assertEqual(result[1].strip(), 'Hello world')
|
||||
test_image.save(data, "png")
|
||||
result = self.env["ir.attachment"]._index(data.getvalue(), "test.png", None)
|
||||
self.assertEqual(result[1].strip(), "Hello world")
|
||||
# should also work for pdfs if supported, protect against
|
||||
# ancient pillows
|
||||
if hasattr(Image, 'registered_extensions') and\
|
||||
'PDF' in Image.registered_extensions().values():
|
||||
if (
|
||||
hasattr(Image, "registered_extensions")
|
||||
and "PDF" in Image.registered_extensions().values()
|
||||
):
|
||||
data = StringIO()
|
||||
test_image.save(data, 'pdf', resolution=300)
|
||||
result = self.env['ir.attachment']._index(
|
||||
data.getvalue(), 'test.pdf', None)
|
||||
self.assertEqual(result[1].strip(), 'Hello world')
|
||||
test_image.save(data, "pdf", resolution=300)
|
||||
result = self.env["ir.attachment"]._index(data.getvalue(), "test.pdf", None)
|
||||
self.assertEqual(result[1].strip(), "Hello world")
|
||||
# check cron
|
||||
self.env['ir.config_parameter'].set_param(
|
||||
'document_ocr.synchronous', 'False')
|
||||
attachment = self.env['ir.attachment'].create({
|
||||
'name': 'testattachment',
|
||||
'datas': data.getvalue().encode('base64'),
|
||||
})
|
||||
self.env["ir.config_parameter"].set_param("document_ocr.synchronous", "False")
|
||||
attachment = self.env["ir.attachment"].create(
|
||||
{
|
||||
"name": "testattachment",
|
||||
"datas": data.getvalue().encode("base64"),
|
||||
}
|
||||
)
|
||||
self.assertEqual(attachment.index_content, _MARKER_PHRASE)
|
||||
attachment._ocr_cron()
|
||||
self.assertEqual(attachment.index_content.strip(), 'Hello world')
|
||||
self.assertEqual(attachment.index_content.strip(), "Hello world")
|
||||
# and for an unreadable image, we expect an error
|
||||
if hasattr(Image, 'registered_extensions') and\
|
||||
'PALM' in Image.registered_extensions().values():
|
||||
self.env['ir.config_parameter'].set_param(
|
||||
'document_ocr.synchronous', 'True')
|
||||
if (
|
||||
hasattr(Image, "registered_extensions")
|
||||
and "PALM" in Image.registered_extensions().values()
|
||||
):
|
||||
self.env["ir.config_parameter"].set_param(
|
||||
"document_ocr.synchronous", "True"
|
||||
)
|
||||
data = StringIO()
|
||||
test_image = Image.new('1', (200, 30))
|
||||
test_image.save(data, 'Palm')
|
||||
with mute_logger(
|
||||
'openerp.addons.document_ocr.models.ir_attachment'
|
||||
):
|
||||
result = self.env['ir.attachment']._index(
|
||||
data.getvalue(), 'test.palm', None
|
||||
test_image = Image.new("1", (200, 30))
|
||||
test_image.save(data, "Palm")
|
||||
with mute_logger("openerp.addons.document_ocr.models.ir_attachment"):
|
||||
result = self.env["ir.attachment"]._index(
|
||||
data.getvalue(), "test.palm", None
|
||||
)
|
||||
self.assertEqual(result[1], None)
|
||||
|
1
setup/document_ocr/odoo/addons/document_ocr
Symbolic link
1
setup/document_ocr/odoo/addons/document_ocr
Symbolic link
@ -0,0 +1 @@
|
||||
../../../../document_ocr
|
6
setup/document_ocr/setup.py
Normal file
6
setup/document_ocr/setup.py
Normal file
@ -0,0 +1,6 @@
|
||||
import setuptools
|
||||
|
||||
setuptools.setup(
|
||||
setup_requires=['setuptools-odoo'],
|
||||
odoo_addon=True,
|
||||
)
|
Loading…
Reference in New Issue
Block a user