mirror of
https://github.com/OCA/knowledge.git
synced 2025-07-28 03:16:29 -06:00
[IMP] document_ocr: pre-commit execution
This commit is contained in:
parent
a00735b210
commit
f1f13f1e8b
@ -1,4 +1,3 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
# © 2016 Therp BV <http://therp.nl>
|
# © 2016 Therp BV <http://therp.nl>
|
||||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||||
from . import models
|
from . import models
|
||||||
|
@ -1,23 +1,22 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
# © 2016 Therp BV <http://therp.nl>
|
# © 2016 Therp BV <http://therp.nl>
|
||||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||||
{
|
{
|
||||||
"name": "OCR for documents",
|
"name": "OCR for documents",
|
||||||
"version": "8.0.1.0.0",
|
"version": "16.0.1.0.0",
|
||||||
"author": "Therp BV,Odoo Community Association (OCA)",
|
"author": "Therp BV,Odoo Community Association (OCA)",
|
||||||
"license": "AGPL-3",
|
"license": "AGPL-3",
|
||||||
"category": "Knowledge Management",
|
"category": "Knowledge Management",
|
||||||
"summary": "Run character recognition on uploaded files",
|
"summary": "Run character recognition on uploaded files",
|
||||||
"depends": [
|
"depends": [
|
||||||
'document',
|
"document",
|
||||||
],
|
],
|
||||||
"data": [
|
"data": [
|
||||||
"data/ir_cron.xml",
|
"data/ir_cron.xml",
|
||||||
"data/ir_config_parameter.xml",
|
"data/ir_config_parameter.xml",
|
||||||
],
|
],
|
||||||
"external_dependencies": {
|
"external_dependencies": {
|
||||||
'bin': [
|
"bin": [
|
||||||
'tesseract',
|
"tesseract",
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
}
|
}
|
@ -1,6 +1,5 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8" ?>
|
||||||
<openerp>
|
<odoo noupdate="1">
|
||||||
<data noupdate="1">
|
|
||||||
<record id="param_synchronous" model="ir.config_parameter">
|
<record id="param_synchronous" model="ir.config_parameter">
|
||||||
<field name="key">document_ocr.synchronous</field>
|
<field name="key">document_ocr.synchronous</field>
|
||||||
<field name="value">False</field>
|
<field name="value">False</field>
|
||||||
@ -9,5 +8,4 @@
|
|||||||
<field name="key">document_ocr.dpi</field>
|
<field name="key">document_ocr.dpi</field>
|
||||||
<field name="value">300</field>
|
<field name="value">300</field>
|
||||||
</record>
|
</record>
|
||||||
</data>
|
</odoo>
|
||||||
</openerp>
|
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8" ?>
|
||||||
<openerp>
|
<odoo noupdate="1">
|
||||||
<data noupdate="1">
|
|
||||||
<record id="cron" model="ir.cron">
|
<record id="cron" model="ir.cron">
|
||||||
<field name="name">Run OCR on uploaded documents</field>
|
<field name="name">Run OCR on uploaded documents</field>
|
||||||
<field name="interval_type">days</field>
|
<field name="interval_type">days</field>
|
||||||
@ -10,5 +9,4 @@
|
|||||||
<field name="numbercall">-1</field>
|
<field name="numbercall">-1</field>
|
||||||
<field name="args">(100,)</field>
|
<field name="args">(100,)</field>
|
||||||
</record>
|
</record>
|
||||||
</data>
|
</odoo>
|
||||||
</openerp>
|
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
# © 2016 Therp BV <http://therp.nl>
|
# © 2016 Therp BV <http://therp.nl>
|
||||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||||
from . import ir_attachment
|
from . import ir_attachment
|
||||||
|
@ -1,30 +1,35 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
# © 2016 Therp BV <http://therp.nl>
|
# © 2016 Therp BV <http://therp.nl>
|
||||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||||
import logging
|
import logging
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
from openerp import api, models
|
|
||||||
|
from odoo import api, models
|
||||||
|
|
||||||
_logger = logging.getLogger(__name__)
|
_logger = logging.getLogger(__name__)
|
||||||
_MARKER_PHRASE = '[[waiting for OCR]]'
|
_MARKER_PHRASE = "[[waiting for OCR]]"
|
||||||
|
|
||||||
|
|
||||||
class IrAttachment(models.Model):
|
class IrAttachment(models.Model):
|
||||||
_inherit = 'ir.attachment'
|
_inherit = "ir.attachment"
|
||||||
|
|
||||||
@api.model
|
@api.model
|
||||||
def _index(self, data, datas_fname, file_type):
|
def _index(self, data, datas_fname, file_type):
|
||||||
mimetype, content = super(IrAttachment, self)._index(
|
mimetype, content = super(IrAttachment, self)._index(
|
||||||
data, datas_fname, file_type)
|
data, datas_fname, file_type
|
||||||
if data and mimetype and (not content or content == 'image'):
|
)
|
||||||
has_synchr_param = self.env['ir.config_parameter'].get_param(
|
if data and mimetype and (not content or content == "image"):
|
||||||
'document_ocr.synchronous', 'False') == 'True'
|
has_synchr_param = (
|
||||||
has_force_flag = self.env.context.get('document_ocr_force')
|
self.env["ir.config_parameter"].get_param(
|
||||||
|
"document_ocr.synchronous", "False"
|
||||||
|
)
|
||||||
|
== "True"
|
||||||
|
)
|
||||||
|
has_force_flag = self.env.context.get("document_ocr_force")
|
||||||
if has_synchr_param or has_force_flag:
|
if has_synchr_param or has_force_flag:
|
||||||
content = self._index_ocr(mimetype, data, datas_fname,
|
content = self._index_ocr(mimetype, data, datas_fname, file_type)
|
||||||
file_type)
|
|
||||||
else:
|
else:
|
||||||
content = _MARKER_PHRASE
|
content = _MARKER_PHRASE
|
||||||
|
|
||||||
@ -32,56 +37,62 @@ class IrAttachment(models.Model):
|
|||||||
|
|
||||||
@api.model
|
@api.model
|
||||||
def _index_ocr(self, mimetype, data, datas_fname, file_type):
|
def _index_ocr(self, mimetype, data, datas_fname, file_type):
|
||||||
dpi = int(
|
dpi = int(self.env["ir.config_parameter"].get_param("document_ocr.dpi", "500"))
|
||||||
self.env['ir.config_parameter'].get_param(
|
if "/" not in mimetype:
|
||||||
'document_ocr.dpi', '500'))
|
_logger.warning("Invalid mimetype %s", mimetype)
|
||||||
if '/' not in mimetype:
|
|
||||||
_logger.warning('Invalid mimetype %s', mimetype)
|
|
||||||
return None
|
return None
|
||||||
top_type, sub_type = mimetype.split('/', 1)
|
top_type, sub_type = mimetype.split("/", 1)
|
||||||
if hasattr(self, '_index_ocr_get_data_%s' % sub_type):
|
if hasattr(self, "_index_ocr_get_data_%s" % sub_type):
|
||||||
image_data = getattr(self, '_index_ocr_get_data_%s' % sub_type)(
|
image_data = getattr(self, "_index_ocr_get_data_%s" % sub_type)(
|
||||||
data, datas_fname, file_type, dpi)
|
data, datas_fname, file_type, dpi
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
image_data = StringIO()
|
image_data = StringIO()
|
||||||
try:
|
try:
|
||||||
Image.open(StringIO(data)).save(image_data, 'png',
|
Image.open(StringIO(data)).save(image_data, "png", dpi=(dpi, dpi))
|
||||||
dpi=(dpi, dpi))
|
|
||||||
except IOError:
|
except IOError:
|
||||||
_logger.exception('Failed to OCR image')
|
_logger.exception("Failed to OCR image")
|
||||||
return None
|
return None
|
||||||
process = subprocess.Popen(
|
process = subprocess.Popen(
|
||||||
['tesseract', 'stdin', 'stdout'],
|
["tesseract", "stdin", "stdout"],
|
||||||
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
|
stdin=subprocess.PIPE,
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
stderr=subprocess.PIPE,
|
stderr=subprocess.PIPE,
|
||||||
)
|
)
|
||||||
stdout, stderr = process.communicate(image_data.getvalue())
|
stdout, stderr = process.communicate(image_data.getvalue())
|
||||||
if process.returncode:
|
if process.returncode:
|
||||||
_logger.error('Error during OCR: %s', stderr)
|
_logger.error("Error during OCR: %s", stderr)
|
||||||
return stdout
|
return stdout
|
||||||
|
|
||||||
@api.model
|
@api.model
|
||||||
def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi):
|
def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi):
|
||||||
process = subprocess.Popen(
|
process = subprocess.Popen(
|
||||||
['convert', '-density', str(dpi), '-', '-append', 'png32:-'],
|
["convert", "-density", str(dpi), "-", "-append", "png32:-"],
|
||||||
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
|
stdin=subprocess.PIPE,
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
stderr=subprocess.PIPE,
|
stderr=subprocess.PIPE,
|
||||||
)
|
)
|
||||||
stdout, stderr = process.communicate(data)
|
stdout, stderr = process.communicate(data)
|
||||||
if stderr:
|
if stderr:
|
||||||
_logger.error('Error converting to PDF: %s', stderr)
|
_logger.error("Error converting to PDF: %s", stderr)
|
||||||
return StringIO(stdout)
|
return StringIO(stdout)
|
||||||
|
|
||||||
@api.model
|
@api.model
|
||||||
def _ocr_cron(self, limit=0):
|
def _ocr_cron(self, limit=0):
|
||||||
for this in self.with_context(document_ocr_force=True).search([
|
for this in self.with_context(document_ocr_force=True).search(
|
||||||
('index_content', '=', _MARKER_PHRASE),
|
[
|
||||||
], limit=limit):
|
("index_content", "=", _MARKER_PHRASE),
|
||||||
|
],
|
||||||
|
limit=limit,
|
||||||
|
):
|
||||||
if not this.datas:
|
if not this.datas:
|
||||||
continue
|
continue
|
||||||
file_type, index_content = this._index(
|
file_type, index_content = this._index(
|
||||||
this.datas.decode('base64'), this.datas_fname, this.file_type)
|
this.datas.decode("base64"), this.datas_fname, this.file_type
|
||||||
this.write({
|
)
|
||||||
'file_type': file_type,
|
this.write(
|
||||||
'index_content': index_content,
|
{
|
||||||
})
|
"file_type": file_type,
|
||||||
|
"index_content": index_content,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
# © 2016 Therp BV <http://therp.nl>
|
# © 2016 Therp BV <http://therp.nl>
|
||||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||||
from . import test_document_ocr
|
from . import test_document_ocr
|
||||||
|
@ -1,58 +1,65 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
# © 2016 Therp BV <http://therp.nl>
|
# © 2016 Therp BV <http://therp.nl>
|
||||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||||
from PIL import Image, ImageDraw, ImageFont
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
from openerp.tests.common import TransactionCase
|
|
||||||
|
from odoo.tests.common import TransactionCase
|
||||||
|
from odoo.tools.misc import mute_logger
|
||||||
|
|
||||||
from ..models.ir_attachment import _MARKER_PHRASE
|
from ..models.ir_attachment import _MARKER_PHRASE
|
||||||
from openerp.tools.misc import mute_logger
|
|
||||||
|
|
||||||
|
|
||||||
class TestDocumentOcr(TransactionCase):
|
class TestDocumentOcr(TransactionCase):
|
||||||
def test_document_ocr(self):
|
def test_document_ocr(self):
|
||||||
self.env['ir.config_parameter'].set_param(
|
self.env["ir.config_parameter"].set_param("document_ocr.synchronous", "True")
|
||||||
'document_ocr.synchronous', 'True')
|
test_image = Image.new("RGB", (200, 30))
|
||||||
test_image = Image.new('RGB', (200, 30))
|
|
||||||
draw = ImageDraw.Draw(test_image)
|
draw = ImageDraw.Draw(test_image)
|
||||||
draw.text((3, 3), "Hello world", font=ImageFont.truetype(
|
draw.text(
|
||||||
'/usr/share/fonts/truetype/inconsolata/Inconsolata.otf', 24))
|
(3, 3),
|
||||||
|
"Hello world",
|
||||||
|
font=ImageFont.truetype(
|
||||||
|
"/usr/share/fonts/truetype/inconsolata/Inconsolata.otf", 24
|
||||||
|
),
|
||||||
|
)
|
||||||
# test a plain image
|
# test a plain image
|
||||||
data = StringIO()
|
data = StringIO()
|
||||||
test_image.save(data, 'png')
|
test_image.save(data, "png")
|
||||||
result = self.env['ir.attachment']._index(
|
result = self.env["ir.attachment"]._index(data.getvalue(), "test.png", None)
|
||||||
data.getvalue(), 'test.png', None)
|
self.assertEqual(result[1].strip(), "Hello world")
|
||||||
self.assertEqual(result[1].strip(), 'Hello world')
|
|
||||||
# should also work for pdfs if supported, protect against
|
# should also work for pdfs if supported, protect against
|
||||||
# ancient pillows
|
# ancient pillows
|
||||||
if hasattr(Image, 'registered_extensions') and\
|
if (
|
||||||
'PDF' in Image.registered_extensions().values():
|
hasattr(Image, "registered_extensions")
|
||||||
|
and "PDF" in Image.registered_extensions().values()
|
||||||
|
):
|
||||||
data = StringIO()
|
data = StringIO()
|
||||||
test_image.save(data, 'pdf', resolution=300)
|
test_image.save(data, "pdf", resolution=300)
|
||||||
result = self.env['ir.attachment']._index(
|
result = self.env["ir.attachment"]._index(data.getvalue(), "test.pdf", None)
|
||||||
data.getvalue(), 'test.pdf', None)
|
self.assertEqual(result[1].strip(), "Hello world")
|
||||||
self.assertEqual(result[1].strip(), 'Hello world')
|
|
||||||
# check cron
|
# check cron
|
||||||
self.env['ir.config_parameter'].set_param(
|
self.env["ir.config_parameter"].set_param("document_ocr.synchronous", "False")
|
||||||
'document_ocr.synchronous', 'False')
|
attachment = self.env["ir.attachment"].create(
|
||||||
attachment = self.env['ir.attachment'].create({
|
{
|
||||||
'name': 'testattachment',
|
"name": "testattachment",
|
||||||
'datas': data.getvalue().encode('base64'),
|
"datas": data.getvalue().encode("base64"),
|
||||||
})
|
}
|
||||||
|
)
|
||||||
self.assertEqual(attachment.index_content, _MARKER_PHRASE)
|
self.assertEqual(attachment.index_content, _MARKER_PHRASE)
|
||||||
attachment._ocr_cron()
|
attachment._ocr_cron()
|
||||||
self.assertEqual(attachment.index_content.strip(), 'Hello world')
|
self.assertEqual(attachment.index_content.strip(), "Hello world")
|
||||||
# and for an unreadable image, we expect an error
|
# and for an unreadable image, we expect an error
|
||||||
if hasattr(Image, 'registered_extensions') and\
|
if (
|
||||||
'PALM' in Image.registered_extensions().values():
|
hasattr(Image, "registered_extensions")
|
||||||
self.env['ir.config_parameter'].set_param(
|
and "PALM" in Image.registered_extensions().values()
|
||||||
'document_ocr.synchronous', 'True')
|
|
||||||
data = StringIO()
|
|
||||||
test_image = Image.new('1', (200, 30))
|
|
||||||
test_image.save(data, 'Palm')
|
|
||||||
with mute_logger(
|
|
||||||
'openerp.addons.document_ocr.models.ir_attachment'
|
|
||||||
):
|
):
|
||||||
result = self.env['ir.attachment']._index(
|
self.env["ir.config_parameter"].set_param(
|
||||||
data.getvalue(), 'test.palm', None
|
"document_ocr.synchronous", "True"
|
||||||
|
)
|
||||||
|
data = StringIO()
|
||||||
|
test_image = Image.new("1", (200, 30))
|
||||||
|
test_image.save(data, "Palm")
|
||||||
|
with mute_logger("openerp.addons.document_ocr.models.ir_attachment"):
|
||||||
|
result = self.env["ir.attachment"]._index(
|
||||||
|
data.getvalue(), "test.palm", None
|
||||||
)
|
)
|
||||||
self.assertEqual(result[1], None)
|
self.assertEqual(result[1], None)
|
||||||
|
1
setup/document_ocr/odoo/addons/document_ocr
Symbolic link
1
setup/document_ocr/odoo/addons/document_ocr
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../../document_ocr
|
6
setup/document_ocr/setup.py
Normal file
6
setup/document_ocr/setup.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
import setuptools
|
||||||
|
|
||||||
|
setuptools.setup(
|
||||||
|
setup_requires=['setuptools-odoo'],
|
||||||
|
odoo_addon=True,
|
||||||
|
)
|
Loading…
Reference in New Issue
Block a user