[IMP] document_ocr: pre-commit execution

This commit is contained in:
len 2023-09-07 11:09:24 +02:00
parent a00735b210
commit f1f13f1e8b
10 changed files with 108 additions and 91 deletions

View File

@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import models

View File

@ -1,23 +1,22 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
# Addon manifest for the "OCR for documents" module.
# NOTE(review): this span is a diff rendering without +/- markers. Where an
# entry appears twice in a row, the first occurrence is presumably the removed
# line and the second its replacement.
{
"name": "OCR for documents",
# NOTE(review): the version goes from the 8.0 series to the 16.0 series in a
# commit titled as a pre-commit (formatting) run - confirm the code itself has
# been ported, since the model file still imports the Python 2 StringIO module.
"version": "8.0.1.0.0",
"version": "16.0.1.0.0",
"author": "Therp BV,Odoo Community Association (OCA)",
"license": "AGPL-3",
"category": "Knowledge Management",
"summary": "Run character recognition on uploaded files",
"depends": [
'document',
"document",
],
"data": [
"data/ir_cron.xml",
"data/ir_config_parameter.xml",
],
# NOTE(review): only the tesseract binary is declared here, but
# _index_ocr_get_data_pdf in models/ir_attachment.py also launches the
# `convert` command - consider declaring it as well.
"external_dependencies": {
'bin': [
'tesseract',
"bin": [
"tesseract",
],
},
}

View File

@ -1,6 +1,5 @@
<?xml version="1.0" encoding="UTF-8" ?>
<openerp>
<data noupdate="1">
<odoo noupdate="1">
<record id="param_synchronous" model="ir.config_parameter">
<field name="key">document_ocr.synchronous</field>
<field name="value">False</field>
@ -9,5 +8,4 @@
<field name="key">document_ocr.dpi</field>
<field name="value">300</field>
</record>
</data>
</openerp>
</odoo>

View File

@ -1,6 +1,5 @@
<?xml version="1.0" encoding="UTF-8" ?>
<openerp>
<data noupdate="1">
<odoo noupdate="1">
<record id="cron" model="ir.cron">
<field name="name">Run OCR on uploaded documents</field>
<field name="interval_type">days</field>
@ -10,5 +9,4 @@
<field name="numbercall">-1</field>
<field name="args">(100,)</field>
</record>
</data>
</openerp>
</odoo>

View File

@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import ir_attachment

View File

@ -1,30 +1,35 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
import logging
import subprocess
from PIL import Image
from StringIO import StringIO
from openerp import api, models
from odoo import api, models
_logger = logging.getLogger(__name__)
_MARKER_PHRASE = '[[waiting for OCR]]'
_MARKER_PHRASE = "[[waiting for OCR]]"
class IrAttachment(models.Model):
_inherit = 'ir.attachment'
_inherit = "ir.attachment"
@api.model
def _index(self, data, datas_fname, file_type):
mimetype, content = super(IrAttachment, self)._index(
data, datas_fname, file_type)
if data and mimetype and (not content or content == 'image'):
has_synchr_param = self.env['ir.config_parameter'].get_param(
'document_ocr.synchronous', 'False') == 'True'
has_force_flag = self.env.context.get('document_ocr_force')
data, datas_fname, file_type
)
if data and mimetype and (not content or content == "image"):
has_synchr_param = (
self.env["ir.config_parameter"].get_param(
"document_ocr.synchronous", "False"
)
== "True"
)
has_force_flag = self.env.context.get("document_ocr_force")
if has_synchr_param or has_force_flag:
content = self._index_ocr(mimetype, data, datas_fname,
file_type)
content = self._index_ocr(mimetype, data, datas_fname, file_type)
else:
content = _MARKER_PHRASE
@ -32,56 +37,62 @@ class IrAttachment(models.Model):
# Run OCR on one attachment payload and return whatever tesseract wrote to its
# stdout. Returns None when the mimetype contains no "/" or when PIL cannot
# open the data.
# NOTE(review): this span is a diff rendering without +/- markers; removed
# (single-quoted) lines are immediately followed by their reformatted
# (double-quoted) replacements, and indentation has been lost.
@api.model
def _index_ocr(self, mimetype, data, datas_fname, file_type):
# The DPI is read from the document_ocr.dpi config parameter, with "500"
# passed as the fallback argument. NOTE(review): the shipped
# data/ir_config_parameter.xml sets this parameter to 300, so the two values
# differ - confirm the fallback is intentional.
dpi = int(
self.env['ir.config_parameter'].get_param(
'document_ocr.dpi', '500'))
if '/' not in mimetype:
_logger.warning('Invalid mimetype %s', mimetype)
dpi = int(self.env["ir.config_parameter"].get_param("document_ocr.dpi", "500"))
if "/" not in mimetype:
_logger.warning("Invalid mimetype %s", mimetype)
return None
# Dispatch on the mimetype subtype: if this model has a method named
# _index_ocr_get_data_<sub_type> (e.g. _index_ocr_get_data_pdf), it is called
# to produce the image buffer; top_type is not used afterwards.
top_type, sub_type = mimetype.split('/', 1)
if hasattr(self, '_index_ocr_get_data_%s' % sub_type):
image_data = getattr(self, '_index_ocr_get_data_%s' % sub_type)(
data, datas_fname, file_type, dpi)
top_type, sub_type = mimetype.split("/", 1)
if hasattr(self, "_index_ocr_get_data_%s" % sub_type):
image_data = getattr(self, "_index_ocr_get_data_%s" % sub_type)(
data, datas_fname, file_type, dpi
)
else:
# Fallback: open the raw data with PIL and re-save it as PNG at the
# configured DPI into an in-memory buffer.
# NOTE(review): StringIO comes from the Python 2 `StringIO` module imported at
# the top of the file; on Python 3 binary image data would need io.BytesIO -
# confirm the target runtime.
image_data = StringIO()
try:
Image.open(StringIO(data)).save(image_data, 'png',
dpi=(dpi, dpi))
Image.open(StringIO(data)).save(image_data, "png", dpi=(dpi, dpi))
except IOError:
_logger.exception('Failed to OCR image')
_logger.exception("Failed to OCR image")
return None
# The image buffer is piped to the tesseract command on stdin; its stdout and
# stderr are both captured.
process = subprocess.Popen(
['tesseract', 'stdin', 'stdout'],
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
["tesseract", "stdin", "stdout"],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
stdout, stderr = process.communicate(image_data.getvalue())
# A non-zero exit status is only logged; stdout is still returned to the
# caller in that case.
if process.returncode:
_logger.error('Error during OCR: %s', stderr)
_logger.error("Error during OCR: %s", stderr)
return stdout
# Subtype-specific image producer that _index_ocr looks up by name for the
# "pdf" subtype: pipes `data` to the external `convert` command with -density
# set to `dpi`, and returns the png32 output wrapped in a StringIO buffer.
# datas_fname and file_type are accepted but not used here.
# NOTE(review): this span is a diff rendering without +/- markers; each
# single-quoted line is followed by its reformatted replacement.
@api.model
def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi):
process = subprocess.Popen(
['convert', '-density', str(dpi), '-', '-append', 'png32:-'],
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
["convert", "-density", str(dpi), "-", "-append", "png32:-"],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
stdout, stderr = process.communicate(data)
# NOTE(review): failure is detected by non-empty stderr rather than by
# process.returncode (which _index_ocr uses), so warnings printed by the tool
# are logged as errors and the output is returned either way. The message
# also says "converting to PDF" although this method appears to convert
# from PDF to PNG - confirm the wording.
if stderr:
_logger.error('Error converting to PDF: %s', stderr)
_logger.error("Error converting to PDF: %s", stderr)
return StringIO(stdout)
# Deferred OCR pass: searches attachments whose index_content equals
# _MARKER_PHRASE (at most `limit` records, as passed to search), re-runs
# _index on each with document_ocr_force set in the context, and writes the
# resulting file_type and index_content back. Attachments without datas are
# skipped.
# NOTE(review): this span is a diff rendering without +/- markers; the older
# compact form of each statement is followed by its reformatted replacement.
@api.model
def _ocr_cron(self, limit=0):
for this in self.with_context(document_ocr_force=True).search([
('index_content', '=', _MARKER_PHRASE),
], limit=limit):
for this in self.with_context(document_ocr_force=True).search(
[
("index_content", "=", _MARKER_PHRASE),
],
limit=limit,
):
if not this.datas:
continue
# NOTE(review): .decode("base64") relies on the Python 2 string codec; on
# Python 3 this would need base64.b64decode - confirm the target runtime.
file_type, index_content = this._index(
this.datas.decode('base64'), this.datas_fname, this.file_type)
this.write({
'file_type': file_type,
'index_content': index_content,
})
this.datas.decode("base64"), this.datas_fname, this.file_type
)
this.write(
{
"file_type": file_type,
"index_content": index_content,
}
)

View File

@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import test_document_ocr

View File

@ -1,58 +1,65 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from PIL import Image, ImageDraw, ImageFont
from StringIO import StringIO
from openerp.tests.common import TransactionCase
from odoo.tests.common import TransactionCase
from odoo.tools.misc import mute_logger
from ..models.ir_attachment import _MARKER_PHRASE
from openerp.tools.misc import mute_logger
# End-to-end test for the OCR indexing: renders "Hello world" into an image,
# indexes it synchronously as PNG (and as PDF when PIL supports it), then
# checks the deferred cron path and the failure path for an unreadable format.
# NOTE(review): this span is a diff rendering without +/- markers; removed
# (single-quoted) lines and their reformatted replacements are interleaved,
# and indentation has been lost.
class TestDocumentOcr(TransactionCase):
def test_document_ocr(self):
self.env['ir.config_parameter'].set_param(
'document_ocr.synchronous', 'True')
test_image = Image.new('RGB', (200, 30))
self.env["ir.config_parameter"].set_param("document_ocr.synchronous", "True")
test_image = Image.new("RGB", (200, 30))
draw = ImageDraw.Draw(test_image)
# NOTE(review): the font is loaded from a hard-coded absolute path, so the
# test depends on that file being present on the machine running it.
draw.text((3, 3), "Hello world", font=ImageFont.truetype(
'/usr/share/fonts/truetype/inconsolata/Inconsolata.otf', 24))
draw.text(
(3, 3),
"Hello world",
font=ImageFont.truetype(
"/usr/share/fonts/truetype/inconsolata/Inconsolata.otf", 24
),
)
# test a plain image
# NOTE(review): StringIO here is the Python 2 `StringIO` module; saving image
# bytes into it would need io.BytesIO on Python 3 - confirm the target runtime.
data = StringIO()
test_image.save(data, 'png')
result = self.env['ir.attachment']._index(
data.getvalue(), 'test.png', None)
self.assertEqual(result[1].strip(), 'Hello world')
test_image.save(data, "png")
result = self.env["ir.attachment"]._index(data.getvalue(), "test.png", None)
self.assertEqual(result[1].strip(), "Hello world")
# should also work for pdfs if supported, protect against
# ancient pillows
if hasattr(Image, 'registered_extensions') and\
'PDF' in Image.registered_extensions().values():
if (
hasattr(Image, "registered_extensions")
and "PDF" in Image.registered_extensions().values()
):
data = StringIO()
test_image.save(data, 'pdf', resolution=300)
result = self.env['ir.attachment']._index(
data.getvalue(), 'test.pdf', None)
self.assertEqual(result[1].strip(), 'Hello world')
test_image.save(data, "pdf", resolution=300)
result = self.env["ir.attachment"]._index(data.getvalue(), "test.pdf", None)
self.assertEqual(result[1].strip(), "Hello world")
# check cron
# With synchronous mode switched off, a new attachment is expected to carry
# the marker phrase until _ocr_cron has processed it.
# NOTE(review): .encode("base64") relies on the Python 2 string codec.
self.env['ir.config_parameter'].set_param(
'document_ocr.synchronous', 'False')
attachment = self.env['ir.attachment'].create({
'name': 'testattachment',
'datas': data.getvalue().encode('base64'),
})
self.env["ir.config_parameter"].set_param("document_ocr.synchronous", "False")
attachment = self.env["ir.attachment"].create(
{
"name": "testattachment",
"datas": data.getvalue().encode("base64"),
}
)
self.assertEqual(attachment.index_content, _MARKER_PHRASE)
attachment._ocr_cron()
self.assertEqual(attachment.index_content.strip(), 'Hello world')
self.assertEqual(attachment.index_content.strip(), "Hello world")
# and for an unreadable image, we expect an error
# NOTE(review): the muted logger name still uses the "openerp.addons" prefix
# in the reformatted line, while the imports in this file now come from
# `odoo` - confirm the name matches the logger that actually emits the error.
if hasattr(Image, 'registered_extensions') and\
'PALM' in Image.registered_extensions().values():
self.env['ir.config_parameter'].set_param(
'document_ocr.synchronous', 'True')
data = StringIO()
test_image = Image.new('1', (200, 30))
test_image.save(data, 'Palm')
with mute_logger(
'openerp.addons.document_ocr.models.ir_attachment'
if (
hasattr(Image, "registered_extensions")
and "PALM" in Image.registered_extensions().values()
):
result = self.env['ir.attachment']._index(
data.getvalue(), 'test.palm', None
self.env["ir.config_parameter"].set_param(
"document_ocr.synchronous", "True"
)
data = StringIO()
test_image = Image.new("1", (200, 30))
test_image.save(data, "Palm")
with mute_logger("openerp.addons.document_ocr.models.ir_attachment"):
result = self.env["ir.attachment"]._index(
data.getvalue(), "test.palm", None
)
self.assertEqual(result[1], None)

View File

@ -0,0 +1 @@
../../../../document_ocr

View File

@ -0,0 +1,6 @@
import setuptools
# Packaging entry point: call setuptools with the odoo_addon flag switched on
# and setuptools-odoo requested as a setup-time requirement.
setuptools.setup(**{"setup_requires": ["setuptools-odoo"], "odoo_addon": True})