[IMP] document_ocr: pre-commit execution

This commit is contained in:
len 2023-09-07 11:09:24 +02:00
parent a00735b210
commit f1f13f1e8b
10 changed files with 108 additions and 91 deletions

View File

@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl> # © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import models from . import models

View File

@ -1,23 +1,22 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl> # © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
{ {
"name": "OCR for documents", "name": "OCR for documents",
"version": "8.0.1.0.0", "version": "16.0.1.0.0",
"author": "Therp BV,Odoo Community Association (OCA)", "author": "Therp BV,Odoo Community Association (OCA)",
"license": "AGPL-3", "license": "AGPL-3",
"category": "Knowledge Management", "category": "Knowledge Management",
"summary": "Run character recognition on uploaded files", "summary": "Run character recognition on uploaded files",
"depends": [ "depends": [
'document', "document",
], ],
"data": [ "data": [
"data/ir_cron.xml", "data/ir_cron.xml",
"data/ir_config_parameter.xml", "data/ir_config_parameter.xml",
], ],
"external_dependencies": { "external_dependencies": {
'bin': [ "bin": [
'tesseract', "tesseract",
], ],
}, },
} }

View File

@ -1,6 +1,5 @@
<?xml version="1.0" encoding="UTF-8" ?> <?xml version="1.0" encoding="UTF-8" ?>
<openerp> <odoo noupdate="1">
<data noupdate="1">
<record id="param_synchronous" model="ir.config_parameter"> <record id="param_synchronous" model="ir.config_parameter">
<field name="key">document_ocr.synchronous</field> <field name="key">document_ocr.synchronous</field>
<field name="value">False</field> <field name="value">False</field>
@ -9,5 +8,4 @@
<field name="key">document_ocr.dpi</field> <field name="key">document_ocr.dpi</field>
<field name="value">300</field> <field name="value">300</field>
</record> </record>
</data> </odoo>
</openerp>

View File

@ -1,6 +1,5 @@
<?xml version="1.0" encoding="UTF-8" ?> <?xml version="1.0" encoding="UTF-8" ?>
<openerp> <odoo noupdate="1">
<data noupdate="1">
<record id="cron" model="ir.cron"> <record id="cron" model="ir.cron">
<field name="name">Run OCR on uploaded documents</field> <field name="name">Run OCR on uploaded documents</field>
<field name="interval_type">days</field> <field name="interval_type">days</field>
@ -10,5 +9,4 @@
<field name="numbercall">-1</field> <field name="numbercall">-1</field>
<field name="args">(100,)</field> <field name="args">(100,)</field>
</record> </record>
</data> </odoo>
</openerp>

View File

@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl> # © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import ir_attachment from . import ir_attachment

View File

@ -1,30 +1,35 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl> # © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
import logging import logging
import subprocess import subprocess
from PIL import Image from PIL import Image
from StringIO import StringIO from StringIO import StringIO
from openerp import api, models
from odoo import api, models
_logger = logging.getLogger(__name__) _logger = logging.getLogger(__name__)
_MARKER_PHRASE = '[[waiting for OCR]]' _MARKER_PHRASE = "[[waiting for OCR]]"
class IrAttachment(models.Model): class IrAttachment(models.Model):
_inherit = 'ir.attachment' _inherit = "ir.attachment"
@api.model @api.model
def _index(self, data, datas_fname, file_type): def _index(self, data, datas_fname, file_type):
mimetype, content = super(IrAttachment, self)._index( mimetype, content = super(IrAttachment, self)._index(
data, datas_fname, file_type) data, datas_fname, file_type
if data and mimetype and (not content or content == 'image'): )
has_synchr_param = self.env['ir.config_parameter'].get_param( if data and mimetype and (not content or content == "image"):
'document_ocr.synchronous', 'False') == 'True' has_synchr_param = (
has_force_flag = self.env.context.get('document_ocr_force') self.env["ir.config_parameter"].get_param(
"document_ocr.synchronous", "False"
)
== "True"
)
has_force_flag = self.env.context.get("document_ocr_force")
if has_synchr_param or has_force_flag: if has_synchr_param or has_force_flag:
content = self._index_ocr(mimetype, data, datas_fname, content = self._index_ocr(mimetype, data, datas_fname, file_type)
file_type)
else: else:
content = _MARKER_PHRASE content = _MARKER_PHRASE
@ -32,56 +37,62 @@ class IrAttachment(models.Model):
@api.model
def _index_ocr(self, mimetype, data, datas_fname, file_type):
    """Run tesseract on ``data`` and return the recognised text.

    The resolution is read from the ``document_ocr.dpi`` config parameter
    (falling back to 500). If this model defines a method named
    ``_index_ocr_get_data_<subtype>`` for the mimetype's subtype, that method
    produces the image buffer; otherwise ``data`` is opened with PIL and
    re-saved as PNG.

    :param mimetype: mimetype of the attachment, expected as ``type/subtype``
    :param data: raw binary content of the attachment
    :param datas_fname: file name, passed through to the converter hook
    :param file_type: file type, passed through to the converter hook
    :return: text printed by tesseract on stdout, or ``None`` when the
        mimetype is malformed or PIL cannot read the image
    """
    # Local import: image payloads are bytes, so the buffer must be binary.
    # io.BytesIO exists on both Python 2.7 and Python 3.
    from io import BytesIO

    dpi = int(self.env["ir.config_parameter"].get_param("document_ocr.dpi", "500"))
    if "/" not in mimetype:
        _logger.warning("Invalid mimetype %s", mimetype)
        return None
    sub_type = mimetype.split("/", 1)[1]
    # Subtype-specific hook (e.g. ``_index_ocr_get_data_pdf``), if any.
    converter = getattr(self, "_index_ocr_get_data_%s" % sub_type, None)
    if converter is not None:
        image_data = converter(data, datas_fname, file_type, dpi)
    else:
        image_data = BytesIO()
        try:
            Image.open(BytesIO(data)).save(image_data, "png", dpi=(dpi, dpi))
        except IOError:
            _logger.exception("Failed to OCR image")
            return None
    process = subprocess.Popen(
        ["tesseract", "stdin", "stdout"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stdout, stderr = process.communicate(image_data.getvalue())
    if process.returncode:
        # Best effort: the failure is logged, whatever was printed is still
        # returned.
        _logger.error("Error during OCR: %s", stderr)
    # The pipe yields bytes; the result is used as text index content.
    # NOTE(review): assumes tesseract emits UTF-8 - confirm for custom configs.
    return stdout.decode("utf-8", "replace")
@api.model
def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi):
    """Convert PDF content to a single PNG using the ``convert`` binary.

    ``data`` is piped to ``convert`` on stdin with ``-density <dpi>`` and
    ``-append``, and the ``png32`` result is read back from stdout.

    :param data: raw binary content of the PDF
    :param datas_fname: unused here, part of the converter hook signature
    :param file_type: unused here, part of the converter hook signature
    :param dpi: resolution passed to ``convert -density``
    :return: a binary buffer holding whatever ``convert`` wrote to stdout
    """
    # Local import: the converted image is bytes, so use a binary buffer.
    from io import BytesIO

    process = subprocess.Popen(
        ["convert", "-density", str(dpi), "-", "-append", "png32:-"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stdout, stderr = process.communicate(data)
    if stderr:
        # Anything on stderr is reported, but the output is still returned.
        _logger.error("Error converting PDF to image: %s", stderr)
    return BytesIO(stdout)
@api.model
def _ocr_cron(self, limit=0):
    """Run OCR on attachments whose index content is still the marker phrase.

    Searches for attachments with ``index_content`` equal to
    ``_MARKER_PHRASE``, re-runs ``_index`` on each one with the
    ``document_ocr_force`` context flag set, and writes the resulting
    ``file_type`` and ``index_content`` back. Attachments without ``datas``
    are skipped.

    :param limit: passed to ``search`` as the maximum number of attachments
    """
    # Local import: the "base64" str codec only exists on Python 2, the
    # base64 module works on both major versions.
    import base64

    pending = self.with_context(document_ocr_force=True).search(
        [("index_content", "=", _MARKER_PHRASE)],
        limit=limit,
    )
    for this in pending:
        if not this.datas:
            continue
        file_type, index_content = this._index(
            base64.b64decode(this.datas), this.datas_fname, this.file_type
        )
        this.write(
            {
                "file_type": file_type,
                "index_content": index_content,
            }
        )

View File

@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl> # © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import test_document_ocr from . import test_document_ocr

View File

@ -1,58 +1,65 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl> # © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw, ImageFont
from StringIO import StringIO from StringIO import StringIO
from openerp.tests.common import TransactionCase
from odoo.tests.common import TransactionCase
from odoo.tools.misc import mute_logger
from ..models.ir_attachment import _MARKER_PHRASE from ..models.ir_attachment import _MARKER_PHRASE
from openerp.tools.misc import mute_logger
class TestDocumentOcr(TransactionCase):
    """Exercise synchronous OCR, cron-driven OCR and the unreadable-image path."""

    def test_document_ocr(self):
        """Render "Hello world" into an image and check it is recognised.

        Requires the Inconsolata font at the hard-coded path below.
        """
        # Local imports: image bytes need a binary buffer, and the "base64"
        # str codec is Python 2 only.
        import base64
        from io import BytesIO

        self.env["ir.config_parameter"].set_param("document_ocr.synchronous", "True")
        test_image = Image.new("RGB", (200, 30))
        draw = ImageDraw.Draw(test_image)
        draw.text(
            (3, 3),
            "Hello world",
            font=ImageFont.truetype(
                "/usr/share/fonts/truetype/inconsolata/Inconsolata.otf", 24
            ),
        )
        # test a plain image
        data = BytesIO()
        test_image.save(data, "png")
        result = self.env["ir.attachment"]._index(data.getvalue(), "test.png", None)
        self.assertEqual(result[1].strip(), "Hello world")
        # should also work for pdfs if supported, protect against
        # ancient pillows
        if (
            hasattr(Image, "registered_extensions")
            and "PDF" in Image.registered_extensions().values()
        ):
            data = BytesIO()
            test_image.save(data, "pdf", resolution=300)
            result = self.env["ir.attachment"]._index(data.getvalue(), "test.pdf", None)
            self.assertEqual(result[1].strip(), "Hello world")
        # check cron
        self.env["ir.config_parameter"].set_param("document_ocr.synchronous", "False")
        attachment = self.env["ir.attachment"].create(
            {
                "name": "testattachment",
                "datas": base64.b64encode(data.getvalue()),
            }
        )
        self.assertEqual(attachment.index_content, _MARKER_PHRASE)
        attachment._ocr_cron()
        self.assertEqual(attachment.index_content.strip(), "Hello world")
        # and for an unreadable image, we expect an error
        if (
            hasattr(Image, "registered_extensions")
            and "PALM" in Image.registered_extensions().values()
        ):
            self.env["ir.config_parameter"].set_param(
                "document_ocr.synchronous", "True"
            )
            data = BytesIO()
            test_image = Image.new("1", (200, 30))
            test_image.save(data, "Palm")
            # The module is imported from the ``odoo`` namespace, so its
            # logger name starts with ``odoo.addons``.
            with mute_logger("odoo.addons.document_ocr.models.ir_attachment"):
                result = self.env["ir.attachment"]._index(
                    data.getvalue(), "test.palm", None
                )
            self.assertEqual(result[1], None)

View File

@ -0,0 +1 @@
../../../../document_ocr

View File

@ -0,0 +1,6 @@
import setuptools
setuptools.setup(
setup_requires=['setuptools-odoo'],
odoo_addon=True,
)