From f1f13f1e8b41780dc76e30c9b179782078790104 Mon Sep 17 00:00:00 2001 From: len Date: Thu, 7 Sep 2023 11:09:24 +0200 Subject: [PATCH] [IMP] document_ocr: pre-commit execution --- document_ocr/__init__.py | 1 - .../{__openerp__.py => __manifest__.py} | 9 +- document_ocr/data/ir_config_parameter.xml | 8 +- document_ocr/data/ir_cron.xml | 8 +- document_ocr/models/__init__.py | 1 - document_ocr/models/ir_attachment.py | 85 +++++++++++-------- document_ocr/tests/__init__.py | 1 - document_ocr/tests/test_document_ocr.py | 79 +++++++++-------- setup/document_ocr/odoo/addons/document_ocr | 1 + setup/document_ocr/setup.py | 6 ++ 10 files changed, 108 insertions(+), 91 deletions(-) rename document_ocr/{__openerp__.py => __manifest__.py} (81%) create mode 120000 setup/document_ocr/odoo/addons/document_ocr create mode 100644 setup/document_ocr/setup.py diff --git a/document_ocr/__init__.py b/document_ocr/__init__.py index 7eda98a2..463328a6 100644 --- a/document_ocr/__init__.py +++ b/document_ocr/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # © 2016 Therp BV # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). from . import models diff --git a/document_ocr/__openerp__.py b/document_ocr/__manifest__.py similarity index 81% rename from document_ocr/__openerp__.py rename to document_ocr/__manifest__.py index 2c1a2696..5d2c5be9 100644 --- a/document_ocr/__openerp__.py +++ b/document_ocr/__manifest__.py @@ -1,23 +1,22 @@ -# -*- coding: utf-8 -*- # © 2016 Therp BV # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). { "name": "OCR for documents", - "version": "8.0.1.0.0", + "version": "16.0.1.0.0", "author": "Therp BV,Odoo Community Association (OCA)", "license": "AGPL-3", "category": "Knowledge Management", "summary": "Run character recognition on uploaded files", "depends": [ - 'document', + "document", ], "data": [ "data/ir_cron.xml", "data/ir_config_parameter.xml", ], "external_dependencies": { - 'bin': [ - 'tesseract', + "bin": [ + "tesseract", ], }, } diff --git a/document_ocr/data/ir_config_parameter.xml b/document_ocr/data/ir_config_parameter.xml index e46db18a..e18fab65 100644 --- a/document_ocr/data/ir_config_parameter.xml +++ b/document_ocr/data/ir_config_parameter.xml @@ -1,6 +1,5 @@ - - - + + document_ocr.synchronous False @@ -9,5 +8,4 @@ document_ocr.dpi 300 - - + diff --git a/document_ocr/data/ir_cron.xml b/document_ocr/data/ir_cron.xml index 1ea8cd20..12d3492a 100644 --- a/document_ocr/data/ir_cron.xml +++ b/document_ocr/data/ir_cron.xml @@ -1,6 +1,5 @@ - - - + + Run OCR on uploaded documents days @@ -10,5 +9,4 @@ -1 (100,) - - + diff --git a/document_ocr/models/__init__.py b/document_ocr/models/__init__.py index a15f1b21..b8d4d034 100644 --- a/document_ocr/models/__init__.py +++ b/document_ocr/models/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # © 2016 Therp BV # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). from . import ir_attachment diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py index 98f40831..3cdb93d8 100644 --- a/document_ocr/models/ir_attachment.py +++ b/document_ocr/models/ir_attachment.py @@ -1,30 +1,35 @@ -# -*- coding: utf-8 -*- # © 2016 Therp BV # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). import logging import subprocess + from PIL import Image from StringIO import StringIO -from openerp import api, models + +from odoo import api, models _logger = logging.getLogger(__name__) -_MARKER_PHRASE = '[[waiting for OCR]]' +_MARKER_PHRASE = "[[waiting for OCR]]" class IrAttachment(models.Model): - _inherit = 'ir.attachment' + _inherit = "ir.attachment" @api.model def _index(self, data, datas_fname, file_type): mimetype, content = super(IrAttachment, self)._index( - data, datas_fname, file_type) - if data and mimetype and (not content or content == 'image'): - has_synchr_param = self.env['ir.config_parameter'].get_param( - 'document_ocr.synchronous', 'False') == 'True' - has_force_flag = self.env.context.get('document_ocr_force') + data, datas_fname, file_type + ) + if data and mimetype and (not content or content == "image"): + has_synchr_param = ( + self.env["ir.config_parameter"].get_param( + "document_ocr.synchronous", "False" + ) + == "True" + ) + has_force_flag = self.env.context.get("document_ocr_force") if has_synchr_param or has_force_flag: - content = self._index_ocr(mimetype, data, datas_fname, - file_type) + content = self._index_ocr(mimetype, data, datas_fname, file_type) else: content = _MARKER_PHRASE @@ -32,56 +37,62 @@ class IrAttachment(models.Model): @api.model def _index_ocr(self, mimetype, data, datas_fname, file_type): - dpi = int( - self.env['ir.config_parameter'].get_param( - 'document_ocr.dpi', '500')) - if '/' not in mimetype: - _logger.warning('Invalid mimetype %s', mimetype) + dpi = int(self.env["ir.config_parameter"].get_param("document_ocr.dpi", "500")) + if "/" not in mimetype: + _logger.warning("Invalid mimetype %s", mimetype) return None - top_type, sub_type = mimetype.split('/', 1) - if hasattr(self, '_index_ocr_get_data_%s' % sub_type): - image_data = getattr(self, '_index_ocr_get_data_%s' % sub_type)( - data, datas_fname, file_type, dpi) + top_type, sub_type = mimetype.split("/", 1) + if hasattr(self, "_index_ocr_get_data_%s" % sub_type): + image_data = getattr(self, "_index_ocr_get_data_%s" % sub_type)( + data, datas_fname, file_type, dpi + ) else: image_data = StringIO() try: - Image.open(StringIO(data)).save(image_data, 'png', - dpi=(dpi, dpi)) + Image.open(StringIO(data)).save(image_data, "png", dpi=(dpi, dpi)) except IOError: - _logger.exception('Failed to OCR image') + _logger.exception("Failed to OCR image") return None process = subprocess.Popen( - ['tesseract', 'stdin', 'stdout'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, + ["tesseract", "stdin", "stdout"], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) stdout, stderr = process.communicate(image_data.getvalue()) if process.returncode: - _logger.error('Error during OCR: %s', stderr) + _logger.error("Error during OCR: %s", stderr) return stdout @api.model def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi): process = subprocess.Popen( - ['convert', '-density', str(dpi), '-', '-append', 'png32:-'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, + ["convert", "-density", str(dpi), "-", "-append", "png32:-"], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) stdout, stderr = process.communicate(data) if stderr: - _logger.error('Error converting to PDF: %s', stderr) + _logger.error("Error converting to PDF: %s", stderr) return StringIO(stdout) @api.model def _ocr_cron(self, limit=0): - for this in self.with_context(document_ocr_force=True).search([ - ('index_content', '=', _MARKER_PHRASE), - ], limit=limit): + for this in self.with_context(document_ocr_force=True).search( + [ + ("index_content", "=", _MARKER_PHRASE), + ], + limit=limit, + ): if not this.datas: continue file_type, index_content = this._index( - this.datas.decode('base64'), this.datas_fname, this.file_type) - this.write({ - 'file_type': file_type, - 'index_content': index_content, - }) + this.datas.decode("base64"), this.datas_fname, this.file_type + ) + this.write( + { + "file_type": file_type, + "index_content": index_content, + } + ) diff --git a/document_ocr/tests/__init__.py b/document_ocr/tests/__init__.py index 7bdf742c..059af577 100644 --- a/document_ocr/tests/__init__.py +++ b/document_ocr/tests/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # © 2016 Therp BV # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). from . import test_document_ocr diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py index d4545ac2..c0084b81 100644 --- a/document_ocr/tests/test_document_ocr.py +++ b/document_ocr/tests/test_document_ocr.py @@ -1,58 +1,65 @@ -# -*- coding: utf-8 -*- # © 2016 Therp BV # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). from PIL import Image, ImageDraw, ImageFont from StringIO import StringIO -from openerp.tests.common import TransactionCase + +from odoo.tests.common import TransactionCase +from odoo.tools.misc import mute_logger + from ..models.ir_attachment import _MARKER_PHRASE -from openerp.tools.misc import mute_logger class TestDocumentOcr(TransactionCase): def test_document_ocr(self): - self.env['ir.config_parameter'].set_param( - 'document_ocr.synchronous', 'True') - test_image = Image.new('RGB', (200, 30)) + self.env["ir.config_parameter"].set_param("document_ocr.synchronous", "True") + test_image = Image.new("RGB", (200, 30)) draw = ImageDraw.Draw(test_image) - draw.text((3, 3), "Hello world", font=ImageFont.truetype( - '/usr/share/fonts/truetype/inconsolata/Inconsolata.otf', 24)) + draw.text( + (3, 3), + "Hello world", + font=ImageFont.truetype( + "/usr/share/fonts/truetype/inconsolata/Inconsolata.otf", 24 + ), + ) # test a plain image data = StringIO() - test_image.save(data, 'png') - result = self.env['ir.attachment']._index( - data.getvalue(), 'test.png', None) - self.assertEqual(result[1].strip(), 'Hello world') + test_image.save(data, "png") + result = self.env["ir.attachment"]._index(data.getvalue(), "test.png", None) + self.assertEqual(result[1].strip(), "Hello world") # should also work for pdfs if supported, protect against # ancient pillows - if hasattr(Image, 'registered_extensions') and\ - 'PDF' in Image.registered_extensions().values(): + if ( + hasattr(Image, "registered_extensions") + and "PDF" in Image.registered_extensions().values() + ): data = StringIO() - test_image.save(data, 'pdf', resolution=300) - result = self.env['ir.attachment']._index( - data.getvalue(), 'test.pdf', None) - self.assertEqual(result[1].strip(), 'Hello world') + test_image.save(data, "pdf", resolution=300) + result = self.env["ir.attachment"]._index(data.getvalue(), "test.pdf", None) + self.assertEqual(result[1].strip(), "Hello world") # check cron - self.env['ir.config_parameter'].set_param( - 'document_ocr.synchronous', 'False') - attachment = self.env['ir.attachment'].create({ - 'name': 'testattachment', - 'datas': data.getvalue().encode('base64'), - }) + self.env["ir.config_parameter"].set_param("document_ocr.synchronous", "False") + attachment = self.env["ir.attachment"].create( + { + "name": "testattachment", + "datas": data.getvalue().encode("base64"), + } + ) self.assertEqual(attachment.index_content, _MARKER_PHRASE) attachment._ocr_cron() - self.assertEqual(attachment.index_content.strip(), 'Hello world') + self.assertEqual(attachment.index_content.strip(), "Hello world") # and for an unreadable image, we expect an error - if hasattr(Image, 'registered_extensions') and\ - 'PALM' in Image.registered_extensions().values(): - self.env['ir.config_parameter'].set_param( - 'document_ocr.synchronous', 'True') + if ( + hasattr(Image, "registered_extensions") + and "PALM" in Image.registered_extensions().values() + ): + self.env["ir.config_parameter"].set_param( + "document_ocr.synchronous", "True" + ) data = StringIO() - test_image = Image.new('1', (200, 30)) - test_image.save(data, 'Palm') - with mute_logger( - 'openerp.addons.document_ocr.models.ir_attachment' - ): - result = self.env['ir.attachment']._index( - data.getvalue(), 'test.palm', None + test_image = Image.new("1", (200, 30)) + test_image.save(data, "Palm") + with mute_logger("openerp.addons.document_ocr.models.ir_attachment"): + result = self.env["ir.attachment"]._index( + data.getvalue(), "test.palm", None ) self.assertEqual(result[1], None) diff --git a/setup/document_ocr/odoo/addons/document_ocr b/setup/document_ocr/odoo/addons/document_ocr new file mode 120000 index 00000000..142c7cf3 --- /dev/null +++ b/setup/document_ocr/odoo/addons/document_ocr @@ -0,0 +1 @@ +../../../../document_ocr \ No newline at end of file diff --git a/setup/document_ocr/setup.py b/setup/document_ocr/setup.py new file mode 100644 index 00000000..28c57bb6 --- /dev/null +++ b/setup/document_ocr/setup.py @@ -0,0 +1,6 @@ +import setuptools + +setuptools.setup( + setup_requires=['setuptools-odoo'], + odoo_addon=True, +)