From 5edbe1685ba606d91ac4be0edaef4638d33f5f4d Mon Sep 17 00:00:00 2001 From: len Date: Thu, 7 Sep 2023 10:34:42 +0200 Subject: [PATCH] [MIG] document_ocr -> attachment_indexation_ocr --- attachment_indexation_ocr/README.rst | 107 +++++ .../__init__.py | 0 .../__manifest__.py | 11 +- .../data/ir_config_parameter.xml | 4 +- .../data/ir_cron.xml | 6 +- .../models/__init__.py | 0 .../models/ir_attachment.py | 100 ++++ .../readme/CONFIGURE.rst | 3 + .../readme/CONTRIBUTORS.rst | 2 + .../readme/DESCRIPTION.rst | 3 + attachment_indexation_ocr/readme/INSTALL.rst | 9 + attachment_indexation_ocr/readme/USAGE.rst | 4 + .../static/description/icon.png | Bin .../static/description/index.html | 453 ++++++++++++++++++ .../tests/__init__.py | 0 .../tests/test_document_ocr.py | 65 +++ document_ocr/README.rst | 86 ---- document_ocr/models/ir_attachment.py | 98 ---- document_ocr/tests/test_document_ocr.py | 65 --- .../odoo/addons/attachment_indexation_ocr | 1 + .../setup.py | 0 setup/document_ocr/odoo/addons/document_ocr | 1 - 22 files changed, 755 insertions(+), 263 deletions(-) create mode 100644 attachment_indexation_ocr/README.rst rename {document_ocr => attachment_indexation_ocr}/__init__.py (100%) rename {document_ocr => attachment_indexation_ocr}/__manifest__.py (75%) rename {document_ocr => attachment_indexation_ocr}/data/ir_config_parameter.xml (73%) rename {document_ocr => attachment_indexation_ocr}/data/ir_cron.xml (66%) rename {document_ocr => attachment_indexation_ocr}/models/__init__.py (100%) create mode 100644 attachment_indexation_ocr/models/ir_attachment.py create mode 100644 attachment_indexation_ocr/readme/CONFIGURE.rst create mode 100644 attachment_indexation_ocr/readme/CONTRIBUTORS.rst create mode 100644 attachment_indexation_ocr/readme/DESCRIPTION.rst create mode 100644 attachment_indexation_ocr/readme/INSTALL.rst create mode 100644 attachment_indexation_ocr/readme/USAGE.rst rename {document_ocr => attachment_indexation_ocr}/static/description/icon.png (100%) 
create mode 100644 attachment_indexation_ocr/static/description/index.html rename {document_ocr => attachment_indexation_ocr}/tests/__init__.py (100%) create mode 100644 attachment_indexation_ocr/tests/test_document_ocr.py delete mode 100644 document_ocr/README.rst delete mode 100644 document_ocr/models/ir_attachment.py delete mode 100644 document_ocr/tests/test_document_ocr.py create mode 120000 setup/attachment_indexation_ocr/odoo/addons/attachment_indexation_ocr rename setup/{document_ocr => attachment_indexation_ocr}/setup.py (100%) delete mode 120000 setup/document_ocr/odoo/addons/document_ocr diff --git a/attachment_indexation_ocr/README.rst b/attachment_indexation_ocr/README.rst new file mode 100644 index 00000000..1fe07ecb --- /dev/null +++ b/attachment_indexation_ocr/README.rst @@ -0,0 +1,107 @@ +================= +OCR for documents +================= + +.. + !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + !! This file is generated by oca-gen-addon-readme !! + !! changes will be overwritten. !! + !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + !! source digest: sha256:488ceb3b031015c08770a769f1357f5dcd462d28eaca37048790a61ef9a5feab + !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +.. |badge1| image:: https://img.shields.io/badge/maturity-Beta-yellow.png + :target: https://odoo-community.org/page/development-status + :alt: Beta +.. |badge2| image:: https://img.shields.io/badge/licence-AGPL--3-blue.png + :target: http://www.gnu.org/licenses/agpl-3.0-standalone.html + :alt: License: AGPL-3 +.. |badge3| image:: https://img.shields.io/badge/github-OCA%2Fknowledge-lightgray.png?logo=github + :target: https://github.com/OCA/knowledge/tree/16.0/attachment_indexation_ocr + :alt: OCA/knowledge +.. |badge4| image:: https://img.shields.io/badge/weblate-Translate%20me-F47D42.png + :target: https://translation.odoo-community.org/projects/knowledge-16-0/knowledge-16-0-attachment_indexation_ocr + :alt: Translate me on Weblate +.. 
|badge5| image:: https://img.shields.io/badge/runboat-Try%20me-875A7B.png + :target: https://runboat.odoo-community.org/builds?repo=OCA/knowledge&target_branch=16.0 + :alt: Try me on Runboat + +|badge1| |badge2| |badge3| |badge4| |badge5| + +This module was written to make uploaded documents, for example scans, searchable by running OCR on them. + +It supports all image formats `Pillow supports `_ for reading and PDFs. + +**Table of contents** + +.. contents:: + :local: + +Installation +============ + +To install this module, you need to: + +#. install tesseract and the language(s) your documents use +#. if you want to support OCR on PDFs, install imagemagick +#. install the module itself + +On an Debian or Ubuntu system you would typically run:: + + $ sudo apt-get install tesseract-ocr imagemagick + +Configuration +============= + +To configure this module, go to: + +#. Settings/Technical/Parameters/System parameters and review the parameters with names ocr.* + +Usage +===== + +By default, character recognition is done asynchronously by a cronjob at night. +This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish. +The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ` `Settings``. +In case you want to force the OCR to be done immediately, set configuration parameter ``ocr.synchronous`` to value ``True``. + +Bug Tracker +=========== + +Bugs are tracked on `GitHub Issues `_. +In case of trouble, please check there if your issue has already been reported. +If you spotted it first, help us to smash it by providing a detailed and welcomed +`feedback `_. + +Do not contact contributors directly about support or help with technical issues. + +Credits +======= + +Authors +~~~~~~~ + +* Therp BV + +Contributors +~~~~~~~~~~~~ + +* Holger Brunn +* len-foss + +Maintainers +~~~~~~~~~~~ + +This module is maintained by the OCA. + +.. 
image:: https://odoo-community.org/logo.png + :alt: Odoo Community Association + :target: https://odoo-community.org + +OCA, or the Odoo Community Association, is a nonprofit organization whose +mission is to support the collaborative development of Odoo features and +promote its widespread use. + +This module is part of the `OCA/knowledge `_ project on GitHub. + +You are welcome to contribute. To learn how please visit https://odoo-community.org/page/Contribute. diff --git a/document_ocr/__init__.py b/attachment_indexation_ocr/__init__.py similarity index 100% rename from document_ocr/__init__.py rename to attachment_indexation_ocr/__init__.py diff --git a/document_ocr/__manifest__.py b/attachment_indexation_ocr/__manifest__.py similarity index 75% rename from document_ocr/__manifest__.py rename to attachment_indexation_ocr/__manifest__.py index 5d2c5be9..3c2f0e0a 100644 --- a/document_ocr/__manifest__.py +++ b/attachment_indexation_ocr/__manifest__.py @@ -5,18 +5,13 @@ "version": "16.0.1.0.0", "author": "Therp BV,Odoo Community Association (OCA)", "license": "AGPL-3", + "website": "https://github.com/OCA/knowledge", "category": "Knowledge Management", "summary": "Run character recognition on uploaded files", - "depends": [ - "document", - ], + "depends": ["attachment_indexation"], "data": [ "data/ir_cron.xml", "data/ir_config_parameter.xml", ], - "external_dependencies": { - "bin": [ - "tesseract", - ], - }, + "external_dependencies": {"bin": ["tesseract"]}, } diff --git a/document_ocr/data/ir_config_parameter.xml b/attachment_indexation_ocr/data/ir_config_parameter.xml similarity index 73% rename from document_ocr/data/ir_config_parameter.xml rename to attachment_indexation_ocr/data/ir_config_parameter.xml index e18fab65..83215491 100644 --- a/document_ocr/data/ir_config_parameter.xml +++ b/attachment_indexation_ocr/data/ir_config_parameter.xml @@ -1,11 +1,11 @@ - document_ocr.synchronous + ocr.synchronous False - document_ocr.dpi + ocr.dpi 300 diff --git 
a/document_ocr/data/ir_cron.xml b/attachment_indexation_ocr/data/ir_cron.xml similarity index 66% rename from document_ocr/data/ir_cron.xml rename to attachment_indexation_ocr/data/ir_cron.xml index 12d3492a..241c9850 100644 --- a/document_ocr/data/ir_cron.xml +++ b/attachment_indexation_ocr/data/ir_cron.xml @@ -4,9 +4,9 @@ Run OCR on uploaded documents days 1 - ir.attachment - _ocr_cron + + code + model._ocr_cron(limit=100) -1 - (100,) diff --git a/document_ocr/models/__init__.py b/attachment_indexation_ocr/models/__init__.py similarity index 100% rename from document_ocr/models/__init__.py rename to attachment_indexation_ocr/models/__init__.py diff --git a/attachment_indexation_ocr/models/ir_attachment.py b/attachment_indexation_ocr/models/ir_attachment.py new file mode 100644 index 00000000..03f0613b --- /dev/null +++ b/attachment_indexation_ocr/models/ir_attachment.py @@ -0,0 +1,100 @@ +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). + +import base64 +import logging +import subprocess +from io import BytesIO + +from PIL import Image + +from odoo import api, models + +_logger = logging.getLogger(__name__) +_MARKER_PHRASE = "[[waiting for OCR]]" + + +class IrAttachment(models.Model): + _inherit = "ir.attachment" + + @api.model + def _get_no_content_strings(self): + return ["image", "application"] + + @api.model + def _not_content(self, text): + return not text or text in self._get_no_content_strings() + + @api.model + def _index(self, bin_data, file_type, checksum=None): + content = super()._index(bin_data, file_type, checksum) + if bin_data and file_type and self._not_content(content): + synchronous = self.env["ir.config_parameter"].get_param("ocr.synchronous") + if synchronous == "True" or self.env.context.get("ocr_force"): + content = self._index_ocr(bin_data, file_type) + else: + content = _MARKER_PHRASE + return content + + @api.model + def _index_ocr(self, bin_data, file_type, dpi=0): + if not dpi: + icp = 
self.env["ir.config_parameter"] + dpi = int(icp.get_param("ocr.dpi", "500")) + if "/" not in file_type: + _logger.warning("Invalid mimetype %s", file_type) + return None + top_type, sub_type = file_type.split("/", 1) + if sub_type == "pdf": + # tesseract only supports image of at most 32K pixels + # depending on the number of pages, we have to either split + # into different batches or reduce the dpi; + # The maximum width and height are 32767. + image_data = self._index_ocr_get_data_pdf(bin_data, dpi) # TODO + else: + image_data = BytesIO() + try: + i = Image.open(BytesIO(bin_data)) + i.save(image_data, "png", dpi=(dpi, dpi)) + except IOError: + _logger.exception("Failed to OCR image") + return None + process = subprocess.Popen( + ["tesseract", "stdin", "stdout"], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, stderr = process.communicate(image_data.getvalue()) + if process.returncode: + _logger.error("Error during OCR: %s", stderr) + return stdout.decode("utf-8") + + @api.model + def _index_ocr_get_data_pdf(self, bin_data, dpi): + process = subprocess.Popen( + ["convert", "-density", str(dpi), "-", "-append", "png32:-"], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, stderr = process.communicate(bin_data) + if stderr: + _logger.error("Error converting to PDF: %s", stderr) + return BytesIO(stdout) + + @api.model + def _ocr_cron(self, limit=None): + domain = [("index_content", "=", _MARKER_PHRASE)] + recs = self.with_context(ocr_force=True).search(domain, limit=limit) + recs.perform_ocr() + + def perform_ocr(self): + for rec in self: + if not rec.datas: + index_content = "" # the _MARKER_PHRASE should be removed + else: + bin_data = base64.b64decode(rec.datas) + ctx = {"ocr_force": True} + index_content = rec.with_context(**ctx)._index(bin_data, rec.mimetype) + rec.write({"index_content": index_content}) diff --git a/attachment_indexation_ocr/readme/CONFIGURE.rst 
b/attachment_indexation_ocr/readme/CONFIGURE.rst new file mode 100644 index 00000000..4d0c4a22 --- /dev/null +++ b/attachment_indexation_ocr/readme/CONFIGURE.rst @@ -0,0 +1,3 @@ +To configure this module, go to: + +#. Settings/Technical/Parameters/System parameters and review the parameters with names ocr.* diff --git a/attachment_indexation_ocr/readme/CONTRIBUTORS.rst b/attachment_indexation_ocr/readme/CONTRIBUTORS.rst new file mode 100644 index 00000000..c9f9d4dc --- /dev/null +++ b/attachment_indexation_ocr/readme/CONTRIBUTORS.rst @@ -0,0 +1,2 @@ +* Holger Brunn +* len-foss diff --git a/attachment_indexation_ocr/readme/DESCRIPTION.rst b/attachment_indexation_ocr/readme/DESCRIPTION.rst new file mode 100644 index 00000000..74068ce0 --- /dev/null +++ b/attachment_indexation_ocr/readme/DESCRIPTION.rst @@ -0,0 +1,3 @@ +This module was written to make uploaded documents, for example scans, searchable by running OCR on them. + +It supports all image formats `Pillow supports `_ for reading and PDFs. diff --git a/attachment_indexation_ocr/readme/INSTALL.rst b/attachment_indexation_ocr/readme/INSTALL.rst new file mode 100644 index 00000000..84b6e1ec --- /dev/null +++ b/attachment_indexation_ocr/readme/INSTALL.rst @@ -0,0 +1,9 @@ +To install this module, you need to: + +#. install tesseract and the language(s) your documents use +#. if you want to support OCR on PDFs, install imagemagick +#. install the module itself + +On an Debian or Ubuntu system you would typically run:: + + $ sudo apt-get install tesseract-ocr imagemagick diff --git a/attachment_indexation_ocr/readme/USAGE.rst b/attachment_indexation_ocr/readme/USAGE.rst new file mode 100644 index 00000000..128591e2 --- /dev/null +++ b/attachment_indexation_ocr/readme/USAGE.rst @@ -0,0 +1,4 @@ +By default, character recognition is done asynchronously by a cronjob at night. +This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish. 
+The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ``Settings``. +In case you want to force the OCR to be done immediately, set configuration parameter ``ocr.synchronous`` to value ``True``. diff --git a/document_ocr/static/description/icon.png b/attachment_indexation_ocr/static/description/icon.png similarity index 100% rename from document_ocr/static/description/icon.png rename to attachment_indexation_ocr/static/description/icon.png diff --git a/attachment_indexation_ocr/static/description/index.html b/attachment_indexation_ocr/static/description/index.html new file mode 100644 index 00000000..5690b115 --- /dev/null +++ b/attachment_indexation_ocr/static/description/index.html @@ -0,0 +1,453 @@ + + + + + + +OCR for documents + + + +
+

OCR for documents

+ + +

Beta License: AGPL-3 OCA/knowledge Translate me on Weblate Try me on Runboat

+

This module was written to make uploaded documents, for example scans, searchable by running OCR on them.

+

It supports all image formats Pillow supports for reading and PDFs.

+

Table of contents

+ +
+

Installation

+

To install this module, you need to:

+
    +
  1. install tesseract and the language(s) your documents use
  2. +
  3. if you want to support OCR on PDFs, install imagemagick
  4. +
  5. install the module itself
  6. +
+

On a Debian or Ubuntu system you would typically run:

+
+$ sudo apt-get install tesseract-ocr imagemagick
+
+
+
+

Configuration

+

To configure this module, go to:

+
    +
  1. Settings/Technical/Parameters/System parameters and review the parameters with names ocr.*
  2. +
+
+
+

Usage

+

By default, character recognition is done asynchronously by a cronjob at night. +This is because the recognition process takes a while and you don’t want to make your users wait for the indexation to finish. +The interval to run the cronjob can be adjusted to your needs in the Scheduled Actions menu, under Settings. +In case you want to force the OCR to be done immediately, set configuration parameter ocr.synchronous to value True.

+
+
+

Bug Tracker

+

Bugs are tracked on GitHub Issues. +In case of trouble, please check there if your issue has already been reported. +If you spotted it first, help us to smash it by providing a detailed and welcomed +feedback.

+

Do not contact contributors directly about support or help with technical issues.

+
+
+

Credits

+
+

Authors

+
    +
  • Therp BV
  • +
+
+
+

Contributors

+ +
+
+

Maintainers

+

This module is maintained by the OCA.

+Odoo Community Association +

OCA, or the Odoo Community Association, is a nonprofit organization whose +mission is to support the collaborative development of Odoo features and +promote its widespread use.

+

This module is part of the OCA/knowledge project on GitHub.

+

You are welcome to contribute. To learn how please visit https://odoo-community.org/page/Contribute.

+
+
+
+ + diff --git a/document_ocr/tests/__init__.py b/attachment_indexation_ocr/tests/__init__.py similarity index 100% rename from document_ocr/tests/__init__.py rename to attachment_indexation_ocr/tests/__init__.py diff --git a/attachment_indexation_ocr/tests/test_document_ocr.py b/attachment_indexation_ocr/tests/test_document_ocr.py new file mode 100644 index 00000000..70df3d0e --- /dev/null +++ b/attachment_indexation_ocr/tests/test_document_ocr.py @@ -0,0 +1,65 @@ +# © 2016 Therp BV +# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). +import base64 +import subprocess +from io import BytesIO + +from PIL import Image, ImageDraw, ImageFont + +from odoo.tests.common import TransactionCase + +from ..models.ir_attachment import _MARKER_PHRASE + + +def _get_some_system_font(): + """Get a font that is available on the system""" + output = subprocess.check_output(["fc-list"]) + for line in output.splitlines(): + line = line.decode("utf-8") + if "otf" in line.lower() and "roman" in line.lower(): + return line.split(":")[0] + raise RuntimeError("No suitable font found!") + + +font_path = _get_some_system_font() +ir_config_parameter_key = "ocr.synchronous" +result_string = "Hello world" + + +def _get_image_data(frmt="png"): + test_image = Image.new("RGB", (200, 30)) + draw = ImageDraw.Draw(test_image) + draw.text((3, 3), result_string, font=ImageFont.truetype(font_path, 24)) + data = BytesIO() + test_image.save(data, frmt) + return data.getvalue() + + +class TestDocumentOcr(TransactionCase): + def test_document_ocr_png(self): + self.env["ir.config_parameter"].set_param(ir_config_parameter_key, "True") + bin_data = _get_image_data("png") + result = self.env["ir.attachment"]._index(bin_data, "image/png") + self.assertEqual(result.strip(), result_string) + + def test_document_ocr_ppm(self): + """It works on images that don't have a specific mimetype""" + self.env["ir.config_parameter"].set_param(ir_config_parameter_key, "True") + bin_data = 
_get_image_data("ppm") + result = self.env["ir.attachment"]._index(bin_data, "application/octet-stream") + self.assertEqual(result.strip(), result_string) + + def test_document_ocr_pdf(self): + self.env["ir.config_parameter"].set_param(ir_config_parameter_key, "True") + bin_data = _get_image_data("pdf") + result = self.env["ir.attachment"]._index(bin_data, "application/pdf") + self.assertEqual(result.strip(), result_string) + + def test_document_ocr_cron(self): + self.env["ir.config_parameter"].set_param(ir_config_parameter_key, "False") + bin_data = _get_image_data("png") + vals = {"name": "testattachment", "datas": base64.b64encode(bin_data)} + attachment = self.env["ir.attachment"].create(vals) + self.assertEqual(attachment.index_content, _MARKER_PHRASE) + attachment._ocr_cron() + self.assertEqual(attachment.index_content.strip(), result_string) diff --git a/document_ocr/README.rst b/document_ocr/README.rst deleted file mode 100644 index c4d667f8..00000000 --- a/document_ocr/README.rst +++ /dev/null @@ -1,86 +0,0 @@ -.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg - :target: http://www.gnu.org/licenses/agpl-3.0-standalone.html - :alt: License: AGPL-3 - -================= -OCR for documents -================= - -This module was written to make uploaded documents, for example scans, searchable by running OCR on them. - -It supports all image formats `Pillow supports `_ for reading and PDFs. - -Installation -============ - -To install this module, you need to: - -#. install tesseract and the language(s) your documents use -#. if you want to support OCR on PDFs, install imagemagick -#. install the module itself - -On an Debian or Ubuntu system you would typically run:: - - $ sudo apt-get install tesseract-ocr imagemagick - - -Configuration -============= - -To configure this module, go to: - -#. 
Settings/Technical/Parameters/System parameters and review the parameters with names document_ocr.* - -Usage -===== - -By default, character recognition is done asynchronously by a cronjob at night. -This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish. -The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ` `Settings``. -In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``. - -.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas - :alt: Try me on Runbot - :target: https://runbot.odoo-community.org/runbot/118/8.0 - -Bug Tracker -=========== - -Bugs are tracked on `GitHub Issues `_. -In case of trouble, please check there if your issue has already been reported. -If you spotted it first, help us smashing it by providing a detailed and welcomed feedback. - -Credits -======= - -The actual work ---------------- - -* `tesseract `_ - -Images ------- - -* Odoo Community Association: `Icon `_. - -Contributors ------------- - -* Holger Brunn - -Do not contact contributors directly about help with questions or problems concerning this addon, but use the `community mailing list `_ or the `appropriate specialized mailinglist `_ for help, and the bug tracker linked in `Bug Tracker`_ above for technical issues. - -Maintainer ----------- - -.. image:: https://odoo-community.org/logo.png - :alt: Odoo Community Association - :target: https://odoo-community.org - -This module is maintained by the OCA. - -OCA, or the Odoo Community Association, is a nonprofit organization whose -mission is to support the collaborative development of Odoo features and -promote its widespread use. - -To contribute to this module, please visit https://odoo-community.org. 
diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py deleted file mode 100644 index 3cdb93d8..00000000 --- a/document_ocr/models/ir_attachment.py +++ /dev/null @@ -1,98 +0,0 @@ -# © 2016 Therp BV -# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). -import logging -import subprocess - -from PIL import Image -from StringIO import StringIO - -from odoo import api, models - -_logger = logging.getLogger(__name__) -_MARKER_PHRASE = "[[waiting for OCR]]" - - -class IrAttachment(models.Model): - _inherit = "ir.attachment" - - @api.model - def _index(self, data, datas_fname, file_type): - mimetype, content = super(IrAttachment, self)._index( - data, datas_fname, file_type - ) - if data and mimetype and (not content or content == "image"): - has_synchr_param = ( - self.env["ir.config_parameter"].get_param( - "document_ocr.synchronous", "False" - ) - == "True" - ) - has_force_flag = self.env.context.get("document_ocr_force") - if has_synchr_param or has_force_flag: - content = self._index_ocr(mimetype, data, datas_fname, file_type) - else: - content = _MARKER_PHRASE - - return mimetype, content - - @api.model - def _index_ocr(self, mimetype, data, datas_fname, file_type): - dpi = int(self.env["ir.config_parameter"].get_param("document_ocr.dpi", "500")) - if "/" not in mimetype: - _logger.warning("Invalid mimetype %s", mimetype) - return None - top_type, sub_type = mimetype.split("/", 1) - if hasattr(self, "_index_ocr_get_data_%s" % sub_type): - image_data = getattr(self, "_index_ocr_get_data_%s" % sub_type)( - data, datas_fname, file_type, dpi - ) - else: - image_data = StringIO() - try: - Image.open(StringIO(data)).save(image_data, "png", dpi=(dpi, dpi)) - except IOError: - _logger.exception("Failed to OCR image") - return None - process = subprocess.Popen( - ["tesseract", "stdin", "stdout"], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - stdout, stderr = 
process.communicate(image_data.getvalue()) - if process.returncode: - _logger.error("Error during OCR: %s", stderr) - return stdout - - @api.model - def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi): - process = subprocess.Popen( - ["convert", "-density", str(dpi), "-", "-append", "png32:-"], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - stdout, stderr = process.communicate(data) - if stderr: - _logger.error("Error converting to PDF: %s", stderr) - return StringIO(stdout) - - @api.model - def _ocr_cron(self, limit=0): - for this in self.with_context(document_ocr_force=True).search( - [ - ("index_content", "=", _MARKER_PHRASE), - ], - limit=limit, - ): - if not this.datas: - continue - file_type, index_content = this._index( - this.datas.decode("base64"), this.datas_fname, this.file_type - ) - this.write( - { - "file_type": file_type, - "index_content": index_content, - } - ) diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py deleted file mode 100644 index c0084b81..00000000 --- a/document_ocr/tests/test_document_ocr.py +++ /dev/null @@ -1,65 +0,0 @@ -# © 2016 Therp BV -# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). 
-from PIL import Image, ImageDraw, ImageFont -from StringIO import StringIO - -from odoo.tests.common import TransactionCase -from odoo.tools.misc import mute_logger - -from ..models.ir_attachment import _MARKER_PHRASE - - -class TestDocumentOcr(TransactionCase): - def test_document_ocr(self): - self.env["ir.config_parameter"].set_param("document_ocr.synchronous", "True") - test_image = Image.new("RGB", (200, 30)) - draw = ImageDraw.Draw(test_image) - draw.text( - (3, 3), - "Hello world", - font=ImageFont.truetype( - "/usr/share/fonts/truetype/inconsolata/Inconsolata.otf", 24 - ), - ) - # test a plain image - data = StringIO() - test_image.save(data, "png") - result = self.env["ir.attachment"]._index(data.getvalue(), "test.png", None) - self.assertEqual(result[1].strip(), "Hello world") - # should also work for pdfs if supported, protect against - # ancient pillows - if ( - hasattr(Image, "registered_extensions") - and "PDF" in Image.registered_extensions().values() - ): - data = StringIO() - test_image.save(data, "pdf", resolution=300) - result = self.env["ir.attachment"]._index(data.getvalue(), "test.pdf", None) - self.assertEqual(result[1].strip(), "Hello world") - # check cron - self.env["ir.config_parameter"].set_param("document_ocr.synchronous", "False") - attachment = self.env["ir.attachment"].create( - { - "name": "testattachment", - "datas": data.getvalue().encode("base64"), - } - ) - self.assertEqual(attachment.index_content, _MARKER_PHRASE) - attachment._ocr_cron() - self.assertEqual(attachment.index_content.strip(), "Hello world") - # and for an unreadable image, we expect an error - if ( - hasattr(Image, "registered_extensions") - and "PALM" in Image.registered_extensions().values() - ): - self.env["ir.config_parameter"].set_param( - "document_ocr.synchronous", "True" - ) - data = StringIO() - test_image = Image.new("1", (200, 30)) - test_image.save(data, "Palm") - with mute_logger("openerp.addons.document_ocr.models.ir_attachment"): - result = 
self.env["ir.attachment"]._index( - data.getvalue(), "test.palm", None - ) - self.assertEqual(result[1], None) diff --git a/setup/attachment_indexation_ocr/odoo/addons/attachment_indexation_ocr b/setup/attachment_indexation_ocr/odoo/addons/attachment_indexation_ocr new file mode 120000 index 00000000..970e69df --- /dev/null +++ b/setup/attachment_indexation_ocr/odoo/addons/attachment_indexation_ocr @@ -0,0 +1 @@ +../../../../attachment_indexation_ocr \ No newline at end of file diff --git a/setup/document_ocr/setup.py b/setup/attachment_indexation_ocr/setup.py similarity index 100% rename from setup/document_ocr/setup.py rename to setup/attachment_indexation_ocr/setup.py diff --git a/setup/document_ocr/odoo/addons/document_ocr b/setup/document_ocr/odoo/addons/document_ocr deleted file mode 120000 index 142c7cf3..00000000 --- a/setup/document_ocr/odoo/addons/document_ocr +++ /dev/null @@ -1 +0,0 @@ -../../../../document_ocr \ No newline at end of file