diff --git a/attachment_indexation_mupdf/README.rst b/attachment_indexation_mupdf/README.rst
new file mode 100644
index 00000000..0529a51d
--- /dev/null
+++ b/attachment_indexation_mupdf/README.rst
@@ -0,0 +1,5 @@
+=====================================================
+Attachments List and Document Indexation with PyMuPDF
+=====================================================
+
+Module to index PDF documents using a state-of-the-art library.
diff --git a/attachment_indexation_mupdf/__init__.py b/attachment_indexation_mupdf/__init__.py
new file mode 100644
index 00000000..ada0d667
--- /dev/null
+++ b/attachment_indexation_mupdf/__init__.py
@@ -0,0 +1,4 @@
+# Copyright 2023 len-foss/Financial Way
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+from . import models
diff --git a/attachment_indexation_mupdf/__manifest__.py b/attachment_indexation_mupdf/__manifest__.py
new file mode 100644
index 00000000..a8986e95
--- /dev/null
+++ b/attachment_indexation_mupdf/__manifest__.py
@@ -0,0 +1,17 @@
+# Copyright 2023 len-foss/Financial Way
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+{
+    "name": "Attachments List and Document Indexation with PyMuPDF",
+    "category": "Hidden/Tools",
+    "version": "16.0.0.0.0",
+    "summary": "Attachments List and Document Indexation with PyMuPDF",
+    "author": "len-foss/FinancialWay,Odoo Community Association (OCA)",
+    "website": "https://github.com/OCA/knowledge",
+    "license": "AGPL-3",
+    "depends": ["attachment_indexation"],
+    "auto_install": True,
+    "installable": True,
+    "data": [],
+    "assets": {},
+    "external_dependencies": {"python": ["PyMuPDF"]},
+}
diff --git a/attachment_indexation_mupdf/models/__init__.py b/attachment_indexation_mupdf/models/__init__.py
new file mode 100644
index 00000000..f407ef53
--- /dev/null
+++ b/attachment_indexation_mupdf/models/__init__.py
@@ -0,0 +1,4 @@
+# Copyright 2023 len-foss/Financial Way
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+from . import ir_attachment
diff --git a/attachment_indexation_mupdf/models/ir_attachment.py b/attachment_indexation_mupdf/models/ir_attachment.py
new file mode 100644
index 00000000..8d7fe7cc
--- /dev/null
+++ b/attachment_indexation_mupdf/models/ir_attachment.py
@@ -0,0 +1,46 @@
+# Copyright 2023 len-foss/Financial Way
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+import io
+import logging
+
+from odoo import models
+
+_logger = logging.getLogger(__name__)
+
+try:
+    import fitz
+except ImportError:
+    fitz = None
+    _logger.warning(
+        "Attachment indexation of PDF documents is unavailable "
+        "because PyMuPDF cannot be loaded."
+    )
+
+
+class IrAttachment(models.Model):
+    _inherit = "ir.attachment"
+
+    def _index_pdf(self, bin_data):
+        """Extract the text of a PDF document with PyMuPDF.
+
+        Falls back to the parent implementation when PyMuPDF could not be
+        imported.
+
+        :param bin_data: raw content of the PDF document
+        :return: the concatenated text of all pages; whatever was extracted
+            before the failure (possibly nothing) if parsing raises
+        """
+        if fitz is None:
+            return super()._index_pdf(bin_data)
+        chunks = []
+        try:
+            # The context manager closes the document even on failure.
+            with fitz.open(stream=io.BytesIO(bin_data), filetype="pdf") as doc:
+                for page in doc:
+                    chunks.append(page.get_text())
+        except Exception:
+            # Indexation is best effort: keep whatever text was extracted
+            # instead of propagating the error, but leave a trace for debugging.
+            _logger.debug("PDF indexation with PyMuPDF failed", exc_info=True)
+        return "".join(chunks)
diff --git a/attachment_indexation_mupdf/tests/__init__.py b/attachment_indexation_mupdf/tests/__init__.py
new file mode 100644
index 00000000..377a63ee
--- /dev/null
+++ b/attachment_indexation_mupdf/tests/__init__.py
@@ -0,0 +1,2 @@
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+from . import test_indexation
diff --git a/attachment_indexation_mupdf/tests/files/test_content.pdf b/attachment_indexation_mupdf/tests/files/test_content.pdf
new file mode 100644
index 00000000..062e1e6e
Binary files /dev/null and b/attachment_indexation_mupdf/tests/files/test_content.pdf differ
diff --git a/attachment_indexation_mupdf/tests/test_indexation.py b/attachment_indexation_mupdf/tests/test_indexation.py
new file mode 100644
index 00000000..90b61802
--- /dev/null
+++ b/attachment_indexation_mupdf/tests/test_indexation.py
@@ -0,0 +1,28 @@
+# Copyright 2023 len-foss/Financial Way
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+import os
+from unittest import skipIf
+
+from odoo.tests.common import TransactionCase, tagged
+
+directory = os.path.dirname(__file__)
+
+try:
+    import fitz
+except ImportError:
+    fitz = None
+
+
+@tagged("post_install", "-at_install")
+class TestCaseIndexation(TransactionCase):
+    @skipIf(fitz is None, "PyMuPDF is not installed")
+    def test_attachment_pdf_indexation(self):
+        """Indexing the sample PDF yields its text content."""
+        with open(os.path.join(directory, "files", "test_content.pdf"), "rb") as file:
+            pdf = file.read()
+        text = self.env["ir.attachment"]._index(pdf, "application/pdf")
+        # note that the whitespace character is not the same as with pdfminer
+        self.assertEqual(
+            text, "TestContent!!\n", "the index content should be correct"
+        )
diff --git a/setup/attachment_indexation_mupdf/odoo/addons/attachment_indexation_mupdf b/setup/attachment_indexation_mupdf/odoo/addons/attachment_indexation_mupdf
new file mode 120000
index 00000000..0ba2d648
--- /dev/null
+++ b/setup/attachment_indexation_mupdf/odoo/addons/attachment_indexation_mupdf
@@ -0,0 +1 @@
+../../../../attachment_indexation_mupdf
\ No newline at end of file
diff --git a/setup/attachment_indexation_mupdf/setup.py b/setup/attachment_indexation_mupdf/setup.py
new file mode 100644
index 00000000..28c57bb6
--- /dev/null
+++ b/setup/attachment_indexation_mupdf/setup.py
@@ -0,0 +1,6 @@
+import setuptools
+
+setuptools.setup(
+    setup_requires=['setuptools-odoo'],
+    odoo_addon=True,
+)