[ADD] attachment_indexation_mupdf

This commit is contained in:
len 2023-09-08 17:13:43 +02:00
parent f31245a448
commit 7f394be9c5
10 changed files with 102 additions and 0 deletions

View File

@ -0,0 +1,5 @@
=====================================================
Attachments List and Document Indexation with PyMuPDF
=====================================================
This module indexes the text content of PDF documents using PyMuPDF, a state-of-the-art PDF library.

View File

@ -0,0 +1,4 @@
# Copyright 2023 len-foss/Financial Way
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import models

View File

@ -0,0 +1,17 @@
# Copyright 2023 len-foss/Financial Way
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
{
# Odoo addon manifest: a plain literal dict describing the module.
"name": "Attachments List and Document Indexation with PyMuPDF",
"category": "Hidden/Tools",
"version": "16.0.0.0.0",
"summary": "Attachments List and Document Indexation with PyMuPDF",
"author": "len-foss/FinancialWay,Odoo Community Association (OCA)",
"website": "https://github.com/OCA/knowledge",
"license": "AGPL-3",
# The model in this addon overrides ir.attachment._index_pdf and calls
# super() when PyMuPDF is missing; presumably attachment_indexation is the
# addon providing that inherited implementation -- TODO confirm.
"depends": ["attachment_indexation"],
# NOTE(review): per Odoo manifest semantics, auto_install=True should install
# this addon as soon as everything in "depends" is installed -- confirm this
# is intended given the external Python dependency declared below.
"auto_install": True,
"installable": True,
# No data files or web assets are shipped with this addon.
"data": [],
"assets": {},
# PyMuPDF is the distribution that provides the `fitz` module imported by
# the ir.attachment model of this addon.
"external_dependencies": {"python": ["PyMuPDF"]},
}

View File

@ -0,0 +1,4 @@
# Copyright 2023 len-foss/Financial Way
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import ir_attachment

View File

@ -0,0 +1,36 @@
# Copyright 2023 len-foss/Financial Way
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
import io
import logging
from odoo import models
_logger = logging.getLogger(__name__)

# PyMuPDF (imported as ``fitz``) is optional at import time: when it cannot
# be loaded, ``fitz`` is set to None so that callers can test for it, and a
# warning is emitted once when this module is loaded.
try:
    import fitz
except ImportError:
    fitz = None
    # The trailing space matters: the two adjacent literals are concatenated
    # by the parser and previously rendered as "unavailablebecause".
    _logger.warning(
        "Attachment indexation of PDF documents is unavailable "
        "because PyMuPDF cannot be loaded."
    )
class IrAttachment(models.Model):
    """Extend ``ir.attachment`` so that PDF text is extracted with PyMuPDF."""

    _inherit = "ir.attachment"

    def _index_pdf(self, bin_data):
        """Index PDF documents with MuPDF if available.

        When PyMuPDF could not be imported (``fitz`` is None) the call is
        delegated to the inherited implementation.

        :param bin_data: raw PDF content as a bytes-like object
        :return: the text of all pages concatenated in page order; if the
            extraction fails, the text gathered before the failure
            (possibly an empty string)
        """
        if fitz is None:
            return super()._index_pdf(bin_data)
        page_texts = []
        try:
            # The context manager closes the document even when a page
            # fails to parse, releasing the underlying MuPDF resources.
            with fitz.open(stream=io.BytesIO(bin_data), filetype="pdf") as doc:
                for page in doc:
                    page_texts.append(page.get_text())
        except Exception:
            # Indexation is best effort: a broken PDF must not prevent the
            # attachment from being stored. Keep whatever was extracted and
            # record the failure below WARNING level so it can be diagnosed
            # without being reported as an error.
            _logger.info(
                "PyMuPDF could not extract the text of a PDF attachment",
                exc_info=True,
            )
        return "".join(page_texts)

View File

@ -0,0 +1,2 @@
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import test_indexation

View File

@ -0,0 +1,27 @@
# Copyright 2023 len-foss/Financial Way
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
import os
from unittest import skipIf
from odoo.tests.common import TransactionCase, tagged
# Folder containing this test module; fixture files are located relative
# to it.
directory = os.path.dirname(__file__)

# PyMuPDF is optional: keep a None placeholder when the import fails so the
# test below can be skipped instead of erroring out.
try:
    import fitz
except ImportError:
    fitz = None
@tagged("post_install", "-at_install")
class TestCaseIndexation(TransactionCase):
    """Check that PDF content is indexed through ``ir.attachment``."""

    @skipIf(fitz is None, "PyMuPDF is not installed")
    def test_attachment_pdf_indexation(self):
        """Index the bundled sample PDF and compare the extracted text."""
        pdf_path = os.path.join(directory, "files", "test_content.pdf")
        with open(pdf_path, "rb") as pdf_file:
            pdf = pdf_file.read()
        text = self.env["ir.attachment"]._index(pdf, "application/pdf")
        # note that the whitespace character is not the same as with pdfminer
        self.assertEqual(
            text, "TestContent!!\n", "the index content should be correct"
        )

View File

@ -0,0 +1 @@
../../../../attachment_indexation_mupdf

View File

@ -0,0 +1,6 @@
import setuptools
# Packaging entry point: setuptools-odoo is requested at setup time and the
# addon is flagged as an Odoo addon through the ``odoo_addon`` keyword.
setuptools.setup(setup_requires=["setuptools-odoo"], odoo_addon=True)