[ADD] attachment_indexation_textract

This commit is contained in:
len 2024-02-15 11:19:29 +01:00
parent 8d1125aa76
commit 1125d299af
7 changed files with 76 additions and 0 deletions

View File

@ -0,0 +1,5 @@
======================================================
Attachments List and Document Indexation with Textract
======================================================
This module indexes the content of attachments with Textract. Install Textract's optional dependencies to support more file formats. Supported formats include .xls and, when antiword is installed, .doc, among others.

View File

@ -0,0 +1,4 @@
# Copyright 2023 len-foss/Financial Way
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import models

View File

@ -0,0 +1,20 @@
# Copyright 2023 len-foss/Financial Way
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
{
    "name": "Attachments Indexation with Textract",
    "category": "Hidden/Tools",
    "version": "16.0.1.0.0",
    # The summary previously named PyMuPDF, which is not what this addon uses.
    "summary": "Attachments List and Document Indexation with Textract",
    "author": "len-foss/FinancialWay,Odoo Community Association (OCA)",
    "website": "https://github.com/OCA/knowledge",
    "license": "AGPL-3",
    "depends": ["attachment_indexation"],
    "auto_install": True,
    "installable": True,
    "data": [],
    "assets": {},
    # textract is imported by models/ir_attachment.py; antiword is the
    # external binary mentioned in the readme for .doc files.
    "external_dependencies": {
        "python": ["textract"],
        "bin": ["antiword"],
    },
}

View File

@ -0,0 +1,4 @@
# Copyright 2023 len-foss/Financial Way
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import ir_attachment

View File

@ -0,0 +1,36 @@
# Copyright 2023 len-foss/Financial Way
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
import logging
import mimetypes
import tempfile
import textract
from odoo import models
_logger = logging.getLogger(__name__)
class IrAttachment(models.Model):
    """Extend ``ir.attachment`` so non-PDF attachments are indexed with textract."""

    _inherit = "ir.attachment"

    def _index(self, bin_data, mimetype, checksum=None):
        """Return the indexable text of an attachment.

        Non-PDF content is first handed to textract. PDF content, and any
        content for which textract yields nothing, is delegated to the parent
        implementation so a result is always produced by one of the indexers.

        :param bin_data: raw binary content of the attachment
        :param mimetype: mimetype of the attachment
        :param checksum: optional checksum forwarded to the parent indexer
        :return: the extracted text, or whatever the parent indexer returns
        """
        if mimetype != "application/pdf":  # mupdf is better
            buf = self.with_context(mimetype=mimetype)._index_textract(bin_data)
            if buf:
                return buf
        # Always fall back to the parent, including for PDF files.
        return super()._index(bin_data, mimetype, checksum=checksum)

    def _index_textract(self, bin_data):
        """Extract text from ``bin_data`` with textract (best effort).

        The mimetype is read from the ``mimetype`` context key and is used to
        guess a file extension for the temporary file handed to textract.

        :param bin_data: raw binary content to index
        :return: the extracted text, or an empty string when there is nothing
            to index or textract fails for any reason
        """
        buf = ""
        if not bin_data:
            return buf
        mimetype = self.env.context.get("mimetype")
        try:
            extension = mimetypes.guess_extension(mimetype)
            with tempfile.NamedTemporaryFile(suffix=extension or "") as tmp_file:
                tmp_file.write(bin_data)
                # textract reopens the file by path: make sure the data has
                # actually reached the file, and keep the file alive meanwhile.
                tmp_file.flush()
                buf = textract.process(tmp_file.name)
            if isinstance(buf, bytes):
                # textract hands back encoded bytes; indexers return text.
                buf = buf.decode("utf-8", errors="replace")
        except Exception:
            # Indexation is best effort: unsupported formats or missing system
            # tools must never prevent the attachment from being saved.
            _logger.info(
                "Textract could not index attachment (mimetype: %s)",
                mimetype,
                exc_info=True,
            )
            buf = ""
        return buf

View File

@ -0,0 +1 @@
../../../../attachment_indexation_textract

View File

@ -0,0 +1,6 @@
import setuptools
# Packaging is delegated to the setuptools-odoo plugin (odoo_addon=True).
setuptools.setup(odoo_addon=True, setup_requires=["setuptools-odoo"])