[IMP] attachment_indexation_ocr: convert pdf with fitz

This is more performant and easily split pages to avoid getting into errors with maximum image size of tessearact.
2025-07-27 19:08:42 -06:00 · 2023-09-07 13:06:15 +02:00 · 2023-09-07 13:06:15 +02:00 · 6196f307ee
commit 6196f307ee
parent 73cff87095
3 changed files with 28 additions and 26 deletions
--- a/attachment_indexation_ocr/manifest.py
+++ b/attachment_indexation_ocr/manifest.py
@ -1,4 +1,5 @@
 # © 2016 Therp BV <http://therp.nl>
+# Copyright 2023 len-foss/Financial Way
 # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
 {
    "name": "OCR for documents",
@ -13,5 +14,5 @@
        "data/ir_cron.xml",
        "data/ir_config_parameter.xml",
    ],
-    "external_dependencies": {"bin": ["tesseract"]},
+    "external_dependencies": {"bin": ["tesseract"], "python": ["PyMuPDF"]},
 }
--- a/attachment_indexation_ocr/models/ir_attachment.py
+++ b/attachment_indexation_ocr/models/ir_attachment.py
@ -6,6 +6,7 @@ import logging
 import subprocess
 from io import BytesIO

+import fitz
 from PIL import Image

 from odoo import api, models
@ -45,14 +46,12 @@ class IrAttachment(models.Model):
            _logger.warning("Invalid mimetype %s", file_type)
            return None
        top_type, sub_type = file_type.split("/", 1)
+        images = []
        if sub_type == "pdf":
-            # tesseract only supports image of at most 32K pixels
-            # depending on the number of pages, we have to either split
-            # into different batches or reduce the dpi;
-            # The maximum width and height are 32767.
-            image_data = self._index_ocr_get_data_pdf(bin_data, dpi)  # TODO
+            images += self._index_ocr_get_data_pdf(bin_data, dpi)  # TODO
        else:
            image_data = BytesIO()
+            images.append(image_data)
            try:
                i = Image.open(BytesIO(bin_data))
                i.save(image_data, "png", dpi=(dpi, dpi))
@ -64,29 +63,29 @@ class IrAttachment(models.Model):
            # no check that this lang has been correctly installed;
            # the corresponding tessdata should be listed by `tesseract --list-langs`
            tesseract_command += ["-l", self.env.context["ocr_lang"]]
+        result = ""
+        for im in images:
            process = subprocess.Popen(
                tesseract_command,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
-        stdout, stderr = process.communicate(image_data.getvalue())
+            stdout, stderr = process.communicate(im.getvalue())
            if process.returncode:
                _logger.error("Error during OCR: %s", stderr)
-        return stdout.decode("utf-8")
+            result += stdout.decode("utf-8")
+        return result

    @api.model
    def _index_ocr_get_data_pdf(self, bin_data, dpi):
-        process = subprocess.Popen(
-            ["convert", "-density", str(dpi), "-", "-append", "png32:-"],
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-        )
-        stdout, stderr = process.communicate(bin_data)
-        if stderr:
-            _logger.error("Error converting to PDF: %s", stderr)
-        return BytesIO(stdout)
+        # tesseract only supports image of at most 32K pixels in any dimension
+        # it is thus better to have a list of images than a single one
+        res = []
+        for page in fitz.open(stream=bin_data, filetype="pdf"):
+            pix = page.get_pixmap(dpi=dpi, alpha=False)
+            res.append(BytesIO(pix.tobytes("png")))
+        return res

    @api.model
    def _ocr_cron(self, limit=None):
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,2 @@
+# generated from manifests external_dependencies
+PyMuPDF