From 6196f307ee49cb9c75bd3976b8830893a46a5416 Mon Sep 17 00:00:00 2001 From: len Date: Thu, 7 Sep 2023 13:06:15 +0200 Subject: [PATCH] [IMP] attachment_indexation_ocr: convert pdf with fitz This is more performant and makes it easy to split the PDF into pages, which avoids running into errors with the maximum image size supported by tesseract. --- attachment_indexation_ocr/__manifest__.py | 3 +- .../models/ir_attachment.py | 49 +++++++++---------- requirements.txt | 2 + 3 files changed, 28 insertions(+), 26 deletions(-) create mode 100644 requirements.txt diff --git a/attachment_indexation_ocr/__manifest__.py b/attachment_indexation_ocr/__manifest__.py index 3c2f0e0a..aec78bea 100644 --- a/attachment_indexation_ocr/__manifest__.py +++ b/attachment_indexation_ocr/__manifest__.py @@ -1,4 +1,5 @@ # © 2016 Therp BV +# Copyright 2023 len-foss/Financial Way # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). { "name": "OCR for documents", @@ -13,5 +14,5 @@ "data/ir_cron.xml", "data/ir_config_parameter.xml", ], - "external_dependencies": {"bin": ["tesseract"]}, + "external_dependencies": {"bin": ["tesseract"], "python": ["PyMuPDF"]}, } diff --git a/attachment_indexation_ocr/models/ir_attachment.py b/attachment_indexation_ocr/models/ir_attachment.py index ad5d85ab..fce01053 100644 --- a/attachment_indexation_ocr/models/ir_attachment.py +++ b/attachment_indexation_ocr/models/ir_attachment.py @@ -6,6 +6,7 @@ import logging import subprocess from io import BytesIO +import fitz from PIL import Image from odoo import api, models @@ -45,14 +46,12 @@ class IrAttachment(models.Model): _logger.warning("Invalid mimetype %s", file_type) return None top_type, sub_type = file_type.split("/", 1) + images = [] if sub_type == "pdf": - # tesseract only supports image of at most 32K pixels - # depending on the number of pages, we have to either split - # into different batches or reduce the dpi; - # The maximum width and height are 32767. 
- image_data = self._index_ocr_get_data_pdf(bin_data, dpi) # TODO + images += self._index_ocr_get_data_pdf(bin_data, dpi) # TODO else: image_data = BytesIO() + images.append(image_data) try: i = Image.open(BytesIO(bin_data)) i.save(image_data, "png", dpi=(dpi, dpi)) @@ -64,29 +63,29 @@ class IrAttachment(models.Model): # no check that this lang has been correctly installed; # the corresponding tessdata should be listed by `tesseract --list-langs` tesseract_command += ["-l", self.env.context["ocr_lang"]] - process = subprocess.Popen( - tesseract_command, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - stdout, stderr = process.communicate(image_data.getvalue()) - if process.returncode: - _logger.error("Error during OCR: %s", stderr) - return stdout.decode("utf-8") + result = "" + for im in images: + process = subprocess.Popen( + tesseract_command, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, stderr = process.communicate(im.getvalue()) + if process.returncode: + _logger.error("Error during OCR: %s", stderr) + result += stdout.decode("utf-8") + return result @api.model def _index_ocr_get_data_pdf(self, bin_data, dpi): - process = subprocess.Popen( - ["convert", "-density", str(dpi), "-", "-append", "png32:-"], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - stdout, stderr = process.communicate(bin_data) - if stderr: - _logger.error("Error converting to PDF: %s", stderr) - return BytesIO(stdout) + # tesseract only supports image of at most 32K pixels in any dimension + # it is thus better to have a list of images than a single one + res = [] + for page in fitz.open(stream=bin_data, filetype="pdf"): + pix = page.get_pixmap(dpi=dpi, alpha=False) + res.append(BytesIO(pix.tobytes("png"))) + return res @api.model def _ocr_cron(self, limit=None): diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..0e9fc8cd --- /dev/null +++ 
b/requirements.txt @@ -0,0 +1,2 @@ +# generated from manifests external_dependencies +PyMuPDF