[IMP] attachment_indexation_ocr: convert pdf with fitz

This is more performant and makes it easy to split the PDF into pages,
which avoids errors caused by tesseract's maximum image size.
This commit is contained in:
len 2023-09-07 13:06:15 +02:00
parent 73cff87095
commit 6196f307ee
3 changed files with 28 additions and 26 deletions

View File

@ -1,4 +1,5 @@
# © 2016 Therp BV <http://therp.nl> # © 2016 Therp BV <http://therp.nl>
# Copyright 2023 len-foss/Financial Way
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). # License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
{ {
"name": "OCR for documents", "name": "OCR for documents",
@ -13,5 +14,5 @@
"data/ir_cron.xml", "data/ir_cron.xml",
"data/ir_config_parameter.xml", "data/ir_config_parameter.xml",
], ],
"external_dependencies": {"bin": ["tesseract"]}, "external_dependencies": {"bin": ["tesseract"], "python": ["PyMuPDF"]},
} }

View File

@ -6,6 +6,7 @@ import logging
import subprocess import subprocess
from io import BytesIO from io import BytesIO
import fitz
from PIL import Image from PIL import Image
from odoo import api, models from odoo import api, models
@ -45,14 +46,12 @@ class IrAttachment(models.Model):
_logger.warning("Invalid mimetype %s", file_type) _logger.warning("Invalid mimetype %s", file_type)
return None return None
top_type, sub_type = file_type.split("/", 1) top_type, sub_type = file_type.split("/", 1)
images = []
if sub_type == "pdf": if sub_type == "pdf":
# tesseract only supports image of at most 32K pixels images += self._index_ocr_get_data_pdf(bin_data, dpi) # TODO
# depending on the number of pages, we have to either split
# into different batches or reduce the dpi;
# The maximum width and height are 32767.
image_data = self._index_ocr_get_data_pdf(bin_data, dpi) # TODO
else: else:
image_data = BytesIO() image_data = BytesIO()
images.append(image_data)
try: try:
i = Image.open(BytesIO(bin_data)) i = Image.open(BytesIO(bin_data))
i.save(image_data, "png", dpi=(dpi, dpi)) i.save(image_data, "png", dpi=(dpi, dpi))
@ -64,29 +63,29 @@ class IrAttachment(models.Model):
# no check that this lang has been correctly installed; # no check that this lang has been correctly installed;
# the corresponding tessdata should be listed by `tesseract --list-langs` # the corresponding tessdata should be listed by `tesseract --list-langs`
tesseract_command += ["-l", self.env.context["ocr_lang"]] tesseract_command += ["-l", self.env.context["ocr_lang"]]
process = subprocess.Popen( result = ""
tesseract_command, for im in images:
stdin=subprocess.PIPE, process = subprocess.Popen(
stdout=subprocess.PIPE, tesseract_command,
stderr=subprocess.PIPE, stdin=subprocess.PIPE,
) stdout=subprocess.PIPE,
stdout, stderr = process.communicate(image_data.getvalue()) stderr=subprocess.PIPE,
if process.returncode: )
_logger.error("Error during OCR: %s", stderr) stdout, stderr = process.communicate(im.getvalue())
return stdout.decode("utf-8") if process.returncode:
_logger.error("Error during OCR: %s", stderr)
result += stdout.decode("utf-8")
return result
@api.model @api.model
def _index_ocr_get_data_pdf(self, bin_data, dpi): def _index_ocr_get_data_pdf(self, bin_data, dpi):
process = subprocess.Popen( # tesseract only supports image of at most 32K pixels in any dimension
["convert", "-density", str(dpi), "-", "-append", "png32:-"], # it is thus better to have a list of images than a single one
stdin=subprocess.PIPE, res = []
stdout=subprocess.PIPE, for page in fitz.open(stream=bin_data, filetype="pdf"):
stderr=subprocess.PIPE, pix = page.get_pixmap(dpi=dpi, alpha=False)
) res.append(BytesIO(pix.tobytes("png")))
stdout, stderr = process.communicate(bin_data) return res
if stderr:
_logger.error("Error converting to PDF: %s", stderr)
return BytesIO(stdout)
@api.model @api.model
def _ocr_cron(self, limit=None): def _ocr_cron(self, limit=None):

2
requirements.txt Normal file
View File

@ -0,0 +1,2 @@
# generated from manifests external_dependencies
PyMuPDF