mirror of
https://github.com/OCA/knowledge.git
synced 2025-07-27 19:08:42 -06:00
[IMP] attachment_indexation_ocr: convert pdf with fitz
This is more performant and makes it easy to split the PDF into pages, which avoids errors caused by tesseract's maximum image size.
This commit is contained in:
parent
73cff87095
commit
6196f307ee
@ -1,4 +1,5 @@
|
|||||||
# © 2016 Therp BV <http://therp.nl>
|
# © 2016 Therp BV <http://therp.nl>
|
||||||
|
# Copyright 2023 len-foss/Financial Way
|
||||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||||
{
|
{
|
||||||
"name": "OCR for documents",
|
"name": "OCR for documents",
|
||||||
@ -13,5 +14,5 @@
|
|||||||
"data/ir_cron.xml",
|
"data/ir_cron.xml",
|
||||||
"data/ir_config_parameter.xml",
|
"data/ir_config_parameter.xml",
|
||||||
],
|
],
|
||||||
"external_dependencies": {"bin": ["tesseract"]},
|
"external_dependencies": {"bin": ["tesseract"], "python": ["PyMuPDF"]},
|
||||||
}
|
}
|
||||||
|
@ -6,6 +6,7 @@ import logging
|
|||||||
import subprocess
|
import subprocess
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
|
import fitz
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from odoo import api, models
|
from odoo import api, models
|
||||||
@ -45,14 +46,12 @@ class IrAttachment(models.Model):
|
|||||||
_logger.warning("Invalid mimetype %s", file_type)
|
_logger.warning("Invalid mimetype %s", file_type)
|
||||||
return None
|
return None
|
||||||
top_type, sub_type = file_type.split("/", 1)
|
top_type, sub_type = file_type.split("/", 1)
|
||||||
|
images = []
|
||||||
if sub_type == "pdf":
|
if sub_type == "pdf":
|
||||||
# tesseract only supports image of at most 32K pixels
|
images += self._index_ocr_get_data_pdf(bin_data, dpi) # TODO
|
||||||
# depending on the number of pages, we have to either split
|
|
||||||
# into different batches or reduce the dpi;
|
|
||||||
# The maximum width and height are 32767.
|
|
||||||
image_data = self._index_ocr_get_data_pdf(bin_data, dpi) # TODO
|
|
||||||
else:
|
else:
|
||||||
image_data = BytesIO()
|
image_data = BytesIO()
|
||||||
|
images.append(image_data)
|
||||||
try:
|
try:
|
||||||
i = Image.open(BytesIO(bin_data))
|
i = Image.open(BytesIO(bin_data))
|
||||||
i.save(image_data, "png", dpi=(dpi, dpi))
|
i.save(image_data, "png", dpi=(dpi, dpi))
|
||||||
@ -64,29 +63,29 @@ class IrAttachment(models.Model):
|
|||||||
# no check that this lang has been correctly installed;
|
# no check that this lang has been correctly installed;
|
||||||
# the corresponding tessdata should be listed by `tesseract --list-langs`
|
# the corresponding tessdata should be listed by `tesseract --list-langs`
|
||||||
tesseract_command += ["-l", self.env.context["ocr_lang"]]
|
tesseract_command += ["-l", self.env.context["ocr_lang"]]
|
||||||
process = subprocess.Popen(
|
result = ""
|
||||||
tesseract_command,
|
for im in images:
|
||||||
stdin=subprocess.PIPE,
|
process = subprocess.Popen(
|
||||||
stdout=subprocess.PIPE,
|
tesseract_command,
|
||||||
stderr=subprocess.PIPE,
|
stdin=subprocess.PIPE,
|
||||||
)
|
stdout=subprocess.PIPE,
|
||||||
stdout, stderr = process.communicate(image_data.getvalue())
|
stderr=subprocess.PIPE,
|
||||||
if process.returncode:
|
)
|
||||||
_logger.error("Error during OCR: %s", stderr)
|
stdout, stderr = process.communicate(im.getvalue())
|
||||||
return stdout.decode("utf-8")
|
if process.returncode:
|
||||||
|
_logger.error("Error during OCR: %s", stderr)
|
||||||
|
result += stdout.decode("utf-8")
|
||||||
|
return result
|
||||||
|
|
||||||
@api.model
|
@api.model
|
||||||
def _index_ocr_get_data_pdf(self, bin_data, dpi):
|
def _index_ocr_get_data_pdf(self, bin_data, dpi):
|
||||||
process = subprocess.Popen(
|
# tesseract only supports image of at most 32K pixels in any dimension
|
||||||
["convert", "-density", str(dpi), "-", "-append", "png32:-"],
|
# it is thus better to have a list of images than a single one
|
||||||
stdin=subprocess.PIPE,
|
res = []
|
||||||
stdout=subprocess.PIPE,
|
for page in fitz.open(stream=bin_data, filetype="pdf"):
|
||||||
stderr=subprocess.PIPE,
|
pix = page.get_pixmap(dpi=dpi, alpha=False)
|
||||||
)
|
res.append(BytesIO(pix.tobytes("png")))
|
||||||
stdout, stderr = process.communicate(bin_data)
|
return res
|
||||||
if stderr:
|
|
||||||
_logger.error("Error converting to PDF: %s", stderr)
|
|
||||||
return BytesIO(stdout)
|
|
||||||
|
|
||||||
@api.model
|
@api.model
|
||||||
def _ocr_cron(self, limit=None):
|
def _ocr_cron(self, limit=None):
|
||||||
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
# generated from manifests external_dependencies
|
||||||
|
PyMuPDF
|
Loading…
Reference in New Issue
Block a user