[IMP] attachment_indexation_ocr: option to pass tesseract lang in context

This commit is contained in:
len 2023-09-07 11:47:51 +02:00
parent 5edbe1685b
commit 73cff87095
2 changed files with 17 additions and 2 deletions

View File

@ -59,8 +59,13 @@ class IrAttachment(models.Model):
except IOError: except IOError:
_logger.exception("Failed to OCR image") _logger.exception("Failed to OCR image")
return None return None
tesseract_command = ["tesseract", "stdin", "stdout"]
if self.env.context.get("ocr_lang"):
# no check that this lang has been correctly installed;
# the corresponding tessdata should be listed by `tesseract --list-langs`
tesseract_command += ["-l", self.env.context["ocr_lang"]]
process = subprocess.Popen( process = subprocess.Popen(
["tesseract", "stdin", "stdout"], tesseract_command,
stdin=subprocess.PIPE, stdin=subprocess.PIPE,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
@ -89,12 +94,14 @@ class IrAttachment(models.Model):
recs = self.with_context(ocr_force=True).search(domain, limit=limit) recs = self.with_context(ocr_force=True).search(domain, limit=limit)
recs.perform_ocr() recs.perform_ocr()
def perform_ocr(self, tesseract_lang=None):
    """Run forced OCR indexation on each record and store the result.

    Records without ``datas`` get an empty ``index_content``. The other
    records are base64-decoded and handed to ``_index`` with ``ocr_force``
    set in the context; the returned text is written to ``index_content``.

    :param tesseract_lang: optional language forwarded to ``_index`` through
        the ``ocr_lang`` context key. A falsy value leaves the key unset.
    """
    # The extra context does not depend on the record: build it once.
    ctx = {"ocr_force": True}
    if tesseract_lang:
        ctx["ocr_lang"] = tesseract_lang
    for rec in self:
        if not rec.datas:
            index_content = ""  # the _MARKER_PHRASE should be removed
        else:
            bin_data = base64.b64decode(rec.datas)
            index_content = rec.with_context(**ctx)._index(bin_data, rec.mimetype)
        rec.write({"index_content": index_content})

View File

@ -63,3 +63,11 @@ class TestDocumentOcr(TransactionCase):
self.assertEqual(attachment.index_content, _MARKER_PHRASE) self.assertEqual(attachment.index_content, _MARKER_PHRASE)
attachment._ocr_cron() attachment._ocr_cron()
self.assertEqual(attachment.index_content.strip(), result_string) self.assertEqual(attachment.index_content.strip(), result_string)
def test_document_ocr_lang(self):
    """Indexing with an ``ocr_lang`` context key yields the expected text."""
    # Set the module's configuration parameter to "True" before indexing.
    config_params = self.env["ir.config_parameter"]
    config_params.set_param(ir_config_parameter_key, "True")
    pdf_bytes = _get_image_data("pdf")
    # Pass the tesseract language through the context rather than an argument.
    attachment_model = self.env["ir.attachment"].with_context(ocr_lang="eng")
    indexed_text = attachment_model._index(pdf_bytes, "application/pdf")
    self.assertEqual(indexed_text.strip(), result_string)