From 73cff8709593e1f4fac5c20a2fed1add80d50a57 Mon Sep 17 00:00:00 2001 From: len Date: Thu, 7 Sep 2023 11:47:51 +0200 Subject: [PATCH] [IMP] attachment_indexation_ocr: option to pass tesseract lang in context --- attachment_indexation_ocr/models/ir_attachment.py | 11 +++++++++-- attachment_indexation_ocr/tests/test_document_ocr.py | 8 ++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/attachment_indexation_ocr/models/ir_attachment.py b/attachment_indexation_ocr/models/ir_attachment.py index 03f0613b..ad5d85ab 100644 --- a/attachment_indexation_ocr/models/ir_attachment.py +++ b/attachment_indexation_ocr/models/ir_attachment.py @@ -59,8 +59,13 @@ class IrAttachment(models.Model): except IOError: _logger.exception("Failed to OCR image") return None + tesseract_command = ["tesseract", "stdin", "stdout"] + if self.env.context.get("ocr_lang"): + # no check that this lang has been correctly installed; + # the corresponding tessdata should be listed by `tesseract --list-langs` + tesseract_command += ["-l", self.env.context["ocr_lang"]] process = subprocess.Popen( - ["tesseract", "stdin", "stdout"], + tesseract_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -89,12 +94,14 @@ class IrAttachment(models.Model): recs = self.with_context(ocr_force=True).search(domain, limit=limit) recs.perform_ocr() - def perform_ocr(self): + def perform_ocr(self, tesseract_lang=None): for rec in self: if not rec.datas: index_content = "" # the _MARKER_PHRASE should be removed else: bin_data = base64.b64decode(rec.datas) ctx = {"ocr_force": True} + if tesseract_lang: + ctx["ocr_lang"] = tesseract_lang index_content = rec.with_context(**ctx)._index(bin_data, rec.mimetype) rec.write({"index_content": index_content}) diff --git a/attachment_indexation_ocr/tests/test_document_ocr.py b/attachment_indexation_ocr/tests/test_document_ocr.py index 70df3d0e..40619c51 100644 --- a/attachment_indexation_ocr/tests/test_document_ocr.py +++ b/attachment_indexation_ocr/tests/test_document_ocr.py @@ -63,3 +63,11 @@ class TestDocumentOcr(TransactionCase): self.assertEqual(attachment.index_content, _MARKER_PHRASE) attachment._ocr_cron() self.assertEqual(attachment.index_content.strip(), result_string) + + def test_document_ocr_lang(self): + """We can pass an ocr_lang context key to help text detection""" + self.env["ir.config_parameter"].set_param(ir_config_parameter_key, "True") + bin_data = _get_image_data("pdf") + with_lang = self.env["ir.attachment"].with_context(ocr_lang="eng") + result = with_lang._index(bin_data, "application/pdf") + self.assertEqual(result.strip(), result_string)