From 73cff8709593e1f4fac5c20a2fed1add80d50a57 Mon Sep 17 00:00:00 2001
From: len <nans.lefebvre@gmail.com>
Date: Thu, 7 Sep 2023 11:47:51 +0200
Subject: [PATCH] [IMP] attachment_indexation_ocr: option to pass tesseract
 lang in context

---
 attachment_indexation_ocr/models/ir_attachment.py    | 11 +++++++++--
 attachment_indexation_ocr/tests/test_document_ocr.py |  8 ++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/attachment_indexation_ocr/models/ir_attachment.py b/attachment_indexation_ocr/models/ir_attachment.py
index 03f0613b..ad5d85ab 100644
--- a/attachment_indexation_ocr/models/ir_attachment.py
+++ b/attachment_indexation_ocr/models/ir_attachment.py
@@ -59,8 +59,13 @@ class IrAttachment(models.Model):
             except IOError:
                 _logger.exception("Failed to OCR image")
                 return None
+        tesseract_command = ["tesseract", "stdin", "stdout"]
+        if self.env.context.get("ocr_lang"):
+            # The language is not validated here: its tessdata must be installed,
+            # i.e. it must appear in the output of `tesseract --list-langs`.
+            tesseract_command += ["-l", self.env.context["ocr_lang"]]
         process = subprocess.Popen(
-            ["tesseract", "stdin", "stdout"],
+            tesseract_command,
             stdin=subprocess.PIPE,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
@@ -89,12 +94,14 @@ class IrAttachment(models.Model):
         recs = self.with_context(ocr_force=True).search(domain, limit=limit)
         recs.perform_ocr()
 
-    def perform_ocr(self):
+    def perform_ocr(self, tesseract_lang=None):
         for rec in self:
             if not rec.datas:
                 index_content = ""  # the _MARKER_PHRASE should be removed
             else:
                 bin_data = base64.b64decode(rec.datas)
                 ctx = {"ocr_force": True}
+                if tesseract_lang:
+                    ctx["ocr_lang"] = tesseract_lang
                 index_content = rec.with_context(**ctx)._index(bin_data, rec.mimetype)
             rec.write({"index_content": index_content})
diff --git a/attachment_indexation_ocr/tests/test_document_ocr.py b/attachment_indexation_ocr/tests/test_document_ocr.py
index 70df3d0e..40619c51 100644
--- a/attachment_indexation_ocr/tests/test_document_ocr.py
+++ b/attachment_indexation_ocr/tests/test_document_ocr.py
@@ -63,3 +63,11 @@ class TestDocumentOcr(TransactionCase):
         self.assertEqual(attachment.index_content, _MARKER_PHRASE)
         attachment._ocr_cron()
         self.assertEqual(attachment.index_content.strip(), result_string)
+
+    def test_document_ocr_lang(self):
+        """An ocr_lang context key selects the tesseract language used for OCR"""
+        self.env["ir.config_parameter"].set_param(ir_config_parameter_key, "True")
+        bin_data = _get_image_data("pdf")
+        with_lang = self.env["ir.attachment"].with_context(ocr_lang="eng")
+        result = with_lang._index(bin_data, "application/pdf")
+        self.assertEqual(result.strip(), result_string)