mirror of
https://github.com/OCA/knowledge.git
synced 2025-07-27 19:08:42 -06:00
[IMP] attachment_indexation_ocr: option to pass tesseract lang in context
This commit is contained in:
parent
5edbe1685b
commit
73cff87095
@@ -59,8 +59,13 @@ class IrAttachment(models.Model):
|
|||||||
except IOError:
|
except IOError:
|
||||||
_logger.exception("Failed to OCR image")
|
_logger.exception("Failed to OCR image")
|
||||||
return None
|
return None
|
||||||
|
tesseract_command = ["tesseract", "stdin", "stdout"]
|
||||||
|
if self.env.context.get("ocr_lang"):
|
||||||
|
# no check that this lang has been correctly installed;
|
||||||
|
# the corresponding tessdata should be listed by `tesseract --list-langs`
|
||||||
|
tesseract_command += ["-l", self.env.context["ocr_lang"]]
|
||||||
process = subprocess.Popen(
|
process = subprocess.Popen(
|
||||||
["tesseract", "stdin", "stdout"],
|
tesseract_command,
|
||||||
stdin=subprocess.PIPE,
|
stdin=subprocess.PIPE,
|
||||||
stdout=subprocess.PIPE,
|
stdout=subprocess.PIPE,
|
||||||
stderr=subprocess.PIPE,
|
stderr=subprocess.PIPE,
|
||||||
@@ -89,12 +94,14 @@ class IrAttachment(models.Model):
|
|||||||
recs = self.with_context(ocr_force=True).search(domain, limit=limit)
|
recs = self.with_context(ocr_force=True).search(domain, limit=limit)
|
||||||
recs.perform_ocr()
|
recs.perform_ocr()
|
||||||
|
|
||||||
def perform_ocr(self):
|
def perform_ocr(self, tesseract_lang=None):
|
||||||
for rec in self:
|
for rec in self:
|
||||||
if not rec.datas:
|
if not rec.datas:
|
||||||
index_content = "" # the _MARKER_PHRASE should be removed
|
index_content = "" # the _MARKER_PHRASE should be removed
|
||||||
else:
|
else:
|
||||||
bin_data = base64.b64decode(rec.datas)
|
bin_data = base64.b64decode(rec.datas)
|
||||||
ctx = {"ocr_force": True}
|
ctx = {"ocr_force": True}
|
||||||
|
if tesseract_lang:
|
||||||
|
ctx["ocr_lang"] = tesseract_lang
|
||||||
index_content = rec.with_context(**ctx)._index(bin_data, rec.mimetype)
|
index_content = rec.with_context(**ctx)._index(bin_data, rec.mimetype)
|
||||||
rec.write({"index_content": index_content})
|
rec.write({"index_content": index_content})
|
||||||
|
@@ -63,3 +63,11 @@ class TestDocumentOcr(TransactionCase):
|
|||||||
self.assertEqual(attachment.index_content, _MARKER_PHRASE)
|
self.assertEqual(attachment.index_content, _MARKER_PHRASE)
|
||||||
attachment._ocr_cron()
|
attachment._ocr_cron()
|
||||||
self.assertEqual(attachment.index_content.strip(), result_string)
|
self.assertEqual(attachment.index_content.strip(), result_string)
|
||||||
|
|
||||||
|
def test_document_ocr_lang(self):
|
||||||
|
"""We can pass an ocr_lang context key to help text detection"""
|
||||||
|
self.env["ir.config_parameter"].set_param(ir_config_parameter_key, "True")
|
||||||
|
bin_data = _get_image_data("pdf")
|
||||||
|
with_lang = self.env["ir.attachment"].with_context(ocr_lang="eng")
|
||||||
|
result = with_lang._index(bin_data, "application/pdf")
|
||||||
|
self.assertEqual(result.strip(), result_string)
|
||||||
|
Loading…
Reference in New Issue
Block a user