[MIG] document_ocr -> attachment_indexation_ocr

2025-07-27 19:08:42 -06:00 · 2023-09-07 10:34:42 +02:00 · 2023-09-07 10:34:42 +02:00 · 5edbe1685b
commit 5edbe1685b
parent f1f13f1e8b
22 changed files with 755 additions and 263 deletions
--- a/attachment_indexation_ocr/README.rst
+++ b/attachment_indexation_ocr/README.rst
@ -0,0 +1,107 @@
+=================
+OCR for documents
+=================
+
+.. 
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   !! This file is generated by oca-gen-addon-readme !!
+   !! changes will be overwritten.                   !!
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   !! source digest: sha256:488ceb3b031015c08770a769f1357f5dcd462d28eaca37048790a61ef9a5feab
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+.. |badge1| image:: https://img.shields.io/badge/maturity-Beta-yellow.png
+    :target: https://odoo-community.org/page/development-status
+    :alt: Beta
+.. |badge2| image:: https://img.shields.io/badge/licence-AGPL--3-blue.png
+    :target: http://www.gnu.org/licenses/agpl-3.0-standalone.html
+    :alt: License: AGPL-3
+.. |badge3| image:: https://img.shields.io/badge/github-OCA%2Fknowledge-lightgray.png?logo=github
+    :target: https://github.com/OCA/knowledge/tree/16.0/attachment_indexation_ocr
+    :alt: OCA/knowledge
+.. |badge4| image:: https://img.shields.io/badge/weblate-Translate%20me-F47D42.png
+    :target: https://translation.odoo-community.org/projects/knowledge-16-0/knowledge-16-0-attachment_indexation_ocr
+    :alt: Translate me on Weblate
+.. |badge5| image:: https://img.shields.io/badge/runboat-Try%20me-875A7B.png
+    :target: https://runboat.odoo-community.org/builds?repo=OCA/knowledge&target_branch=16.0
+    :alt: Try me on Runboat
+
+|badge1| |badge2| |badge3| |badge4| |badge5|
+
+This module was written to make uploaded documents, for example scans, searchable by running OCR on them.
+
+It supports all image formats `Pillow supports <http://pillow.readthedocs.io/en/3.2.x/handbook/image-file-formats.html>`_ for reading and PDFs.
+
+**Table of contents**
+
+.. contents::
+   :local:
+
+Installation
+============
+
+To install this module, you need to:
+
+#. install tesseract and the language(s) your documents use
+#. if you want to support OCR on PDFs, install imagemagick
+#. install the module itself
+
+On a Debian or Ubuntu system you would typically run::
+
+    $ sudo apt-get install tesseract-ocr imagemagick
+
+Configuration
+=============
+
+To configure this module, go to:
+
+#. Settings/Technical/Parameters/System parameters and review the parameters with names ocr.*
+
+Usage
+=====
+
+By default, character recognition is done asynchronously by a cronjob at night.
+This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish.
+The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ``Settings``.
+In case you want to force the OCR to be done immediately, set configuration parameter ``ocr.synchronous`` to value ``True``.
+
+Bug Tracker
+===========
+
+Bugs are tracked on `GitHub Issues <https://github.com/OCA/knowledge/issues>`_.
+In case of trouble, please check there if your issue has already been reported.
+If you spotted it first, help us to smash it by providing a detailed and welcomed
+`feedback <https://github.com/OCA/knowledge/issues/new?body=module:%20attachment_indexation_ocr%0Aversion:%2016.0%0A%0A**Steps%20to%20reproduce**%0A-%20...%0A%0A**Current%20behavior**%0A%0A**Expected%20behavior**>`_.
+
+Do not contact contributors directly about support or help with technical issues.
+
+Credits
+=======
+
+Authors
+~~~~~~~
+
+* Therp BV
+
+Contributors
+~~~~~~~~~~~~
+
+* Holger Brunn <hbrunn@therp.nl>
+* len-foss <nans.lefebvre@gmail.com>
+
+Maintainers
+~~~~~~~~~~~
+
+This module is maintained by the OCA.
+
+.. image:: https://odoo-community.org/logo.png
+   :alt: Odoo Community Association
+   :target: https://odoo-community.org
+
+OCA, or the Odoo Community Association, is a nonprofit organization whose
+mission is to support the collaborative development of Odoo features and
+promote its widespread use.
+
+This module is part of the `OCA/knowledge <https://github.com/OCA/knowledge/tree/16.0/attachment_indexation_ocr>`_ project on GitHub.
+
+You are welcome to contribute. To learn how please visit https://odoo-community.org/page/Contribute.
--- a/attachment_indexation_ocr/init.py
+++ b/attachment_indexation_ocr/init.py
--- a/attachment_indexation_ocr/manifest.py
+++ b/attachment_indexation_ocr/manifest.py
@ -5,18 +5,13 @@
    "version": "16.0.1.0.0",
    "author": "Therp BV,Odoo Community Association (OCA)",
    "license": "AGPL-3",
+    "website": "https://github.com/OCA/knowledge",
    "category": "Knowledge Management",
    "summary": "Run character recognition on uploaded files",
-    "depends": [
-        "document",
-    ],
+    "depends": ["attachment_indexation"],
    "data": [
        "data/ir_cron.xml",
        "data/ir_config_parameter.xml",
    ],
-    "external_dependencies": {
-        "bin": [
-            "tesseract",
-        ],
-    },
+    "external_dependencies": {"bin": ["tesseract"]},
 }
--- a/attachment_indexation_ocr/data/ir_config_parameter.xml
+++ b/attachment_indexation_ocr/data/ir_config_parameter.xml
@ -1,11 +1,11 @@
 <?xml version="1.0" encoding="UTF-8" ?>
 <odoo noupdate="1">
        <record id="param_synchronous" model="ir.config_parameter">
-            <field name="key">document_ocr.synchronous</field>
+            <field name="key">ocr.synchronous</field>
            <field name="value">False</field>
        </record>
        <record id="param_dpi" model="ir.config_parameter">
-            <field name="key">document_ocr.dpi</field>
+            <field name="key">ocr.dpi</field>
            <field name="value">300</field>
        </record>
 </odoo>
--- a/attachment_indexation_ocr/data/ir_cron.xml
+++ b/attachment_indexation_ocr/data/ir_cron.xml
@ -4,9 +4,9 @@
            <field name="name">Run OCR on uploaded documents</field>
            <field name="interval_type">days</field>
            <field name="interval_number">1</field>
-            <field name="model">ir.attachment</field>
-            <field name="function">_ocr_cron</field>
+            <field name="model_id" ref="model_ir_attachment" />
+            <field name="state">code</field>
+            <field name="code">model._ocr_cron(limit=100)</field>
            <field name="numbercall">-1</field>
-            <field name="args">(100,)</field>
        </record>
 </odoo>
--- a/attachment_indexation_ocr/models/init.py
+++ b/attachment_indexation_ocr/models/init.py
--- a/attachment_indexation_ocr/models/ir_attachment.py
+++ b/attachment_indexation_ocr/models/ir_attachment.py
@ -0,0 +1,100 @@
+# © 2016 Therp BV <http://therp.nl>
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+import base64
+import logging
+import subprocess
+from io import BytesIO
+
+from PIL import Image
+
+from odoo import api, models
+
+_logger = logging.getLogger(__name__)
+_MARKER_PHRASE = "[[waiting for OCR]]"
+
+
+class IrAttachment(models.Model):
+    _inherit = "ir.attachment"
+
+    @api.model
+    def _get_no_content_strings(self):
+        return ["image", "application"]
+
+    @api.model
+    def _not_content(self, text):
+        return not text or text in self._get_no_content_strings()
+
+    @api.model
+    def _index(self, bin_data, file_type, checksum=None):
+        content = super()._index(bin_data, file_type, checksum)
+        if bin_data and file_type and self._not_content(content):
+            synchronous = self.env["ir.config_parameter"].get_param("ocr.synchronous")
+            if synchronous == "True" or self.env.context.get("ocr_force"):
+                content = self._index_ocr(bin_data, file_type)
+            else:
+                content = _MARKER_PHRASE
+        return content
+
+    @api.model
+    def _index_ocr(self, bin_data, file_type, dpi=0):
+        if not dpi:
+            icp = self.env["ir.config_parameter"]
+            dpi = int(icp.get_param("ocr.dpi", "500"))
+        if "/" not in file_type:
+            _logger.warning("Invalid mimetype %s", file_type)
+            return None
+        top_type, sub_type = file_type.split("/", 1)
+        if sub_type == "pdf":
+            # tesseract only supports image of at most 32K pixels
+            # depending on the number of pages, we have to either split
+            # into different batches or reduce the dpi;
+            # The maximum width and height are 32767.
+            image_data = self._index_ocr_get_data_pdf(bin_data, dpi)  # TODO
+        else:
+            image_data = BytesIO()
+            try:
+                i = Image.open(BytesIO(bin_data))
+                i.save(image_data, "png", dpi=(dpi, dpi))
+            except IOError:
+                _logger.exception("Failed to OCR image")
+                return None
+        process = subprocess.Popen(
+            ["tesseract", "stdin", "stdout"],
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        stdout, stderr = process.communicate(image_data.getvalue())
+        if process.returncode:
+            _logger.error("Error during OCR: %s", stderr)
+        return stdout.decode("utf-8")
+
+    @api.model
+    def _index_ocr_get_data_pdf(self, bin_data, dpi):
+        process = subprocess.Popen(
+            ["convert", "-density", str(dpi), "-", "-append", "png32:-"],
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        stdout, stderr = process.communicate(bin_data)
+        if stderr:
+            _logger.error("Error converting to PDF: %s", stderr)
+        return BytesIO(stdout)
+
+    @api.model
+    def _ocr_cron(self, limit=None):
+        domain = [("index_content", "=", _MARKER_PHRASE)]
+        recs = self.with_context(ocr_force=True).search(domain, limit=limit)
+        recs.perform_ocr()
+
+    def perform_ocr(self):
+        for rec in self:
+            if not rec.datas:
+                index_content = ""  # the _MARKER_PHRASE should be removed
+            else:
+                bin_data = base64.b64decode(rec.datas)
+                ctx = {"ocr_force": True}
+                index_content = rec.with_context(**ctx)._index(bin_data, rec.mimetype)
+            rec.write({"index_content": index_content})
--- a/attachment_indexation_ocr/readme/CONFIGURE.rst
+++ b/attachment_indexation_ocr/readme/CONFIGURE.rst
@ -0,0 +1,3 @@
+To configure this module, go to:
+
+#. Settings/Technical/Parameters/System parameters and review the parameters with names ocr.*
--- a/attachment_indexation_ocr/readme/CONTRIBUTORS.rst
+++ b/attachment_indexation_ocr/readme/CONTRIBUTORS.rst
@ -0,0 +1,2 @@
+* Holger Brunn <hbrunn@therp.nl>
+* len-foss <nans.lefebvre@gmail.com>
--- a/attachment_indexation_ocr/readme/DESCRIPTION.rst
+++ b/attachment_indexation_ocr/readme/DESCRIPTION.rst
@ -0,0 +1,3 @@
+This module was written to make uploaded documents, for example scans, searchable by running OCR on them.
+
+It supports all image formats `Pillow supports <http://pillow.readthedocs.io/en/3.2.x/handbook/image-file-formats.html>`_ for reading and PDFs.
--- a/attachment_indexation_ocr/readme/INSTALL.rst
+++ b/attachment_indexation_ocr/readme/INSTALL.rst
@ -0,0 +1,9 @@
+To install this module, you need to:
+
+#. install tesseract and the language(s) your documents use
+#. if you want to support OCR on PDFs, install imagemagick
+#. install the module itself
+
+On a Debian or Ubuntu system you would typically run::
+
+    $ sudo apt-get install tesseract-ocr imagemagick
--- a/attachment_indexation_ocr/readme/USAGE.rst
+++ b/attachment_indexation_ocr/readme/USAGE.rst
@ -0,0 +1,4 @@
+By default, character recognition is done asynchronously by a cronjob at night.
+This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish.
+The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ``Settings``.
+In case you want to force the OCR to be done immediately, set configuration parameter ``ocr.synchronous`` to value ``True``.
--- a/attachment_indexation_ocr/static/description/icon.png
+++ b/attachment_indexation_ocr/static/description/icon.png
--- a/attachment_indexation_ocr/static/description/index.html
+++ b/attachment_indexation_ocr/static/description/index.html
@ -0,0 +1,453 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="Docutils: http://docutils.sourceforge.net/" />
+<title>OCR for documents</title>
+<style type="text/css">
+
+/*
+:Author: David Goodger (goodger@python.org)
+:Id: $Id: html4css1.css 7952 2016-07-26 18:15:59Z milde $
+:Copyright: This stylesheet has been placed in the public domain.
+
+Default cascading style sheet for the HTML output of Docutils.
+
+See http://docutils.sf.net/docs/howto/html-stylesheets.html for how to
+customize this style sheet.
+*/
+
+/* used to remove borders from tables and images */
+.borderless, table.borderless td, table.borderless th {
+  border: 0 }
+
+table.borderless td, table.borderless th {
+  /* Override padding for "table.docutils td" with "! important".
+     The right padding separates the table cells. */
+  padding: 0 0.5em 0 0 ! important }
+
+.first {
+  /* Override more specific margin styles with "! important". */
+  margin-top: 0 ! important }
+
+.last, .with-subtitle {
+  margin-bottom: 0 ! important }
+
+.hidden {
+  display: none }
+
+.subscript {
+  vertical-align: sub;
+  font-size: smaller }
+
+.superscript {
+  vertical-align: super;
+  font-size: smaller }
+
+a.toc-backref {
+  text-decoration: none ;
+  color: black }
+
+blockquote.epigraph {
+  margin: 2em 5em ; }
+
+dl.docutils dd {
+  margin-bottom: 0.5em }
+
+object[type="image/svg+xml"], object[type="application/x-shockwave-flash"] {
+  overflow: hidden;
+}
+
+/* Uncomment (and remove this text!) to get bold-faced definition list terms
+dl.docutils dt {
+  font-weight: bold }
+*/
+
+div.abstract {
+  margin: 2em 5em }
+
+div.abstract p.topic-title {
+  font-weight: bold ;
+  text-align: center }
+
+div.admonition, div.attention, div.caution, div.danger, div.error,
+div.hint, div.important, div.note, div.tip, div.warning {
+  margin: 2em ;
+  border: medium outset ;
+  padding: 1em }
+
+div.admonition p.admonition-title, div.hint p.admonition-title,
+div.important p.admonition-title, div.note p.admonition-title,
+div.tip p.admonition-title {
+  font-weight: bold ;
+  font-family: sans-serif }
+
+div.attention p.admonition-title, div.caution p.admonition-title,
+div.danger p.admonition-title, div.error p.admonition-title,
+div.warning p.admonition-title, .code .error {
+  color: red ;
+  font-weight: bold ;
+  font-family: sans-serif }
+
+/* Uncomment (and remove this text!) to get reduced vertical space in
+   compound paragraphs.
+div.compound .compound-first, div.compound .compound-middle {
+  margin-bottom: 0.5em }
+
+div.compound .compound-last, div.compound .compound-middle {
+  margin-top: 0.5em }
+*/
+
+div.dedication {
+  margin: 2em 5em ;
+  text-align: center ;
+  font-style: italic }
+
+div.dedication p.topic-title {
+  font-weight: bold ;
+  font-style: normal }
+
+div.figure {
+  margin-left: 2em ;
+  margin-right: 2em }
+
+div.footer, div.header {
+  clear: both;
+  font-size: smaller }
+
+div.line-block {
+  display: block ;
+  margin-top: 1em ;
+  margin-bottom: 1em }
+
+div.line-block div.line-block {
+  margin-top: 0 ;
+  margin-bottom: 0 ;
+  margin-left: 1.5em }
+
+div.sidebar {
+  margin: 0 0 0.5em 1em ;
+  border: medium outset ;
+  padding: 1em ;
+  background-color: #ffffee ;
+  width: 40% ;
+  float: right ;
+  clear: right }
+
+div.sidebar p.rubric {
+  font-family: sans-serif ;
+  font-size: medium }
+
+div.system-messages {
+  margin: 5em }
+
+div.system-messages h1 {
+  color: red }
+
+div.system-message {
+  border: medium outset ;
+  padding: 1em }
+
+div.system-message p.system-message-title {
+  color: red ;
+  font-weight: bold }
+
+div.topic {
+  margin: 2em }
+
+h1.section-subtitle, h2.section-subtitle, h3.section-subtitle,
+h4.section-subtitle, h5.section-subtitle, h6.section-subtitle {
+  margin-top: 0.4em }
+
+h1.title {
+  text-align: center }
+
+h2.subtitle {
+  text-align: center }
+
+hr.docutils {
+  width: 75% }
+
+img.align-left, .figure.align-left, object.align-left, table.align-left {
+  clear: left ;
+  float: left ;
+  margin-right: 1em }
+
+img.align-right, .figure.align-right, object.align-right, table.align-right {
+  clear: right ;
+  float: right ;
+  margin-left: 1em }
+
+img.align-center, .figure.align-center, object.align-center {
+  display: block;
+  margin-left: auto;
+  margin-right: auto;
+}
+
+table.align-center {
+  margin-left: auto;
+  margin-right: auto;
+}
+
+.align-left {
+  text-align: left }
+
+.align-center {
+  clear: both ;
+  text-align: center }
+
+.align-right {
+  text-align: right }
+
+/* reset inner alignment in figures */
+div.align-right {
+  text-align: inherit }
+
+/* div.align-center * { */
+/*   text-align: left } */
+
+.align-top    {
+  vertical-align: top }
+
+.align-middle {
+  vertical-align: middle }
+
+.align-bottom {
+  vertical-align: bottom }
+
+ol.simple, ul.simple {
+  margin-bottom: 1em }
+
+ol.arabic {
+  list-style: decimal }
+
+ol.loweralpha {
+  list-style: lower-alpha }
+
+ol.upperalpha {
+  list-style: upper-alpha }
+
+ol.lowerroman {
+  list-style: lower-roman }
+
+ol.upperroman {
+  list-style: upper-roman }
+
+p.attribution {
+  text-align: right ;
+  margin-left: 50% }
+
+p.caption {
+  font-style: italic }
+
+p.credits {
+  font-style: italic ;
+  font-size: smaller }
+
+p.label {
+  white-space: nowrap }
+
+p.rubric {
+  font-weight: bold ;
+  font-size: larger ;
+  color: maroon ;
+  text-align: center }
+
+p.sidebar-title {
+  font-family: sans-serif ;
+  font-weight: bold ;
+  font-size: larger }
+
+p.sidebar-subtitle {
+  font-family: sans-serif ;
+  font-weight: bold }
+
+p.topic-title {
+  font-weight: bold }
+
+pre.address {
+  margin-bottom: 0 ;
+  margin-top: 0 ;
+  font: inherit }
+
+pre.literal-block, pre.doctest-block, pre.math, pre.code {
+  margin-left: 2em ;
+  margin-right: 2em }
+
+pre.code .ln { color: grey; } /* line numbers */
+pre.code, code { background-color: #eeeeee }
+pre.code .comment, code .comment { color: #5C6576 }
+pre.code .keyword, code .keyword { color: #3B0D06; font-weight: bold }
+pre.code .literal.string, code .literal.string { color: #0C5404 }
+pre.code .name.builtin, code .name.builtin { color: #352B84 }
+pre.code .deleted, code .deleted { background-color: #DEB0A1}
+pre.code .inserted, code .inserted { background-color: #A3D289}
+
+span.classifier {
+  font-family: sans-serif ;
+  font-style: oblique }
+
+span.classifier-delimiter {
+  font-family: sans-serif ;
+  font-weight: bold }
+
+span.interpreted {
+  font-family: sans-serif }
+
+span.option {
+  white-space: nowrap }
+
+span.pre {
+  white-space: pre }
+
+span.problematic {
+  color: red }
+
+span.section-subtitle {
+  /* font-size relative to parent (h1..h6 element) */
+  font-size: 80% }
+
+table.citation {
+  border-left: solid 1px gray;
+  margin-left: 1px }
+
+table.docinfo {
+  margin: 2em 4em }
+
+table.docutils {
+  margin-top: 0.5em ;
+  margin-bottom: 0.5em }
+
+table.footnote {
+  border-left: solid 1px black;
+  margin-left: 1px }
+
+table.docutils td, table.docutils th,
+table.docinfo td, table.docinfo th {
+  padding-left: 0.5em ;
+  padding-right: 0.5em ;
+  vertical-align: top }
+
+table.docutils th.field-name, table.docinfo th.docinfo-name {
+  font-weight: bold ;
+  text-align: left ;
+  white-space: nowrap ;
+  padding-left: 0 }
+
+/* "booktabs" style (no vertical lines) */
+table.docutils.booktabs {
+  border: 0px;
+  border-top: 2px solid;
+  border-bottom: 2px solid;
+  border-collapse: collapse;
+}
+table.docutils.booktabs * {
+  border: 0px;
+}
+table.docutils.booktabs th {
+  border-bottom: thin solid;
+  text-align: left;
+}
+
+h1 tt.docutils, h2 tt.docutils, h3 tt.docutils,
+h4 tt.docutils, h5 tt.docutils, h6 tt.docutils {
+  font-size: 100% }
+
+ul.auto-toc {
+  list-style-type: none }
+
+</style>
+</head>
+<body>
+<div class="document" id="ocr-for-documents">
+<h1 class="title">OCR for documents</h1>
+
+<!-- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+!! This file is generated by oca-gen-addon-readme !!
+!! changes will be overwritten.                   !!
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+!! source digest: sha256:488ceb3b031015c08770a769f1357f5dcd462d28eaca37048790a61ef9a5feab
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -->
+<p><a class="reference external" href="https://odoo-community.org/page/development-status"><img alt="Beta" src="https://img.shields.io/badge/maturity-Beta-yellow.png" /></a> <a class="reference external" href="http://www.gnu.org/licenses/agpl-3.0-standalone.html"><img alt="License: AGPL-3" src="https://img.shields.io/badge/licence-AGPL--3-blue.png" /></a> <a class="reference external" href="https://github.com/OCA/knowledge/tree/16.0/attachment_indexation_ocr"><img alt="OCA/knowledge" src="https://img.shields.io/badge/github-OCA%2Fknowledge-lightgray.png?logo=github" /></a> <a class="reference external" href="https://translation.odoo-community.org/projects/knowledge-16-0/knowledge-16-0-attachment_indexation_ocr"><img alt="Translate me on Weblate" src="https://img.shields.io/badge/weblate-Translate%20me-F47D42.png" /></a> <a class="reference external" href="https://runboat.odoo-community.org/builds?repo=OCA/knowledge&amp;target_branch=16.0"><img alt="Try me on Runboat" src="https://img.shields.io/badge/runboat-Try%20me-875A7B.png" /></a></p>
+<p>This module was written to make uploaded documents, for example scans, searchable by running OCR on them.</p>
+<p>It supports all image formats <a class="reference external" href="http://pillow.readthedocs.io/en/3.2.x/handbook/image-file-formats.html">Pillow supports</a> for reading and PDFs.</p>
+<p><strong>Table of contents</strong></p>
+<div class="contents local topic" id="contents">
+<ul class="simple">
+<li><a class="reference internal" href="#installation" id="id1">Installation</a></li>
+<li><a class="reference internal" href="#configuration" id="id2">Configuration</a></li>
+<li><a class="reference internal" href="#usage" id="id3">Usage</a></li>
+<li><a class="reference internal" href="#bug-tracker" id="id4">Bug Tracker</a></li>
+<li><a class="reference internal" href="#credits" id="id5">Credits</a><ul>
+<li><a class="reference internal" href="#authors" id="id6">Authors</a></li>
+<li><a class="reference internal" href="#contributors" id="id7">Contributors</a></li>
+<li><a class="reference internal" href="#maintainers" id="id8">Maintainers</a></li>
+</ul>
+</li>
+</ul>
+</div>
+<div class="section" id="installation">
+<h1><a class="toc-backref" href="#id1">Installation</a></h1>
+<p>To install this module, you need to:</p>
+<ol class="arabic simple">
+<li>install tesseract and the language(s) your documents use</li>
+<li>if you want to support OCR on PDFs, install imagemagick</li>
+<li>install the module itself</li>
+</ol>
+<p>On a Debian or Ubuntu system you would typically run:</p>
+<pre class="literal-block">
+$ sudo apt-get install tesseract-ocr imagemagick
+</pre>
+</div>
+<div class="section" id="configuration">
+<h1><a class="toc-backref" href="#id2">Configuration</a></h1>
+<p>To configure this module, go to:</p>
+<ol class="arabic simple">
+<li>Settings/Technical/Parameters/System parameters and review the parameters with names ocr.*</li>
+</ol>
+</div>
+<div class="section" id="usage">
+<h1><a class="toc-backref" href="#id3">Usage</a></h1>
+<p>By default, character recognition is done asynchronously by a cronjob at night.
+This is because the recognition process takes a while and you don’t want to make your users wait for the indexation to finish.
+The interval to run the cronjob can be adjusted to your needs in the <tt class="docutils literal">Scheduled Actions</tt> menu, under <tt class="docutils literal">Settings</tt>.
+In case you want to force the OCR to be done immediately, set configuration parameter <tt class="docutils literal">ocr.synchronous</tt> to value <tt class="docutils literal">True</tt>.</p>
+</div>
+<div class="section" id="bug-tracker">
+<h1><a class="toc-backref" href="#id4">Bug Tracker</a></h1>
+<p>Bugs are tracked on <a class="reference external" href="https://github.com/OCA/knowledge/issues">GitHub Issues</a>.
+In case of trouble, please check there if your issue has already been reported.
+If you spotted it first, help us to smash it by providing a detailed and welcomed
+<a class="reference external" href="https://github.com/OCA/knowledge/issues/new?body=module:%20attachment_indexation_ocr%0Aversion:%2016.0%0A%0A**Steps%20to%20reproduce**%0A-%20...%0A%0A**Current%20behavior**%0A%0A**Expected%20behavior**">feedback</a>.</p>
+<p>Do not contact contributors directly about support or help with technical issues.</p>
+</div>
+<div class="section" id="credits">
+<h1><a class="toc-backref" href="#id5">Credits</a></h1>
+<div class="section" id="authors">
+<h2><a class="toc-backref" href="#id6">Authors</a></h2>
+<ul class="simple">
+<li>Therp BV</li>
+</ul>
+</div>
+<div class="section" id="contributors">
+<h2><a class="toc-backref" href="#id7">Contributors</a></h2>
+<ul class="simple">
+<li>Holger Brunn &lt;<a class="reference external" href="mailto:hbrunn&#64;therp.nl">hbrunn&#64;therp.nl</a>&gt;</li>
+<li>len-foss &lt;<a class="reference external" href="mailto:nans.lefebvre&#64;gmail.com">nans.lefebvre&#64;gmail.com</a>&gt;</li>
+</ul>
+</div>
+<div class="section" id="maintainers">
+<h2><a class="toc-backref" href="#id8">Maintainers</a></h2>
+<p>This module is maintained by the OCA.</p>
+<a class="reference external image-reference" href="https://odoo-community.org"><img alt="Odoo Community Association" src="https://odoo-community.org/logo.png" /></a>
+<p>OCA, or the Odoo Community Association, is a nonprofit organization whose
+mission is to support the collaborative development of Odoo features and
+promote its widespread use.</p>
+<p>This module is part of the <a class="reference external" href="https://github.com/OCA/knowledge/tree/16.0/attachment_indexation_ocr">OCA/knowledge</a> project on GitHub.</p>
+<p>You are welcome to contribute. To learn how please visit <a class="reference external" href="https://odoo-community.org/page/Contribute">https://odoo-community.org/page/Contribute</a>.</p>
+</div>
+</div>
+</div>
+</body>
+</html>
--- a/attachment_indexation_ocr/tests/init.py
+++ b/attachment_indexation_ocr/tests/init.py
--- a/attachment_indexation_ocr/tests/test_document_ocr.py
+++ b/attachment_indexation_ocr/tests/test_document_ocr.py
@ -0,0 +1,65 @@
+# © 2016 Therp BV <http://therp.nl>
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+import base64
+import subprocess
+from io import BytesIO
+
+from PIL import Image, ImageDraw, ImageFont
+
+from odoo.tests.common import TransactionCase
+
+from ..models.ir_attachment import _MARKER_PHRASE
+
+
+def _get_some_system_font():
+    """Get a font that is available on the system"""
+    output = subprocess.check_output(["fc-list"])
+    for line in output.splitlines():
+        line = line.decode("utf-8")
+        if "otf" in line.lower() and "roman" in line.lower():
+            return line.split(":")[0]
+    raise RuntimeError("No suitable font found!")
+
+
+font_path = _get_some_system_font()
+ir_config_parameter_key = "ocr.synchronous"
+result_string = "Hello world"
+
+
+def _get_image_data(frmt="png"):
+    test_image = Image.new("RGB", (200, 30))
+    draw = ImageDraw.Draw(test_image)
+    draw.text((3, 3), result_string, font=ImageFont.truetype(font_path, 24))
+    data = BytesIO()
+    test_image.save(data, frmt)
+    return data.getvalue()
+
+
+class TestDocumentOcr(TransactionCase):
+    def test_document_ocr_png(self):
+        self.env["ir.config_parameter"].set_param(ir_config_parameter_key, "True")
+        bin_data = _get_image_data("png")
+        result = self.env["ir.attachment"]._index(bin_data, "image/png")
+        self.assertEqual(result.strip(), result_string)
+
+    def test_document_ocr_ppm(self):
+        """It works on images that don't have a specific mimetype"""
+        self.env["ir.config_parameter"].set_param(ir_config_parameter_key, "True")
+        bin_data = _get_image_data("ppm")
+        result = self.env["ir.attachment"]._index(bin_data, "application/octet-stream")
+        self.assertEqual(result.strip(), result_string)
+
+    def test_document_ocr_pdf(self):
+        self.env["ir.config_parameter"].set_param(ir_config_parameter_key, "True")
+        bin_data = _get_image_data("pdf")
+        result = self.env["ir.attachment"]._index(bin_data, "application/pdf")
+        self.assertEqual(result.strip(), result_string)
+
+    def test_document_ocr_cron(self):
+        self.env["ir.config_parameter"].set_param(ir_config_parameter_key, "False")
+        bin_data = _get_image_data("png")
+        vals = {"name": "testattachment", "datas": base64.b64encode(bin_data)}
+        attachment = self.env["ir.attachment"].create(vals)
+        self.assertEqual(attachment.index_content, _MARKER_PHRASE)
+        attachment._ocr_cron()
+        self.assertEqual(attachment.index_content.strip(), result_string)
--- a/document_ocr/README.rst
+++ b/document_ocr/README.rst
@ -1,86 +0,0 @@
-.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg
-    :target: http://www.gnu.org/licenses/agpl-3.0-standalone.html
-    :alt: License: AGPL-3
-
-=================
-OCR for documents
-=================
-
-This module was written to make uploaded documents, for example scans, searchable by running OCR on them.
-
-It supports all image formats `Pillow supports <http://pillow.readthedocs.io/en/3.2.x/handbook/image-file-formats.html>`_ for reading and PDFs.
-
-Installation
-============
-
-To install this module, you need to:
-
-#. install tesseract and the language(s) your documents use
-#. if you want to support OCR on PDFs, install imagemagick
-#. install the module itself
-
-On an Debian or Ubuntu system you would typically run::
-
-    $ sudo apt-get install tesseract-ocr imagemagick
-
-
-Configuration
-=============
-
-To configure this module, go to:
-
-#. Settings/Technical/Parameters/System parameters and review the parameters with names document_ocr.*
-
-Usage
-=====
-
-By default, character recognition is done asynchronously by a cronjob at night. 
-This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish.
-The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ` `Settings``.
-In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``.
-
-.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas
-    :alt: Try me on Runbot
-    :target: https://runbot.odoo-community.org/runbot/118/8.0
-
-Bug Tracker
-===========
-
-Bugs are tracked on `GitHub Issues <https://github.com/OCA/knowledge/issues>`_.
-In case of trouble, please check there if your issue has already been reported.
-If you spotted it first, help us smashing it by providing a detailed and welcomed feedback.
-
-Credits
-=======
-
-The actual work
---------------
-
-* `tesseract <https://github.com/tesseract-ocr>`_
-
-Images
------
-
-* Odoo Community Association: `Icon <https://github.com/OCA/maintainer-tools/blob/master/template/module/static/description/icon.svg>`_.
-
-Contributors
------------
-
-* Holger Brunn <hbrunn@therp.nl>  
-
-Do not contact contributors directly about help with questions or problems concerning this addon, but use the `community mailing list <mailto:community@mail.odoo.com>`_ or the `appropriate specialized mailinglist <https://odoo-community.org/groups>`_ for help, and the bug tracker linked in `Bug Tracker`_ above for technical issues.
-
-Maintainer
----------
-
-.. image:: https://odoo-community.org/logo.png
-   :alt: Odoo Community Association
-   :target: https://odoo-community.org
-
-This module is maintained by the OCA.
-
-OCA, or the Odoo Community Association, is a nonprofit organization whose
-mission is to support the collaborative development of Odoo features and
-promote its widespread use.
-
-To contribute to this module, please visit https://odoo-community.org.
--- a/document_ocr/models/ir_attachment.py
+++ b/document_ocr/models/ir_attachment.py
@ -1,98 +0,0 @@
-# © 2016 Therp BV <http://therp.nl>
-# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
-import logging
-import subprocess
-
-from PIL import Image
-from StringIO import StringIO
-
-from odoo import api, models
-
-_logger = logging.getLogger(__name__)
-_MARKER_PHRASE = "[[waiting for OCR]]"
-
-
-class IrAttachment(models.Model):  # adds OCR-based text extraction to attachment indexing
-    _inherit = "ir.attachment"
-
-    @api.model
-    def _index(self, data, datas_fname, file_type):  # returns a (mimetype, content) pair
-        mimetype, content = super(IrAttachment, self)._index(
-            data, datas_fname, file_type
-        )
-        if data and mimetype and (not content or content == "image"):  # only step in when the parent produced no content or just "image"
-            has_synchr_param = (
-                self.env["ir.config_parameter"].get_param(
-                    "document_ocr.synchronous", "False"
-                )
-                == "True"  # the parameter is compared as the literal string "True"
-            )
-            has_force_flag = self.env.context.get("document_ocr_force")  # context key set by _ocr_cron
-            if has_synchr_param or has_force_flag:
-                content = self._index_ocr(mimetype, data, datas_fname, file_type)  # run OCR right now
-            else:
-                content = _MARKER_PHRASE  # placeholder that _ocr_cron searches for later
-
-        return mimetype, content
-
-    @api.model
-    def _index_ocr(self, mimetype, data, datas_fname, file_type):  # returns tesseract's stdout, or None on failure to prepare the image
-        dpi = int(self.env["ir.config_parameter"].get_param("document_ocr.dpi", "500"))  # defaults to 500 when the parameter is unset
-        if "/" not in mimetype:
-            _logger.warning("Invalid mimetype %s", mimetype)
-            return None
-        top_type, sub_type = mimetype.split("/", 1)  # top_type is not used below
-        if hasattr(self, "_index_ocr_get_data_%s" % sub_type):  # subtype-specific converter, e.g. _index_ocr_get_data_pdf
-            image_data = getattr(self, "_index_ocr_get_data_%s" % sub_type)(
-                data, datas_fname, file_type, dpi
-            )
-        else:
-            image_data = StringIO()  # fallback: re-encode the input as PNG through Pillow
-            try:
-                Image.open(StringIO(data)).save(image_data, "png", dpi=(dpi, dpi))
-            except IOError:
-                _logger.exception("Failed to OCR image")
-                return None
-        process = subprocess.Popen(
-            ["tesseract", "stdin", "stdout"],  # image is fed on stdin, result is read from stdout
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-        )
-        stdout, stderr = process.communicate(image_data.getvalue())
-        if process.returncode:
-            _logger.error("Error during OCR: %s", stderr)
-        return stdout  # returned even when the return code was non-zero
-
-    @api.model
-    def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi):  # returns a StringIO wrapping the converter's stdout
-        process = subprocess.Popen(
-            ["convert", "-density", str(dpi), "-", "-append", "png32:-"],  # presumably ImageMagick's convert; reads stdin, writes png32 to stdout - TODO confirm
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-        )
-        stdout, stderr = process.communicate(data)
-        if stderr:  # any stderr output is logged; the return code is not checked
-            _logger.error("Error converting to PDF: %s", stderr)  # NOTE(review): the command emits PNG, so the wording looks inverted - confirm
-        return StringIO(stdout)
-
-    @api.model
-    def _ocr_cron(self, limit=0):  # runs OCR on attachments still holding the marker phrase
-        for this in self.with_context(document_ocr_force=True).search(  # the context flag makes _index run OCR regardless of the synchronous parameter
-            [
-                ("index_content", "=", _MARKER_PHRASE),
-            ],
-            limit=limit,
-        ):
-            if not this.datas:
-                continue  # nothing to OCR; the marker stays in place
-            file_type, index_content = this._index(
-                this.datas.decode("base64"), this.datas_fname, this.file_type  # NOTE(review): the "base64" str codec is Python 2 only
-            )
-            this.write(
-                {
-                    "file_type": file_type,
-                    "index_content": index_content,
-                }
-            )
--- a/document_ocr/tests/test_document_ocr.py
+++ b/document_ocr/tests/test_document_ocr.py
@ -1,65 +0,0 @@
-# © 2016 Therp BV <http://therp.nl>
-# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
-from PIL import Image, ImageDraw, ImageFont
-from StringIO import StringIO
-
-from odoo.tests.common import TransactionCase
-from odoo.tools.misc import mute_logger
-
-from ..models.ir_attachment import _MARKER_PHRASE
-
-
-class TestDocumentOcr(TransactionCase):  # exercises synchronous OCR, the cron path and the unreadable-image path
-    def test_document_ocr(self):
-        self.env["ir.config_parameter"].set_param("document_ocr.synchronous", "True")  # make _index run OCR immediately
-        test_image = Image.new("RGB", (200, 30))
-        draw = ImageDraw.Draw(test_image)
-        draw.text(
-            (3, 3),
-            "Hello world",
-            font=ImageFont.truetype(
-                "/usr/share/fonts/truetype/inconsolata/Inconsolata.otf", 24  # absolute font path; presumably the test needs this font installed
-            ),
-        )
-        # test a plain image
-        data = StringIO()
-        test_image.save(data, "png")
-        result = self.env["ir.attachment"]._index(data.getvalue(), "test.png", None)
-        self.assertEqual(result[1].strip(), "Hello world")  # result[1] is the indexed content
-        # should also work for pdfs if supported, protect against
-        # ancient pillows
-        if (
-            hasattr(Image, "registered_extensions")
-            and "PDF" in Image.registered_extensions().values()
-        ):
-            data = StringIO()
-            test_image.save(data, "pdf", resolution=300)
-            result = self.env["ir.attachment"]._index(data.getvalue(), "test.pdf", None)
-            self.assertEqual(result[1].strip(), "Hello world")
-        # check cron
-        self.env["ir.config_parameter"].set_param("document_ocr.synchronous", "False")  # back to deferred mode so create() stores the marker
-        attachment = self.env["ir.attachment"].create(
-            {
-                "name": "testattachment",
-                "datas": data.getvalue().encode("base64"),  # NOTE(review): the "base64" str codec is Python 2 only
-            }
-        )
-        self.assertEqual(attachment.index_content, _MARKER_PHRASE)
-        attachment._ocr_cron()  # replaces the marker with the OCR result
-        self.assertEqual(attachment.index_content.strip(), "Hello world")
-        # and for an unreadable image, we expect an error
-        if (
-            hasattr(Image, "registered_extensions")
-            and "PALM" in Image.registered_extensions().values()
-        ):
-            self.env["ir.config_parameter"].set_param(
-                "document_ocr.synchronous", "True"
-            )
-            data = StringIO()
-            test_image = Image.new("1", (200, 30))
-            test_image.save(data, "Palm")
-            with mute_logger("openerp.addons.document_ocr.models.ir_attachment"):  # NOTE(review): logger name uses the "openerp" prefix while imports use "odoo" - confirm it still matches
-                result = self.env["ir.attachment"]._index(
-                    data.getvalue(), "test.palm", None
-                )
-            self.assertEqual(result[1], None)  # the test expects no content for an image that cannot be read
--- a/setup/attachment_indexation_ocr/odoo/addons/attachment_indexation_ocr
+++ b/setup/attachment_indexation_ocr/odoo/addons/attachment_indexation_ocr
@ -0,0 +1 @@
+../../../../attachment_indexation_ocr
--- a/setup/attachment_indexation_ocr/setup.py
+++ b/setup/attachment_indexation_ocr/setup.py
--- a/setup/document_ocr/odoo/addons/document_ocr
+++ b/setup/document_ocr/odoo/addons/document_ocr
@ -1 +0,0 @@
-../../../../document_ocr