mirror of
https://github.com/OCA/knowledge.git
synced 2025-07-27 19:08:42 -06:00
[MIG] document_ocr -> attachment_indexation_ocr
This commit is contained in:
parent
f1f13f1e8b
commit
5edbe1685b
107
attachment_indexation_ocr/README.rst
Normal file
107
attachment_indexation_ocr/README.rst
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
=================
|
||||||
|
OCR for documents
|
||||||
|
=================
|
||||||
|
|
||||||
|
..
|
||||||
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
!! This file is generated by oca-gen-addon-readme !!
|
||||||
|
!! changes will be overwritten. !!
|
||||||
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
!! source digest: sha256:488ceb3b031015c08770a769f1357f5dcd462d28eaca37048790a61ef9a5feab
|
||||||
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
|
||||||
|
.. |badge1| image:: https://img.shields.io/badge/maturity-Beta-yellow.png
|
||||||
|
:target: https://odoo-community.org/page/development-status
|
||||||
|
:alt: Beta
|
||||||
|
.. |badge2| image:: https://img.shields.io/badge/licence-AGPL--3-blue.png
|
||||||
|
:target: http://www.gnu.org/licenses/agpl-3.0-standalone.html
|
||||||
|
:alt: License: AGPL-3
|
||||||
|
.. |badge3| image:: https://img.shields.io/badge/github-OCA%2Fknowledge-lightgray.png?logo=github
|
||||||
|
:target: https://github.com/OCA/knowledge/tree/16.0/attachment_indexation_ocr
|
||||||
|
:alt: OCA/knowledge
|
||||||
|
.. |badge4| image:: https://img.shields.io/badge/weblate-Translate%20me-F47D42.png
|
||||||
|
:target: https://translation.odoo-community.org/projects/knowledge-16-0/knowledge-16-0-attachment_indexation_ocr
|
||||||
|
:alt: Translate me on Weblate
|
||||||
|
.. |badge5| image:: https://img.shields.io/badge/runboat-Try%20me-875A7B.png
|
||||||
|
:target: https://runboat.odoo-community.org/builds?repo=OCA/knowledge&target_branch=16.0
|
||||||
|
:alt: Try me on Runboat
|
||||||
|
|
||||||
|
|badge1| |badge2| |badge3| |badge4| |badge5|
|
||||||
|
|
||||||
|
This module was written to make uploaded documents, for example scans, searchable by running OCR on them.
|
||||||
|
|
||||||
|
It supports all image formats `Pillow supports <http://pillow.readthedocs.io/en/3.2.x/handbook/image-file-formats.html>`_ for reading and PDFs.
|
||||||
|
|
||||||
|
**Table of contents**
|
||||||
|
|
||||||
|
.. contents::
|
||||||
|
:local:
|
||||||
|
|
||||||
|
Installation
|
||||||
|
============
|
||||||
|
|
||||||
|
To install this module, you need to:
|
||||||
|
|
||||||
|
#. install tesseract and the language(s) your documents use
|
||||||
|
#. if you want to support OCR on PDFs, install imagemagick
|
||||||
|
#. install the module itself
|
||||||
|
|
||||||
|
On a Debian or Ubuntu system you would typically run::
|
||||||
|
|
||||||
|
$ sudo apt-get install tesseract-ocr imagemagick
|
||||||
|
|
||||||
|
Configuration
|
||||||
|
=============
|
||||||
|
|
||||||
|
To configure this module, go to:
|
||||||
|
|
||||||
|
#. Settings/Technical/Parameters/System parameters and review the parameters with names ocr.*
|
||||||
|
|
||||||
|
Usage
|
||||||
|
=====
|
||||||
|
|
||||||
|
By default, character recognition is done asynchronously by a cronjob at night.
|
||||||
|
This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish.
|
||||||
|
The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ``Settings``.
|
||||||
|
In case you want to force the OCR to be done immediately, set configuration parameter ``ocr.synchronous`` to value ``True``.
|
||||||
|
|
||||||
|
Bug Tracker
|
||||||
|
===========
|
||||||
|
|
||||||
|
Bugs are tracked on `GitHub Issues <https://github.com/OCA/knowledge/issues>`_.
|
||||||
|
In case of trouble, please check there if your issue has already been reported.
|
||||||
|
If you spotted it first, help us to smash it by providing a detailed and welcomed
|
||||||
|
`feedback <https://github.com/OCA/knowledge/issues/new?body=module:%20attachment_indexation_ocr%0Aversion:%2016.0%0A%0A**Steps%20to%20reproduce**%0A-%20...%0A%0A**Current%20behavior**%0A%0A**Expected%20behavior**>`_.
|
||||||
|
|
||||||
|
Do not contact contributors directly about support or help with technical issues.
|
||||||
|
|
||||||
|
Credits
|
||||||
|
=======
|
||||||
|
|
||||||
|
Authors
|
||||||
|
~~~~~~~
|
||||||
|
|
||||||
|
* Therp BV
|
||||||
|
|
||||||
|
Contributors
|
||||||
|
~~~~~~~~~~~~
|
||||||
|
|
||||||
|
* Holger Brunn <hbrunn@therp.nl>
|
||||||
|
* len-foss <nans.lefebvre@gmail.com>
|
||||||
|
|
||||||
|
Maintainers
|
||||||
|
~~~~~~~~~~~
|
||||||
|
|
||||||
|
This module is maintained by the OCA.
|
||||||
|
|
||||||
|
.. image:: https://odoo-community.org/logo.png
|
||||||
|
:alt: Odoo Community Association
|
||||||
|
:target: https://odoo-community.org
|
||||||
|
|
||||||
|
OCA, or the Odoo Community Association, is a nonprofit organization whose
|
||||||
|
mission is to support the collaborative development of Odoo features and
|
||||||
|
promote its widespread use.
|
||||||
|
|
||||||
|
This module is part of the `OCA/knowledge <https://github.com/OCA/knowledge/tree/16.0/attachment_indexation_ocr>`_ project on GitHub.
|
||||||
|
|
||||||
|
You are welcome to contribute. To learn how please visit https://odoo-community.org/page/Contribute.
|
@ -5,18 +5,13 @@
|
|||||||
"version": "16.0.1.0.0",
|
"version": "16.0.1.0.0",
|
||||||
"author": "Therp BV,Odoo Community Association (OCA)",
|
"author": "Therp BV,Odoo Community Association (OCA)",
|
||||||
"license": "AGPL-3",
|
"license": "AGPL-3",
|
||||||
|
"website": "https://github.com/OCA/knowledge",
|
||||||
"category": "Knowledge Management",
|
"category": "Knowledge Management",
|
||||||
"summary": "Run character recognition on uploaded files",
|
"summary": "Run character recognition on uploaded files",
|
||||||
"depends": [
|
"depends": ["attachment_indexation"],
|
||||||
"document",
|
|
||||||
],
|
|
||||||
"data": [
|
"data": [
|
||||||
"data/ir_cron.xml",
|
"data/ir_cron.xml",
|
||||||
"data/ir_config_parameter.xml",
|
"data/ir_config_parameter.xml",
|
||||||
],
|
],
|
||||||
"external_dependencies": {
|
"external_dependencies": {"bin": ["tesseract"]},
|
||||||
"bin": [
|
|
||||||
"tesseract",
|
|
||||||
],
|
|
||||||
},
|
|
||||||
}
|
}
|
@ -1,11 +1,11 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8" ?>
|
<?xml version="1.0" encoding="UTF-8" ?>
|
||||||
<odoo noupdate="1">
|
<odoo noupdate="1">
|
||||||
<record id="param_synchronous" model="ir.config_parameter">
|
<record id="param_synchronous" model="ir.config_parameter">
|
||||||
<field name="key">document_ocr.synchronous</field>
|
<field name="key">ocr.synchronous</field>
|
||||||
<field name="value">False</field>
|
<field name="value">False</field>
|
||||||
</record>
|
</record>
|
||||||
<record id="param_dpi" model="ir.config_parameter">
|
<record id="param_dpi" model="ir.config_parameter">
|
||||||
<field name="key">document_ocr.dpi</field>
|
<field name="key">ocr.dpi</field>
|
||||||
<field name="value">300</field>
|
<field name="value">300</field>
|
||||||
</record>
|
</record>
|
||||||
</odoo>
|
</odoo>
|
@ -4,9 +4,9 @@
|
|||||||
<field name="name">Run OCR on uploaded documents</field>
|
<field name="name">Run OCR on uploaded documents</field>
|
||||||
<field name="interval_type">days</field>
|
<field name="interval_type">days</field>
|
||||||
<field name="interval_number">1</field>
|
<field name="interval_number">1</field>
|
||||||
<field name="model">ir.attachment</field>
|
<field name="model_id" ref="model_ir_attachment" />
|
||||||
<field name="function">_ocr_cron</field>
|
<field name="state">code</field>
|
||||||
|
<field name="code">model._ocr_cron(limit=100)</field>
|
||||||
<field name="numbercall">-1</field>
|
<field name="numbercall">-1</field>
|
||||||
<field name="args">(100,)</field>
|
|
||||||
</record>
|
</record>
|
||||||
</odoo>
|
</odoo>
|
100
attachment_indexation_ocr/models/ir_attachment.py
Normal file
100
attachment_indexation_ocr/models/ir_attachment.py
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
# © 2016 Therp BV <http://therp.nl>
|
||||||
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import logging
|
||||||
|
import subprocess
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from odoo import api, models
|
||||||
|
|
||||||
|
# Module-level logger named after this module.
_logger = logging.getLogger(__name__)
|
||||||
|
# Placeholder stored as index content for attachments that still await OCR;
# the cron job searches for attachments whose index content equals this value.
_MARKER_PHRASE = "[[waiting for OCR]]"
|
||||||
|
|
||||||
|
|
||||||
|
class IrAttachment(models.Model):
    """Extend ``ir.attachment`` indexing with OCR done by tesseract.

    When the inherited indexer yields no usable text, the attachment is
    either recognised right away or tagged with ``_MARKER_PHRASE`` so that
    ``_ocr_cron`` can process it later.
    """

    _inherit = "ir.attachment"

    @api.model
    def _get_no_content_strings(self):
        """Return the index values that count as "no real content"."""
        return ["image", "application"]

    @api.model
    def _not_content(self, text):
        """Tell whether ``text`` is empty or one of the no-content strings."""
        return not text or text in self._get_no_content_strings()

    @api.model
    def _index(self, bin_data, file_type, checksum=None):
        """Index an attachment, falling back to OCR when no text was found.

        The inherited indexer runs first. Only when it returns nothing
        usable (see ``_not_content``) and both ``bin_data`` and
        ``file_type`` are set does OCR come into play:

        * system parameter ``ocr.synchronous`` equal to ``"True"`` or a
          truthy ``ocr_force`` context key: OCR runs immediately;
        * otherwise ``_MARKER_PHRASE`` is returned so the attachment can be
          found by ``_ocr_cron`` later.

        :return: the inherited index content, the OCR result, or
            ``_MARKER_PHRASE``
        """
        content = super()._index(bin_data, file_type, checksum)
        if bin_data and file_type and self._not_content(content):
            synchronous = self.env["ir.config_parameter"].get_param("ocr.synchronous")
            if synchronous == "True" or self.env.context.get("ocr_force"):
                content = self._index_ocr(bin_data, file_type)
            else:
                content = _MARKER_PHRASE
        return content

    @api.model
    def _index_ocr(self, bin_data, file_type, dpi=0):
        """Run tesseract on ``bin_data`` and return the recognised text.

        :param bin_data: decoded (not base64) file content
        :param file_type: mimetype of the data; must contain a ``/``
        :param dpi: resolution handed to the image conversion; when falsy,
            system parameter ``ocr.dpi`` is read (fallback ``500``)
        :return: tesseract's standard output decoded as UTF-8, or ``None``
            when the mimetype is invalid or the image cannot be read
        """
        if not dpi:
            icp = self.env["ir.config_parameter"]
            dpi = int(icp.get_param("ocr.dpi", "500"))
        if "/" not in file_type:
            _logger.warning("Invalid mimetype %s", file_type)
            return None
        top_type, sub_type = file_type.split("/", 1)
        if sub_type == "pdf":
            # tesseract only supports image of at most 32K pixels
            # depending on the number of pages, we have to either split
            # into different batches or reduce the dpi;
            # The maximum width and height are 32767.
            image_data = self._index_ocr_get_data_pdf(bin_data, dpi)  # TODO
        else:
            # Anything that is not a PDF is handed to Pillow and re-encoded
            # as PNG before being piped to tesseract.
            image_data = BytesIO()
            try:
                i = Image.open(BytesIO(bin_data))
                i.save(image_data, "png", dpi=(dpi, dpi))
            except IOError:
                _logger.exception("Failed to OCR image")
                return None
        process = subprocess.Popen(
            ["tesseract", "stdin", "stdout"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout, stderr = process.communicate(image_data.getvalue())
        if process.returncode:
            # A failure is only logged; whatever tesseract printed on its
            # standard output is still returned below.
            _logger.error("Error during OCR: %s", stderr)
        return stdout.decode("utf-8")

    @api.model
    def _index_ocr_get_data_pdf(self, bin_data, dpi):
        """Convert a PDF into a single PNG using ImageMagick's ``convert``.

        The PDF is piped to ``convert`` on stdin; ``-append`` joins all
        pages into one image that is read back from stdout.

        :param bin_data: decoded PDF content
        :param dpi: value passed to ``convert -density``
        :return: ``BytesIO`` holding the PNG data; empty when ``convert``
            could not be started
        """
        try:
            process = subprocess.Popen(
                ["convert", "-density", str(dpi), "-", "-append", "png32:-"],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except OSError:
            # ImageMagick is an optional dependency of this module, so a
            # missing binary must not abort the upload or the cron batch.
            _logger.exception("Failed to run ImageMagick's convert")
            return BytesIO()
        stdout, stderr = process.communicate(bin_data)
        if stderr:
            _logger.error("Error converting PDF to image: %s", stderr)
        return BytesIO(stdout)

    @api.model
    def _ocr_cron(self, limit=None):
        """Run OCR on attachments still carrying ``_MARKER_PHRASE``.

        :param limit: maximum number of attachments fetched by the search;
            ``None`` passes no limit on
        """
        domain = [("index_content", "=", _MARKER_PHRASE)]
        recs = self.with_context(ocr_force=True).search(domain, limit=limit)
        recs.perform_ocr()

    def perform_ocr(self):
        """Re-index every record in ``self`` with OCR forced.

        Records without data get an empty index content; all others are
        passed through ``_index`` with ``ocr_force`` set in the context.
        The result is written to ``index_content`` record by record.
        """
        for rec in self:
            if not rec.datas:
                index_content = ""  # the _MARKER_PHRASE should be removed
            else:
                bin_data = base64.b64decode(rec.datas)
                ctx = {"ocr_force": True}
                index_content = rec.with_context(**ctx)._index(bin_data, rec.mimetype)
            rec.write({"index_content": index_content})
|
3
attachment_indexation_ocr/readme/CONFIGURE.rst
Normal file
3
attachment_indexation_ocr/readme/CONFIGURE.rst
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
To configure this module, go to:
|
||||||
|
|
||||||
|
#. Settings/Technical/Parameters/System parameters and review the parameters with names ocr.*
|
2
attachment_indexation_ocr/readme/CONTRIBUTORS.rst
Normal file
2
attachment_indexation_ocr/readme/CONTRIBUTORS.rst
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
* Holger Brunn <hbrunn@therp.nl>
|
||||||
|
* len-foss <nans.lefebvre@gmail.com>
|
3
attachment_indexation_ocr/readme/DESCRIPTION.rst
Normal file
3
attachment_indexation_ocr/readme/DESCRIPTION.rst
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
This module was written to make uploaded documents, for example scans, searchable by running OCR on them.
|
||||||
|
|
||||||
|
It supports all image formats `Pillow supports <http://pillow.readthedocs.io/en/3.2.x/handbook/image-file-formats.html>`_ for reading and PDFs.
|
9
attachment_indexation_ocr/readme/INSTALL.rst
Normal file
9
attachment_indexation_ocr/readme/INSTALL.rst
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
To install this module, you need to:
|
||||||
|
|
||||||
|
#. install tesseract and the language(s) your documents use
|
||||||
|
#. if you want to support OCR on PDFs, install imagemagick
|
||||||
|
#. install the module itself
|
||||||
|
|
||||||
|
On a Debian or Ubuntu system you would typically run::
|
||||||
|
|
||||||
|
$ sudo apt-get install tesseract-ocr imagemagick
|
4
attachment_indexation_ocr/readme/USAGE.rst
Normal file
4
attachment_indexation_ocr/readme/USAGE.rst
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
By default, character recognition is done asynchronously by a cronjob at night.
|
||||||
|
This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish.
|
||||||
|
The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ``Settings``.
|
||||||
|
In case you want to force the OCR to be done immediately, set configuration parameter ``ocr.synchronous`` to value ``True``.
|
Before Width: | Height: | Size: 9.2 KiB After Width: | Height: | Size: 9.2 KiB |
453
attachment_indexation_ocr/static/description/index.html
Normal file
453
attachment_indexation_ocr/static/description/index.html
Normal file
@ -0,0 +1,453 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8" ?>
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
||||||
|
<head>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
||||||
|
<meta name="generator" content="Docutils: http://docutils.sourceforge.net/" />
|
||||||
|
<title>OCR for documents</title>
|
||||||
|
<style type="text/css">
|
||||||
|
|
||||||
|
/*
|
||||||
|
:Author: David Goodger (goodger@python.org)
|
||||||
|
:Id: $Id: html4css1.css 7952 2016-07-26 18:15:59Z milde $
|
||||||
|
:Copyright: This stylesheet has been placed in the public domain.
|
||||||
|
|
||||||
|
Default cascading style sheet for the HTML output of Docutils.
|
||||||
|
|
||||||
|
See http://docutils.sf.net/docs/howto/html-stylesheets.html for how to
|
||||||
|
customize this style sheet.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* used to remove borders from tables and images */
|
||||||
|
.borderless, table.borderless td, table.borderless th {
|
||||||
|
border: 0 }
|
||||||
|
|
||||||
|
table.borderless td, table.borderless th {
|
||||||
|
/* Override padding for "table.docutils td" with "! important".
|
||||||
|
The right padding separates the table cells. */
|
||||||
|
padding: 0 0.5em 0 0 ! important }
|
||||||
|
|
||||||
|
.first {
|
||||||
|
/* Override more specific margin styles with "! important". */
|
||||||
|
margin-top: 0 ! important }
|
||||||
|
|
||||||
|
.last, .with-subtitle {
|
||||||
|
margin-bottom: 0 ! important }
|
||||||
|
|
||||||
|
.hidden {
|
||||||
|
display: none }
|
||||||
|
|
||||||
|
.subscript {
|
||||||
|
vertical-align: sub;
|
||||||
|
font-size: smaller }
|
||||||
|
|
||||||
|
.superscript {
|
||||||
|
vertical-align: super;
|
||||||
|
font-size: smaller }
|
||||||
|
|
||||||
|
a.toc-backref {
|
||||||
|
text-decoration: none ;
|
||||||
|
color: black }
|
||||||
|
|
||||||
|
blockquote.epigraph {
|
||||||
|
margin: 2em 5em ; }
|
||||||
|
|
||||||
|
dl.docutils dd {
|
||||||
|
margin-bottom: 0.5em }
|
||||||
|
|
||||||
|
object[type="image/svg+xml"], object[type="application/x-shockwave-flash"] {
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Uncomment (and remove this text!) to get bold-faced definition list terms
|
||||||
|
dl.docutils dt {
|
||||||
|
font-weight: bold }
|
||||||
|
*/
|
||||||
|
|
||||||
|
div.abstract {
|
||||||
|
margin: 2em 5em }
|
||||||
|
|
||||||
|
div.abstract p.topic-title {
|
||||||
|
font-weight: bold ;
|
||||||
|
text-align: center }
|
||||||
|
|
||||||
|
div.admonition, div.attention, div.caution, div.danger, div.error,
|
||||||
|
div.hint, div.important, div.note, div.tip, div.warning {
|
||||||
|
margin: 2em ;
|
||||||
|
border: medium outset ;
|
||||||
|
padding: 1em }
|
||||||
|
|
||||||
|
div.admonition p.admonition-title, div.hint p.admonition-title,
|
||||||
|
div.important p.admonition-title, div.note p.admonition-title,
|
||||||
|
div.tip p.admonition-title {
|
||||||
|
font-weight: bold ;
|
||||||
|
font-family: sans-serif }
|
||||||
|
|
||||||
|
div.attention p.admonition-title, div.caution p.admonition-title,
|
||||||
|
div.danger p.admonition-title, div.error p.admonition-title,
|
||||||
|
div.warning p.admonition-title, .code .error {
|
||||||
|
color: red ;
|
||||||
|
font-weight: bold ;
|
||||||
|
font-family: sans-serif }
|
||||||
|
|
||||||
|
/* Uncomment (and remove this text!) to get reduced vertical space in
|
||||||
|
compound paragraphs.
|
||||||
|
div.compound .compound-first, div.compound .compound-middle {
|
||||||
|
margin-bottom: 0.5em }
|
||||||
|
|
||||||
|
div.compound .compound-last, div.compound .compound-middle {
|
||||||
|
margin-top: 0.5em }
|
||||||
|
*/
|
||||||
|
|
||||||
|
div.dedication {
|
||||||
|
margin: 2em 5em ;
|
||||||
|
text-align: center ;
|
||||||
|
font-style: italic }
|
||||||
|
|
||||||
|
div.dedication p.topic-title {
|
||||||
|
font-weight: bold ;
|
||||||
|
font-style: normal }
|
||||||
|
|
||||||
|
div.figure {
|
||||||
|
margin-left: 2em ;
|
||||||
|
margin-right: 2em }
|
||||||
|
|
||||||
|
div.footer, div.header {
|
||||||
|
clear: both;
|
||||||
|
font-size: smaller }
|
||||||
|
|
||||||
|
div.line-block {
|
||||||
|
display: block ;
|
||||||
|
margin-top: 1em ;
|
||||||
|
margin-bottom: 1em }
|
||||||
|
|
||||||
|
div.line-block div.line-block {
|
||||||
|
margin-top: 0 ;
|
||||||
|
margin-bottom: 0 ;
|
||||||
|
margin-left: 1.5em }
|
||||||
|
|
||||||
|
div.sidebar {
|
||||||
|
margin: 0 0 0.5em 1em ;
|
||||||
|
border: medium outset ;
|
||||||
|
padding: 1em ;
|
||||||
|
background-color: #ffffee ;
|
||||||
|
width: 40% ;
|
||||||
|
float: right ;
|
||||||
|
clear: right }
|
||||||
|
|
||||||
|
div.sidebar p.rubric {
|
||||||
|
font-family: sans-serif ;
|
||||||
|
font-size: medium }
|
||||||
|
|
||||||
|
div.system-messages {
|
||||||
|
margin: 5em }
|
||||||
|
|
||||||
|
div.system-messages h1 {
|
||||||
|
color: red }
|
||||||
|
|
||||||
|
div.system-message {
|
||||||
|
border: medium outset ;
|
||||||
|
padding: 1em }
|
||||||
|
|
||||||
|
div.system-message p.system-message-title {
|
||||||
|
color: red ;
|
||||||
|
font-weight: bold }
|
||||||
|
|
||||||
|
div.topic {
|
||||||
|
margin: 2em }
|
||||||
|
|
||||||
|
h1.section-subtitle, h2.section-subtitle, h3.section-subtitle,
|
||||||
|
h4.section-subtitle, h5.section-subtitle, h6.section-subtitle {
|
||||||
|
margin-top: 0.4em }
|
||||||
|
|
||||||
|
h1.title {
|
||||||
|
text-align: center }
|
||||||
|
|
||||||
|
h2.subtitle {
|
||||||
|
text-align: center }
|
||||||
|
|
||||||
|
hr.docutils {
|
||||||
|
width: 75% }
|
||||||
|
|
||||||
|
img.align-left, .figure.align-left, object.align-left, table.align-left {
|
||||||
|
clear: left ;
|
||||||
|
float: left ;
|
||||||
|
margin-right: 1em }
|
||||||
|
|
||||||
|
img.align-right, .figure.align-right, object.align-right, table.align-right {
|
||||||
|
clear: right ;
|
||||||
|
float: right ;
|
||||||
|
margin-left: 1em }
|
||||||
|
|
||||||
|
img.align-center, .figure.align-center, object.align-center {
|
||||||
|
display: block;
|
||||||
|
margin-left: auto;
|
||||||
|
margin-right: auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
table.align-center {
|
||||||
|
margin-left: auto;
|
||||||
|
margin-right: auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
.align-left {
|
||||||
|
text-align: left }
|
||||||
|
|
||||||
|
.align-center {
|
||||||
|
clear: both ;
|
||||||
|
text-align: center }
|
||||||
|
|
||||||
|
.align-right {
|
||||||
|
text-align: right }
|
||||||
|
|
||||||
|
/* reset inner alignment in figures */
|
||||||
|
div.align-right {
|
||||||
|
text-align: inherit }
|
||||||
|
|
||||||
|
/* div.align-center * { */
|
||||||
|
/* text-align: left } */
|
||||||
|
|
||||||
|
.align-top {
|
||||||
|
vertical-align: top }
|
||||||
|
|
||||||
|
.align-middle {
|
||||||
|
vertical-align: middle }
|
||||||
|
|
||||||
|
.align-bottom {
|
||||||
|
vertical-align: bottom }
|
||||||
|
|
||||||
|
ol.simple, ul.simple {
|
||||||
|
margin-bottom: 1em }
|
||||||
|
|
||||||
|
ol.arabic {
|
||||||
|
list-style: decimal }
|
||||||
|
|
||||||
|
ol.loweralpha {
|
||||||
|
list-style: lower-alpha }
|
||||||
|
|
||||||
|
ol.upperalpha {
|
||||||
|
list-style: upper-alpha }
|
||||||
|
|
||||||
|
ol.lowerroman {
|
||||||
|
list-style: lower-roman }
|
||||||
|
|
||||||
|
ol.upperroman {
|
||||||
|
list-style: upper-roman }
|
||||||
|
|
||||||
|
p.attribution {
|
||||||
|
text-align: right ;
|
||||||
|
margin-left: 50% }
|
||||||
|
|
||||||
|
p.caption {
|
||||||
|
font-style: italic }
|
||||||
|
|
||||||
|
p.credits {
|
||||||
|
font-style: italic ;
|
||||||
|
font-size: smaller }
|
||||||
|
|
||||||
|
p.label {
|
||||||
|
white-space: nowrap }
|
||||||
|
|
||||||
|
p.rubric {
|
||||||
|
font-weight: bold ;
|
||||||
|
font-size: larger ;
|
||||||
|
color: maroon ;
|
||||||
|
text-align: center }
|
||||||
|
|
||||||
|
p.sidebar-title {
|
||||||
|
font-family: sans-serif ;
|
||||||
|
font-weight: bold ;
|
||||||
|
font-size: larger }
|
||||||
|
|
||||||
|
p.sidebar-subtitle {
|
||||||
|
font-family: sans-serif ;
|
||||||
|
font-weight: bold }
|
||||||
|
|
||||||
|
p.topic-title {
|
||||||
|
font-weight: bold }
|
||||||
|
|
||||||
|
pre.address {
|
||||||
|
margin-bottom: 0 ;
|
||||||
|
margin-top: 0 ;
|
||||||
|
font: inherit }
|
||||||
|
|
||||||
|
pre.literal-block, pre.doctest-block, pre.math, pre.code {
|
||||||
|
margin-left: 2em ;
|
||||||
|
margin-right: 2em }
|
||||||
|
|
||||||
|
pre.code .ln { color: grey; } /* line numbers */
|
||||||
|
pre.code, code { background-color: #eeeeee }
|
||||||
|
pre.code .comment, code .comment { color: #5C6576 }
|
||||||
|
pre.code .keyword, code .keyword { color: #3B0D06; font-weight: bold }
|
||||||
|
pre.code .literal.string, code .literal.string { color: #0C5404 }
|
||||||
|
pre.code .name.builtin, code .name.builtin { color: #352B84 }
|
||||||
|
pre.code .deleted, code .deleted { background-color: #DEB0A1}
|
||||||
|
pre.code .inserted, code .inserted { background-color: #A3D289}
|
||||||
|
|
||||||
|
span.classifier {
|
||||||
|
font-family: sans-serif ;
|
||||||
|
font-style: oblique }
|
||||||
|
|
||||||
|
span.classifier-delimiter {
|
||||||
|
font-family: sans-serif ;
|
||||||
|
font-weight: bold }
|
||||||
|
|
||||||
|
span.interpreted {
|
||||||
|
font-family: sans-serif }
|
||||||
|
|
||||||
|
span.option {
|
||||||
|
white-space: nowrap }
|
||||||
|
|
||||||
|
span.pre {
|
||||||
|
white-space: pre }
|
||||||
|
|
||||||
|
span.problematic {
|
||||||
|
color: red }
|
||||||
|
|
||||||
|
span.section-subtitle {
|
||||||
|
/* font-size relative to parent (h1..h6 element) */
|
||||||
|
font-size: 80% }
|
||||||
|
|
||||||
|
table.citation {
|
||||||
|
border-left: solid 1px gray;
|
||||||
|
margin-left: 1px }
|
||||||
|
|
||||||
|
table.docinfo {
|
||||||
|
margin: 2em 4em }
|
||||||
|
|
||||||
|
table.docutils {
|
||||||
|
margin-top: 0.5em ;
|
||||||
|
margin-bottom: 0.5em }
|
||||||
|
|
||||||
|
table.footnote {
|
||||||
|
border-left: solid 1px black;
|
||||||
|
margin-left: 1px }
|
||||||
|
|
||||||
|
table.docutils td, table.docutils th,
|
||||||
|
table.docinfo td, table.docinfo th {
|
||||||
|
padding-left: 0.5em ;
|
||||||
|
padding-right: 0.5em ;
|
||||||
|
vertical-align: top }
|
||||||
|
|
||||||
|
table.docutils th.field-name, table.docinfo th.docinfo-name {
|
||||||
|
font-weight: bold ;
|
||||||
|
text-align: left ;
|
||||||
|
white-space: nowrap ;
|
||||||
|
padding-left: 0 }
|
||||||
|
|
||||||
|
/* "booktabs" style (no vertical lines) */
|
||||||
|
table.docutils.booktabs {
|
||||||
|
border: 0px;
|
||||||
|
border-top: 2px solid;
|
||||||
|
border-bottom: 2px solid;
|
||||||
|
border-collapse: collapse;
|
||||||
|
}
|
||||||
|
table.docutils.booktabs * {
|
||||||
|
border: 0px;
|
||||||
|
}
|
||||||
|
table.docutils.booktabs th {
|
||||||
|
border-bottom: thin solid;
|
||||||
|
text-align: left;
|
||||||
|
}
|
||||||
|
|
||||||
|
h1 tt.docutils, h2 tt.docutils, h3 tt.docutils,
|
||||||
|
h4 tt.docutils, h5 tt.docutils, h6 tt.docutils {
|
||||||
|
font-size: 100% }
|
||||||
|
|
||||||
|
ul.auto-toc {
|
||||||
|
list-style-type: none }
|
||||||
|
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="document" id="ocr-for-documents">
|
||||||
|
<h1 class="title">OCR for documents</h1>
|
||||||
|
|
||||||
|
<!-- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
!! This file is generated by oca-gen-addon-readme !!
|
||||||
|
!! changes will be overwritten. !!
|
||||||
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
!! source digest: sha256:488ceb3b031015c08770a769f1357f5dcd462d28eaca37048790a61ef9a5feab
|
||||||
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -->
|
||||||
|
<p><a class="reference external" href="https://odoo-community.org/page/development-status"><img alt="Beta" src="https://img.shields.io/badge/maturity-Beta-yellow.png" /></a> <a class="reference external" href="http://www.gnu.org/licenses/agpl-3.0-standalone.html"><img alt="License: AGPL-3" src="https://img.shields.io/badge/licence-AGPL--3-blue.png" /></a> <a class="reference external" href="https://github.com/OCA/knowledge/tree/16.0/attachment_indexation_ocr"><img alt="OCA/knowledge" src="https://img.shields.io/badge/github-OCA%2Fknowledge-lightgray.png?logo=github" /></a> <a class="reference external" href="https://translation.odoo-community.org/projects/knowledge-16-0/knowledge-16-0-attachment_indexation_ocr"><img alt="Translate me on Weblate" src="https://img.shields.io/badge/weblate-Translate%20me-F47D42.png" /></a> <a class="reference external" href="https://runboat.odoo-community.org/builds?repo=OCA/knowledge&target_branch=16.0"><img alt="Try me on Runboat" src="https://img.shields.io/badge/runboat-Try%20me-875A7B.png" /></a></p>
|
||||||
|
<p>This module was written to make uploaded documents, for example scans, searchable by running OCR on them.</p>
|
||||||
|
<p>It supports all image formats <a class="reference external" href="http://pillow.readthedocs.io/en/3.2.x/handbook/image-file-formats.html">Pillow supports</a> for reading and PDFs.</p>
|
||||||
|
<p><strong>Table of contents</strong></p>
|
||||||
|
<div class="contents local topic" id="contents">
|
||||||
|
<ul class="simple">
|
||||||
|
<li><a class="reference internal" href="#installation" id="id1">Installation</a></li>
|
||||||
|
<li><a class="reference internal" href="#configuration" id="id2">Configuration</a></li>
|
||||||
|
<li><a class="reference internal" href="#usage" id="id3">Usage</a></li>
|
||||||
|
<li><a class="reference internal" href="#bug-tracker" id="id4">Bug Tracker</a></li>
|
||||||
|
<li><a class="reference internal" href="#credits" id="id5">Credits</a><ul>
|
||||||
|
<li><a class="reference internal" href="#authors" id="id6">Authors</a></li>
|
||||||
|
<li><a class="reference internal" href="#contributors" id="id7">Contributors</a></li>
|
||||||
|
<li><a class="reference internal" href="#maintainers" id="id8">Maintainers</a></li>
|
||||||
|
</ul>
|
||||||
|
</li>
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
<div class="section" id="installation">
|
||||||
|
<h1><a class="toc-backref" href="#id1">Installation</a></h1>
|
||||||
|
<p>To install this module, you need to:</p>
|
||||||
|
<ol class="arabic simple">
|
||||||
|
<li>install tesseract and the language(s) your documents use</li>
|
||||||
|
<li>if you want to support OCR on PDFs, install imagemagick</li>
|
||||||
|
<li>install the module itself</li>
|
||||||
|
</ol>
|
||||||
|
<p>On a Debian or Ubuntu system you would typically run:</p>
|
||||||
|
<pre class="literal-block">
|
||||||
|
$ sudo apt-get install tesseract-ocr imagemagick
|
||||||
|
</pre>
|
||||||
|
</div>
|
||||||
|
<div class="section" id="configuration">
|
||||||
|
<h1><a class="toc-backref" href="#id2">Configuration</a></h1>
|
||||||
|
<p>To configure this module, go to:</p>
|
||||||
|
<ol class="arabic simple">
|
||||||
|
<li>Settings/Technical/Parameters/System parameters and review the parameters with names ocr.*</li>
|
||||||
|
</ol>
|
||||||
|
</div>
|
||||||
|
<div class="section" id="usage">
|
||||||
|
<h1><a class="toc-backref" href="#id3">Usage</a></h1>
|
||||||
|
<p>By default, character recognition is done asynchronously by a cronjob at night.
|
||||||
|
This is because the recognition process takes a while and you don’t want to make your users wait for the indexation to finish.
|
||||||
|
The interval to run the cronjob can be adjusted to your needs in the <tt class="docutils literal">Scheduled Actions</tt> menu, under <tt class="docutils literal">Settings</tt>.
|
||||||
|
In case you want to force the OCR to be done immediately, set configuration parameter <tt class="docutils literal">ocr.synchronous</tt> to value <tt class="docutils literal">True</tt>.</p>
|
||||||
|
</div>
|
||||||
|
<div class="section" id="bug-tracker">
|
||||||
|
<h1><a class="toc-backref" href="#id4">Bug Tracker</a></h1>
|
||||||
|
<p>Bugs are tracked on <a class="reference external" href="https://github.com/OCA/knowledge/issues">GitHub Issues</a>.
|
||||||
|
In case of trouble, please check there if your issue has already been reported.
|
||||||
|
If you spotted it first, help us to smash it by providing a detailed and welcomed
|
||||||
|
<a class="reference external" href="https://github.com/OCA/knowledge/issues/new?body=module:%20attachment_indexation_ocr%0Aversion:%2016.0%0A%0A**Steps%20to%20reproduce**%0A-%20...%0A%0A**Current%20behavior**%0A%0A**Expected%20behavior**">feedback</a>.</p>
|
||||||
|
<p>Do not contact contributors directly about support or help with technical issues.</p>
|
||||||
|
</div>
|
||||||
|
<div class="section" id="credits">
|
||||||
|
<h1><a class="toc-backref" href="#id5">Credits</a></h1>
|
||||||
|
<div class="section" id="authors">
|
||||||
|
<h2><a class="toc-backref" href="#id6">Authors</a></h2>
|
||||||
|
<ul class="simple">
|
||||||
|
<li>Therp BV</li>
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
<div class="section" id="contributors">
|
||||||
|
<h2><a class="toc-backref" href="#id7">Contributors</a></h2>
|
||||||
|
<ul class="simple">
|
||||||
|
<li>Holger Brunn <<a class="reference external" href="mailto:hbrunn@therp.nl">hbrunn@therp.nl</a>></li>
|
||||||
|
<li>len-foss <<a class="reference external" href="mailto:nans.lefebvre@gmail.com">nans.lefebvre@gmail.com</a>></li>
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
<div class="section" id="maintainers">
|
||||||
|
<h2><a class="toc-backref" href="#id8">Maintainers</a></h2>
|
||||||
|
<p>This module is maintained by the OCA.</p>
|
||||||
|
<a class="reference external image-reference" href="https://odoo-community.org"><img alt="Odoo Community Association" src="https://odoo-community.org/logo.png" /></a>
|
||||||
|
<p>OCA, or the Odoo Community Association, is a nonprofit organization whose
|
||||||
|
mission is to support the collaborative development of Odoo features and
|
||||||
|
promote its widespread use.</p>
|
||||||
|
<p>This module is part of the <a class="reference external" href="https://github.com/OCA/knowledge/tree/16.0/attachment_indexation_ocr">OCA/knowledge</a> project on GitHub.</p>
|
||||||
|
<p>You are welcome to contribute. To learn how please visit <a class="reference external" href="https://odoo-community.org/page/Contribute">https://odoo-community.org/page/Contribute</a>.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
65
attachment_indexation_ocr/tests/test_document_ocr.py
Normal file
65
attachment_indexation_ocr/tests/test_document_ocr.py
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
# © 2016 Therp BV <http://therp.nl>
|
||||||
|
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
||||||
|
import base64
|
||||||
|
import subprocess
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
|
||||||
|
from odoo.tests.common import TransactionCase
|
||||||
|
|
||||||
|
from ..models.ir_attachment import _MARKER_PHRASE
|
||||||
|
|
||||||
|
|
||||||
|
def _get_some_system_font():
    """Return the path of a font that is available on the system.

    Runs ``fc-list`` and returns the first entry whose line mentions both
    "otf" and "roman" (compared case-insensitively).

    :returns: the part of the matching ``fc-list`` line that precedes the
        first colon; callers use it as the font file path.
    :raises RuntimeError: if no line of the ``fc-list`` output matches.
    """
    output = subprocess.check_output(["fc-list"])
    for raw_line in output.splitlines():
        line = raw_line.decode("utf-8")
        # Lower-case once so both substring checks are case-insensitive.
        lowered = line.lower()
        if "otf" in lowered and "roman" in lowered:
            return line.split(":")[0]
    raise RuntimeError("No suitable font found!")
|
||||||
|
|
||||||
|
|
||||||
|
# Font used to render the test image; resolved once when the module is imported.
font_path = _get_some_system_font()
# System parameter the tests set to "True" (OCR expected immediately) or
# "False" (OCR expected only after the cron has run).
ir_config_parameter_key = "ocr.synchronous"
# Text drawn onto the test image and expected back from the OCR.
result_string = "Hello world"
|
||||||
|
|
||||||
|
|
||||||
|
def _get_image_data(frmt="png"):
    """Render ``result_string`` onto a small image and return it as bytes.

    :param frmt: format name handed to ``Image.save``; the tests in this
        file use "png", "ppm" and "pdf".
    :returns: the content of the saved image as ``bytes``.
    """
    test_image = Image.new("RGB", (200, 30))
    draw = ImageDraw.Draw(test_image)
    draw.text((3, 3), result_string, font=ImageFont.truetype(font_path, 24))
    # Save into an in-memory buffer so no temporary file is needed.
    data = BytesIO()
    test_image.save(data, frmt)
    return data.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
class TestDocumentOcr(TransactionCase):
    """Check that ``ir.attachment`` indexes generated images through OCR."""

    def test_document_ocr_png(self):
        """A PNG indexed with synchronous OCR yields the rendered text."""
        self.env["ir.config_parameter"].set_param(ir_config_parameter_key, "True")
        bin_data = _get_image_data("png")
        result = self.env["ir.attachment"]._index(bin_data, "image/png")
        self.assertEqual(result.strip(), result_string)

    def test_document_ocr_ppm(self):
        """It works on images that don't have a specific mimetype"""
        self.env["ir.config_parameter"].set_param(ir_config_parameter_key, "True")
        bin_data = _get_image_data("ppm")
        result = self.env["ir.attachment"]._index(bin_data, "application/octet-stream")
        self.assertEqual(result.strip(), result_string)

    def test_document_ocr_pdf(self):
        """A PDF indexed with synchronous OCR yields the rendered text."""
        self.env["ir.config_parameter"].set_param(ir_config_parameter_key, "True")
        bin_data = _get_image_data("pdf")
        result = self.env["ir.attachment"]._index(bin_data, "application/pdf")
        self.assertEqual(result.strip(), result_string)

    def test_document_ocr_cron(self):
        """With synchronous OCR off, the text only appears after the cron."""
        self.env["ir.config_parameter"].set_param(ir_config_parameter_key, "False")
        bin_data = _get_image_data("png")
        vals = {"name": "testattachment", "datas": base64.b64encode(bin_data)}
        attachment = self.env["ir.attachment"].create(vals)
        # Until the cron runs, the index only holds the placeholder marker.
        self.assertEqual(attachment.index_content, _MARKER_PHRASE)
        attachment._ocr_cron()
        self.assertEqual(attachment.index_content.strip(), result_string)
|
@ -1,86 +0,0 @@
|
|||||||
.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg
|
|
||||||
:target: http://www.gnu.org/licenses/agpl-3.0-standalone.html
|
|
||||||
:alt: License: AGPL-3
|
|
||||||
|
|
||||||
=================
|
|
||||||
OCR for documents
|
|
||||||
=================
|
|
||||||
|
|
||||||
This module was written to make uploaded documents, for example scans, searchable by running OCR on them.
|
|
||||||
|
|
||||||
It supports all image formats `Pillow supports <http://pillow.readthedocs.io/en/3.2.x/handbook/image-file-formats.html>`_ for reading and PDFs.
|
|
||||||
|
|
||||||
Installation
|
|
||||||
============
|
|
||||||
|
|
||||||
To install this module, you need to:
|
|
||||||
|
|
||||||
#. install tesseract and the language(s) your documents use
|
|
||||||
#. if you want to support OCR on PDFs, install imagemagick
|
|
||||||
#. install the module itself
|
|
||||||
|
|
||||||
On a Debian or Ubuntu system you would typically run::
|
|
||||||
|
|
||||||
$ sudo apt-get install tesseract-ocr imagemagick
|
|
||||||
|
|
||||||
|
|
||||||
Configuration
|
|
||||||
=============
|
|
||||||
|
|
||||||
To configure this module, go to:
|
|
||||||
|
|
||||||
#. Settings/Technical/Parameters/System parameters and review the parameters with names document_ocr.*
|
|
||||||
|
|
||||||
Usage
|
|
||||||
=====
|
|
||||||
|
|
||||||
By default, character recognition is done asynchronously by a cronjob at night.
|
|
||||||
This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish.
|
|
||||||
The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ``Settings``.
|
|
||||||
In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``.
|
|
||||||
|
|
||||||
.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas
|
|
||||||
:alt: Try me on Runbot
|
|
||||||
:target: https://runbot.odoo-community.org/runbot/118/8.0
|
|
||||||
|
|
||||||
Bug Tracker
|
|
||||||
===========
|
|
||||||
|
|
||||||
Bugs are tracked on `GitHub Issues <https://github.com/OCA/knowledge/issues>`_.
|
|
||||||
In case of trouble, please check there if your issue has already been reported.
|
|
||||||
If you spotted it first, help us smash it by providing detailed and welcome feedback.
|
|
||||||
|
|
||||||
Credits
|
|
||||||
=======
|
|
||||||
|
|
||||||
The actual work
|
|
||||||
---------------
|
|
||||||
|
|
||||||
* `tesseract <https://github.com/tesseract-ocr>`_
|
|
||||||
|
|
||||||
Images
|
|
||||||
------
|
|
||||||
|
|
||||||
* Odoo Community Association: `Icon <https://github.com/OCA/maintainer-tools/blob/master/template/module/static/description/icon.svg>`_.
|
|
||||||
|
|
||||||
Contributors
|
|
||||||
------------
|
|
||||||
|
|
||||||
* Holger Brunn <hbrunn@therp.nl>
|
|
||||||
|
|
||||||
Do not contact contributors directly about help with questions or problems concerning this addon, but use the `community mailing list <mailto:community@mail.odoo.com>`_ or the `appropriate specialized mailing list <https://odoo-community.org/groups>`_ for help, and the bug tracker linked in `Bug Tracker`_ above for technical issues.
|
|
||||||
|
|
||||||
Maintainer
|
|
||||||
----------
|
|
||||||
|
|
||||||
.. image:: https://odoo-community.org/logo.png
|
|
||||||
:alt: Odoo Community Association
|
|
||||||
:target: https://odoo-community.org
|
|
||||||
|
|
||||||
This module is maintained by the OCA.
|
|
||||||
|
|
||||||
OCA, or the Odoo Community Association, is a nonprofit organization whose
|
|
||||||
mission is to support the collaborative development of Odoo features and
|
|
||||||
promote its widespread use.
|
|
||||||
|
|
||||||
To contribute to this module, please visit https://odoo-community.org.
|
|
@ -1,98 +0,0 @@
|
|||||||
# © 2016 Therp BV <http://therp.nl>
|
|
||||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
|
||||||
import logging
|
|
||||||
import subprocess
|
|
||||||
|
|
||||||
from PIL import Image
|
|
||||||
from StringIO import StringIO
|
|
||||||
|
|
||||||
from odoo import api, models
|
|
||||||
|
|
||||||
_logger = logging.getLogger(__name__)
|
|
||||||
_MARKER_PHRASE = "[[waiting for OCR]]"
|
|
||||||
|
|
||||||
|
|
||||||
class IrAttachment(models.Model):
    """Extend attachment indexing with OCR through the ``tesseract`` binary."""

    _inherit = "ir.attachment"

    @api.model
    def _index(self, data, datas_fname, file_type):
        """Index an attachment, falling back to OCR when no text was found.

        OCR is only considered when the parent implementation returned no
        content, or the literal content ``"image"``. It runs immediately when
        the system parameter ``document_ocr.synchronous`` equals ``"True"`` or
        the context key ``document_ocr_force`` is set; otherwise the marker
        phrase is stored so that ``_ocr_cron`` can find the record later.

        :returns: tuple ``(mimetype, content)``.
        """
        mimetype, content = super(IrAttachment, self)._index(
            data, datas_fname, file_type
        )
        if data and mimetype and (not content or content == "image"):
            has_synchr_param = (
                self.env["ir.config_parameter"].get_param(
                    "document_ocr.synchronous", "False"
                )
                == "True"
            )
            has_force_flag = self.env.context.get("document_ocr_force")
            if has_synchr_param or has_force_flag:
                content = self._index_ocr(mimetype, data, datas_fname, file_type)
            else:
                content = _MARKER_PHRASE

        return mimetype, content

    @api.model
    def _index_ocr(self, mimetype, data, datas_fname, file_type):
        """Run ``tesseract`` on ``data`` and return the recognised text.

        If this model has a method ``_index_ocr_get_data_<subtype>`` for the
        mimetype's subtype, it is called to produce the image data. Otherwise
        the data is opened with Pillow and re-saved as PNG.

        :returns: the standard output of ``tesseract``, or ``None`` when the
            mimetype contains no ``/`` or Pillow cannot read the data.
        """
        dpi = int(self.env["ir.config_parameter"].get_param("document_ocr.dpi", "500"))
        if "/" not in mimetype:
            _logger.warning("Invalid mimetype %s", mimetype)
            return None
        sub_type = mimetype.split("/", 1)[1]
        # Subtype-specific converters are looked up by naming convention.
        converter_name = "_index_ocr_get_data_%s" % sub_type
        if hasattr(self, converter_name):
            image_data = getattr(self, converter_name)(
                data, datas_fname, file_type, dpi
            )
        else:
            image_data = StringIO()
            try:
                Image.open(StringIO(data)).save(image_data, "png", dpi=(dpi, dpi))
            except IOError:
                _logger.exception("Failed to OCR image")
                return None
        process = subprocess.Popen(
            ["tesseract", "stdin", "stdout"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout, stderr = process.communicate(image_data.getvalue())
        if process.returncode:
            # The failure is logged, but whatever was written to stdout is
            # still returned to the caller.
            _logger.error("Error during OCR: %s", stderr)
        return stdout

    @api.model
    def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi):
        """Convert PDF ``data`` to PNG with the ``convert`` command.

        :returns: a ``StringIO`` wrapping the standard output of ``convert``.
        """
        process = subprocess.Popen(
            ["convert", "-density", str(dpi), "-", "-append", "png32:-"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout, stderr = process.communicate(data)
        if stderr:
            # The conversion goes from PDF to PNG, so say so in the log.
            _logger.error("Error converting PDF to image: %s", stderr)
        return StringIO(stdout)

    @api.model
    def _ocr_cron(self, limit=0):
        """Run OCR on attachments still carrying the marker phrase.

        :param limit: passed to ``search`` as its ``limit`` argument.
        """
        # The context flag makes ``_index`` run OCR regardless of the
        # ``document_ocr.synchronous`` parameter.
        for this in self.with_context(document_ocr_force=True).search(
            [
                ("index_content", "=", _MARKER_PHRASE),
            ],
            limit=limit,
        ):
            if not this.datas:
                continue
            file_type, index_content = this._index(
                this.datas.decode("base64"), this.datas_fname, this.file_type
            )
            this.write(
                {
                    "file_type": file_type,
                    "index_content": index_content,
                }
            )
|
|
@ -1,65 +0,0 @@
|
|||||||
# © 2016 Therp BV <http://therp.nl>
|
|
||||||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
|
|
||||||
from PIL import Image, ImageDraw, ImageFont
|
|
||||||
from StringIO import StringIO
|
|
||||||
|
|
||||||
from odoo.tests.common import TransactionCase
|
|
||||||
from odoo.tools.misc import mute_logger
|
|
||||||
|
|
||||||
from ..models.ir_attachment import _MARKER_PHRASE
|
|
||||||
|
|
||||||
|
|
||||||
class TestDocumentOcr(TransactionCase):
    """Exercise OCR indexing for images, PDFs, the cron and unreadable data."""

    def test_document_ocr(self):
        """Index a generated "Hello world" image through every OCR path."""
        self.env["ir.config_parameter"].set_param("document_ocr.synchronous", "True")
        test_image = Image.new("RGB", (200, 30))
        draw = ImageDraw.Draw(test_image)
        draw.text(
            (3, 3),
            "Hello world",
            font=ImageFont.truetype(
                "/usr/share/fonts/truetype/inconsolata/Inconsolata.otf", 24
            ),
        )
        # test a plain image
        data = StringIO()
        test_image.save(data, "png")
        result = self.env["ir.attachment"]._index(data.getvalue(), "test.png", None)
        self.assertEqual(result[1].strip(), "Hello world")
        # should also work for pdfs if supported, protect against
        # ancient pillows
        if (
            hasattr(Image, "registered_extensions")
            and "PDF" in Image.registered_extensions().values()
        ):
            data = StringIO()
            test_image.save(data, "pdf", resolution=300)
            result = self.env["ir.attachment"]._index(data.getvalue(), "test.pdf", None)
            self.assertEqual(result[1].strip(), "Hello world")
        # check cron
        self.env["ir.config_parameter"].set_param("document_ocr.synchronous", "False")
        attachment = self.env["ir.attachment"].create(
            {
                "name": "testattachment",
                "datas": data.getvalue().encode("base64"),
            }
        )
        self.assertEqual(attachment.index_content, _MARKER_PHRASE)
        attachment._ocr_cron()
        self.assertEqual(attachment.index_content.strip(), "Hello world")
        # and for an unreadable image, we expect an error
        if (
            hasattr(Image, "registered_extensions")
            and "PALM" in Image.registered_extensions().values()
        ):
            self.env["ir.config_parameter"].set_param(
                "document_ocr.synchronous", "True"
            )
            data = StringIO()
            test_image = Image.new("1", (200, 30))
            test_image.save(data, "Palm")
            # The model logs through ``logging.getLogger(__name__)`` and is
            # imported under the ``odoo`` namespace, so that is the logger
            # name to mute.
            with mute_logger("odoo.addons.document_ocr.models.ir_attachment"):
                result = self.env["ir.attachment"]._index(
                    data.getvalue(), "test.palm", None
                )
            self.assertEqual(result[1], None)
|
|
@ -0,0 +1 @@
|
|||||||
|
../../../../attachment_indexation_ocr
|
@ -1 +0,0 @@
|
|||||||
../../../../document_ocr
|
|
Loading…
Reference in New Issue
Block a user