knowledge/document_ocr/models/ir_attachment.py

88 lines
3.3 KiB
Python

# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
import logging
import subprocess
from PIL import Image
from StringIO import StringIO
from openerp import api, models
# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)
# Placeholder stored as the index content of an attachment whose OCR has been
# deferred; IrAttachment._ocr_cron searches for exactly this value to find
# the attachments it still has to process.
_MARKER_PHRASE = '[[waiting for OCR]]'
class IrAttachment(models.Model):
    """Add OCR-based content indexing to ``ir.attachment``.

    When the inherited indexer yields no text (or only the value
    ``'image'``), the attachment is either run through the external
    ``tesseract`` program right away or tagged with ``_MARKER_PHRASE`` so
    that ``_ocr_cron`` can process it later.
    """
    _inherit = 'ir.attachment'

    @api.model
    def _index(self, data, datas_fname, file_type):
        """Index an attachment, falling back to OCR when no text was found.

        :param data: raw (already decoded) file content
        :param datas_fname: file name of the attachment
        :param file_type: file type as passed on to the inherited ``_index``
        :return: ``(mimetype, content)`` tuple; ``content`` is the result of
            the inherited indexer, the OCR output, or ``_MARKER_PHRASE``
            when OCR is deferred
        """
        mimetype, content = super(IrAttachment, self)._index(
            data, datas_fname, file_type)
        if data and mimetype and (not content or content == 'image'):
            has_synchr_param = self.env['ir.config_parameter'].get_param(
                'document_ocr.synchronous', 'False') == 'True'
            has_force_flag = self.env.context.get('document_ocr_force')
            if has_synchr_param or has_force_flag:
                content = self._index_ocr(mimetype, data, datas_fname,
                                          file_type)
            else:
                # Defer the OCR run: _ocr_cron searches for this marker.
                content = _MARKER_PHRASE
        return mimetype, content

    @api.model
    def _index_ocr(self, mimetype, data, datas_fname, file_type):
        """Run ``tesseract`` on the given file content and return its output.

        If this model has a method named ``_index_ocr_get_data_<subtype>``
        (``<subtype>`` being the part of ``mimetype`` after the first
        ``/``), it is called to produce the image handed to tesseract.
        Otherwise the data is opened with PIL and re-encoded as PNG.

        :param mimetype: mimetype of the data, expected as ``type/subtype``
        :param data: raw file content
        :param datas_fname: file name, passed through to the converter
        :param file_type: file type, passed through to the converter
        :return: whatever tesseract wrote to stdout, or ``None`` if the
            mimetype contains no ``/`` or PIL could not handle the data
        """
        dpi = int(
            self.env['ir.config_parameter'].get_param(
                'document_ocr.dpi', '500'))
        if '/' not in mimetype:
            _logger.warning('Invalid mimetype %s', mimetype)
            return None
        sub_type = mimetype.split('/', 1)[1]
        # Single lookup of the optional mimetype-specific converter.
        converter = getattr(
            self, '_index_ocr_get_data_%s' % sub_type, None)
        if converter is not None:
            image_data = converter(data, datas_fname, file_type, dpi)
        else:
            image_data = StringIO()
            try:
                Image.open(StringIO(data)).save(
                    image_data, 'png', dpi=(dpi, dpi))
            except IOError:
                _logger.exception('Failed to OCR image')
                return None
        process = subprocess.Popen(
            ['tesseract', 'stdin', 'stdout'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout, stderr = process.communicate(image_data.getvalue())
        if process.returncode:
            # Best effort: log the failure but still return what was read.
            _logger.error('Error during OCR: %s', stderr)
        return stdout

    @api.model
    def _index_ocr_get_data_pdf(self, data, datas_fname, file_type, dpi):
        """Convert PDF data to one PNG image using the ``convert`` program.

        :param data: raw PDF content, fed to ``convert`` on stdin
        :param datas_fname: file name (unused here)
        :param file_type: file type (unused here)
        :param dpi: value passed to ``convert`` as ``-density``
        :return: ``StringIO`` wrapping whatever ``convert`` wrote to stdout
        """
        process = subprocess.Popen(
            ['convert', '-density', str(dpi), '-', '-append', 'png32:-'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout, stderr = process.communicate(data)
        if stderr:
            _logger.error('Error converting PDF to image: %s', stderr)
        return StringIO(stdout)

    @api.model
    def _ocr_cron(self, limit=0):
        """Run OCR on attachments whose index content is the marker phrase.

        :param limit: passed as ``limit`` to ``search`` when selecting the
            attachments to process
        """
        # The context flag makes _index perform OCR immediately instead of
        # writing the marker phrase again.
        pending = self.with_context(document_ocr_force=True).search([
            ('index_content', '=', _MARKER_PHRASE),
        ], limit=limit)
        for this in pending:
            if not this.datas:
                continue
            file_type, index_content = this._index(
                this.datas.decode('base64'), this.datas_fname,
                this.file_type)
            this.write({
                'file_type': file_type,
                'index_content': index_content,
            })