diff --git a/document_ocr/README.rst b/document_ocr/README.rst deleted file mode 100644 index c500f0b1..00000000 --- a/document_ocr/README.rst +++ /dev/null @@ -1,101 +0,0 @@ -.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg - :target: http://www.gnu.org/licenses/agpl-3.0-standalone.html - :alt: License: AGPL-3 - -================= -OCR for documents -================= - -This module was written to make uploaded documents, for example scans, searchable by running OCR on them. - -It supports all image formats `Pillow supports `_ for reading and PDFs. - -Installation -============ - -To install this module, you need to: - -#. install tesseract and the language(s) your documents use -#. if you want to support OCR on PDFs, install imagemagick -#. install the module itself - -On an Debian or Ubuntu system you would typically run:: - - $ sudo apt-get install tesseract-ocr imagemagick - - -Configuration -============= - -To configure this module, go to: - -#. Settings/Technical/Parameters/System parameters and review the parameters with names document_ocr.* - -Usage -===== - -By default, character recognition is done asynchronously by a cronjob at night. -This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish. -The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ` `Settings``. -In case you want to force the OCR to be done immediately, set configuration parameter ``document_ocr.synchronous`` to value ``True``. - - -By default, recognition language is set to english. -In case you want to use a different default, set configuration parameter ``document_ocr.language`` to value respective value ex:``por``, for Portuguese. - - -In PDF case, OCR will run after it will be converted to an image. But OCR will be applied to all PDFs. - - -System parameters used: -#``document_ocr.synchronous``: bool -#``document_ocr.language``: string -#``document_ocr.dpi``: integer -#``document_ocr.quality``: integer - - -.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas - :alt: Try me on Runbot - :target: https://runbot.odoo-community.org/runbot/118/10.0 - -Bug Tracker -=========== - -Bugs are tracked on `GitHub Issues `_. -In case of trouble, please check there if your issue has already been reported. -If you spotted it first, help us smashing it by providing a detailed and welcomed feedback. - -Credits -======= - -The actual work ---------------- - -* `tesseract `_ - -Images ------- - -* Odoo Community Association: `Icon `_. - -Contributors ------------- - -* Holger Brunn - -Do not contact contributors directly about help with questions or problems concerning this addon, but use the `community mailing list `_ or the `appropriate specialized mailinglist `_ for help, and the bug tracker linked in `Bug Tracker`_ above for technical issues. - -Maintainer ----------- - -.. image:: https://odoo-community.org/logo.png - :alt: Odoo Community Association - :target: https://odoo-community.org - -This module is maintained by the OCA. - -OCA, or the Odoo Community Association, is a nonprofit organization whose -mission is to support the collaborative development of Odoo features and -promote its widespread use. - -To contribute to this module, please visit https://odoo-community.org. diff --git a/document_ocr/__init__.py b/document_ocr/__init__.py deleted file mode 100644 index 472456b6..00000000 --- a/document_ocr/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- -# © 2016 Therp BV -# © 2017 ThinkOpen Solutions -# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). -from . import models diff --git a/document_ocr/__manifest__.py b/document_ocr/__manifest__.py deleted file mode 100644 index ad012794..00000000 --- a/document_ocr/__manifest__.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- -# © 2016 Therp BV -# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). -{ - "name": "OCR for Documents", - "version": "10.0.1.0.0", - "author": "Therp BV," - " Odoo Community Association (OCA)," - " ThinkOpen Solutions Brasil", - "license": "AGPL-3", - "category": "Knowledge Management", - "summary": "Run character recognition on uploaded files", - "depends": [ - 'document', - ], - "data": [ - "data/ir_cron.xml", - "data/ir_config_parameter.xml", - "views/ir_attachment_view.xml", - ], - "external_dependencies": { - 'bin': [ - 'tesseract', - 'convert', - ], - }, -} diff --git a/document_ocr/data/ir_config_parameter.xml b/document_ocr/data/ir_config_parameter.xml deleted file mode 100644 index 721a0740..00000000 --- a/document_ocr/data/ir_config_parameter.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - document_ocr.synchronous - False - - - document_ocr.dpi - 300 - - - document_ocr.quality - 100 - - - document_ocr.language - eng - - - diff --git a/document_ocr/data/ir_cron.xml b/document_ocr/data/ir_cron.xml deleted file mode 100644 index f69d151a..00000000 --- a/document_ocr/data/ir_cron.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - Run OCR on uploaded documents - days - 1 - ir.attachment - _ocr_cron - -1 - - - diff --git a/document_ocr/models/__init__.py b/document_ocr/models/__init__.py deleted file mode 100644 index 051b3ddf..00000000 --- a/document_ocr/models/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- -# © 2016 Therp BV -# © 2017 ThinkOpen Solutions -# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). -from . import ir_attachment diff --git a/document_ocr/models/ir_attachment.py b/document_ocr/models/ir_attachment.py deleted file mode 100644 index 18102d52..00000000 --- a/document_ocr/models/ir_attachment.py +++ /dev/null @@ -1,261 +0,0 @@ -# -*- coding: utf-8 -*- -# © 2016 Therp BV -# © 2017 ThinkOpen Solutions -# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). - -import io -import logging -import subprocess -from StringIO import StringIO - -import pyPdf -from odoo import api, fields, models, _ -from odoo.exceptions import UserError - -_logger = logging.getLogger(__name__) -_MARKER_PHRASE = '[[waiting for OCR]]' -OCR_LANGUAGE = [('afr', 'Afrikaans'), - ('amh', 'Amharic'), - ('ara', 'Arabic'), - ('asm', 'Assamese'), - ('aze', 'Azerbaijani'), - ('aze_cyrl', 'Azerbaijani - Cyrilic'), - ('bel', 'Belarusian'), - ('ben', 'Bengali'), - ('bod', 'Tibetan'), - ('bos', 'Bosnian'), - ('bul', 'Bulgarian'), - ('cat', 'Catalan; Valencian'), - ('ceb', 'Cebuano'), - ('ces', 'Czech'), - ('chi_sim', 'Chinese - Simplified'), - ('chi_tra', 'Chinese - Traditional'), - ('chr', 'Cherokee'), - ('cym', 'Welsh'), - ('dan', 'Danish'), - ('dan_frak', 'Danish - Fraktur'), - ('deu', 'German'), - ('deu_frak', 'German - Fraktur'), - ('dzo', 'Dzongkha'), - ('ell', 'Greek, Modern (1453-)'), - ('eng', 'English'), - ('enm', 'English, Middle (1100-1500)'), - ('epo', 'Esperanto'), - ('equ', 'Math / equation detection module'), - ('est', 'Estonian'), - ('eus', 'Basque'), - ('fas', 'Persian'), - ('fin', 'Finnish'), - ('fra', 'French'), - ('frk', 'Frankish'), - ('frm', 'French, Middle (ca.1400-1600)'), - ('gle', 'Irish'), - ('glg', 'Galician'), - ('grc', 'Greek, Ancient (to 1453)'), - ('guj', 'Gujarati'), - ('hat', 'Haitian; Haitian Creole'), - ('heb', 'Hebrew'), - ('hin', 'Hindi'), - ('hrv', 'Croatian'), - ('hun', 'Hungarian'), - ('iku', 'Inuktitut'), - ('ind', 'Indonesian'), - ('isl', 'Icelandic'), - ('ita', 'Italian'), - ('ita_old', 'Italian - Old'), - ('jav', 'Javanese'), - ('jpn', 'Japanese'), - ('kan', 'Kannada'), - ('kat', 'Georgian'), - ('kat_old', 'Georgian - Old'), - ('kaz', 'Kazakh'), - ('khm', 'Central Khmer'), - ('kir', 'Kirghiz; Kyrgyz'), - ('kor', 'Korean'), - ('kur', 'Kurdish'), - ('lao', 'Lao'), - ('lat', 'Latin'), - ('lav', 'Latvian'), - ('lit', 'Lithuanian'), - ('mal', 'Malayalam'), - ('mar', 'Marathi'), - ('mkd', 'Macedonian'), - ('mlt', 'Maltese'), - ('msa', 'Malay'), - ('mya', 'Burmese'), - ('nep', 'Nepali'), - ('nld', 'Dutch; Flemish'), - ('nor', 'Norwegian'), - ('ori', 'Oriya'), - ('osd', 'Orientation and script detection module'), - ('pan', 'Panjabi; Punjabi'), - ('pol', 'Polish'), - ('por', 'Portuguese'), - ('pus', 'Pushto; Pashto'), - ('ron', 'Romanian; Moldavian; Moldovan'), - ('rus', 'Russian'), - ('san', 'Sanskrit'), - ('sin', 'Sinhala; Sinhalese'), - ('slk', 'Slovak'), - ('slk_frak', 'Slovak - Fraktur'), - ('slv', 'Slovenian'), - ('spa', 'Spanish; Castilian'), - ('spa_old', 'Spanish; Castilian - Old'), - ('sqi', 'Albanian'), - ('srp', 'Serbian'), - ('srp_latn', 'Serbian - Latin'), - ('swa', 'Swahili'), - ('swe', 'Swedish'), - ('syr', 'Syriac'), - ('tam', 'Tamil'), - ('tel', 'Telugu'), - ('tgk', 'Tajik'), - ('tgl', 'Tagalog'), - ('tha', 'Thai'), - ('tir', 'Tigrinya'), - ('tur', 'Turkish'), - ('uig', 'Uighur; Uyghur'), - ('ukr', 'Ukrainian'), - ('urd', 'Urdu'), - ('uzb', 'Uzbek'), - ('uzb_cyrl', 'Uzbek - Cyrilic'), - ('vie', 'Vietnamese'), - ('yid', 'Yiddish'), ] - - -class IrAttachment(models.Model): - _inherit = 'ir.attachment' - - language = fields.Selection(OCR_LANGUAGE, 'Language', - default=lambda self: - self.env['ir.config_parameter'].get_param( - 'document_ocr.language', 'eng')) - # We need to redefine index_content field to be able to update it - # on the onchange_language() - index_content = fields.Text('Indexed Content', - readonly=False, - prefetch=False) - index_content_rel = fields.Text(related='index_content', - string='Indexed Content Rel') - - @api.onchange('language') - def onchange_language(self): - process = subprocess.Popen(['tesseract', '--list-langs'], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdout, stderr = process.communicate() - if self.language not in stderr.split('\n'): - raise UserError(_( - "Language not installed." - " Please ask your system administrator to" - " install tesseract '%s' language." % - self.language)) - if self.store_fname: - bin_data = self._file_read(self.store_fname) - else: - bin_data = self.db_datas - if bin_data: - index_content = self._index( - bin_data.decode('base64'), self.datas_fname, self.mimetype) - return {'value': { - 'index_content': index_content}} - return {'value': {}} - - @api.model - def _index(self, bin_data, datas_fname, mimetype): - content = super(IrAttachment, self)._index( - bin_data, datas_fname, mimetype) - if not content or content == 'image': - has_synchr_param = self.env['ir.config_parameter'].get_param( - 'document_ocr.synchronous', 'False') == 'True' - has_force_flag = self.env.context.get('document_ocr_force') - synchr = has_synchr_param or has_force_flag - if synchr: - content = self._index_ocr(bin_data) - else: - content = _MARKER_PHRASE - return content - - def _index_ocr(self, bin_data): - _logger.info('OCR IMAGE "%s"...', self.datas_fname) - process = subprocess.Popen( - ['tesseract', 'stdin', 'stdout', '-l', self.language], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - stdout, stderr = process.communicate(bin_data) - if stderr: - _logger.error('Error during OCR: %s', stderr) - return stdout - - def _index_pdf(self, bin_data): - - def convert_bin_to_image(self, bin_data): - dpi = int(self.env['ir.config_parameter'].get_param( - 'document_ocr.dpi', '500')) - quality = int(self.env['ir.config_parameter'].get_param( - 'document_ocr.quality', '100')) - process = subprocess.Popen( - ['convert', '-density', str(dpi), - '-quality', str(quality), - '-', '-append', 'png32:-'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdout, stderr = process.communicate(bin_data) - if stderr: - _logger.error('Error converting PDF to image: %s', stderr) - return stdout - - def _convert_pdf_page_to_image(self, pdf, pagenum): - dst_pdf = pyPdf.PdfFileWriter() - dst_pdf.addPage(pdf.getPage(pagenum)) - pdf_bytes = io.BytesIO() - dst_pdf.write(pdf_bytes) - pdf_bytes.seek(0) - return convert_bin_to_image(self, pdf_bytes.read()) - - has_synchr_param = self.env['ir.config_parameter'].get_param( - 'document_ocr.synchronous', 'False') == 'True' - has_force_flag = self.env.context.get('document_ocr_force') - synchr = has_synchr_param or has_force_flag - if synchr: - buf = super(IrAttachment, self)._index_pdf(bin_data) - if len(buf.split('\n')) < 2 and bin_data.startswith('%PDF-'): - # If we got less than 2 lines, - # run OCR anyway and append to existent text - try: - f = StringIO(bin_data) - pdf = pyPdf.PdfFileReader(f) - if pdf.getNumPages() > 1: - for pagenum in range(0, pdf.getNumPages()): - _logger.info('OCR PDF "%s" page %d/%d...', - self.datas_fname, - pagenum + 1, - pdf.getNumPages()) - pdf_image = _convert_pdf_page_to_image(self, pdf, - pagenum) - index_content = self._index_ocr(pdf_image) - buf = u'%s\n-- %d --\n%s' % ( - buf, pagenum + 1, index_content.decode('utf8')) - else: - pdf_image = convert_bin_to_image(self, bin_data) - index_content = self._index_ocr(pdf_image) - buf = u'%s\n%s' % (buf, index_content.decode('utf8')) - except Exception as e: - _logger.error('Error converting PDF to image: %s', e) - pass - else: - buf = _MARKER_PHRASE - return buf - - @api.model - def _ocr_cron(self): - for this in self.with_context(document_ocr_force=True).search( - [('index_content', '=', _MARKER_PHRASE)]): - if not this.datas: - continue - index_content = this._index( - this.datas.decode('base64'), this.datas_fname, this.mimetype) - this.write({ - 'index_content': index_content, - }) diff --git a/document_ocr/static/description/icon.png b/document_ocr/static/description/icon.png deleted file mode 100644 index 3a0328b5..00000000 Binary files a/document_ocr/static/description/icon.png and /dev/null differ diff --git a/document_ocr/tests/__init__.py b/document_ocr/tests/__init__.py deleted file mode 100644 index 7efb2857..00000000 --- a/document_ocr/tests/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- -# © 2016 Therp BV -# © 2017 ThinkOpen Solutions -# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). -from . import test_document_ocr diff --git a/document_ocr/tests/test_document_ocr.py b/document_ocr/tests/test_document_ocr.py deleted file mode 100644 index 1d1a5490..00000000 --- a/document_ocr/tests/test_document_ocr.py +++ /dev/null @@ -1,62 +0,0 @@ -# -*- coding: utf-8 -*- -# © 2016 Therp BV -# © 2017 ThinkOpen Solutions -# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). -from StringIO import StringIO - -from PIL import Image, ImageDraw, ImageFont -from PIL import PdfImagePlugin, PalmImagePlugin # noqa # pylint: disable=unused-import -from odoo.tests.common import TransactionCase - -from ..models.ir_attachment import _MARKER_PHRASE - - -class TestDocumentOcr(TransactionCase): - def test_document_ocr(self): - self.env['ir.config_parameter'].set_param( - 'document_ocr.synchronous', 'True') - test_image = Image.new('RGB', (200, 30)) - draw = ImageDraw.Draw(test_image) - draw.text((3, 3), "Hello world", font=ImageFont.truetype( - '/usr/share/fonts/truetype/inconsolata/Inconsolata.otf', 24)) - # test a plain image - data = StringIO() - test_image.save(data, 'png') - attachment = self.env['ir.attachment'].create({ - 'name': 'testattachment', - 'datas_fname': 'test_png.pdf'}) - result = attachment._index( - data.getvalue(), 'test.png', None) - self.assertEqual(result.strip(), 'Hello world') - # should also work for pdfs - data = StringIO() - test_image.save(data, 'pdf', resolution=300) - attachment = self.env['ir.attachment'].create({ - 'name': 'testattachment', - 'datas_fname': 'test_pdf.pdf'}) - result = attachment._index( - data.getvalue(), 'test.pdf', None) - self.assertEqual(result.strip(), 'Hello world') - # check cron - self.env['ir.config_parameter'].set_param( - 'document_ocr.synchronous', 'False') - attachment = self.env['ir.attachment'].create({ - 'name': 'testattachment', - 'datas_fname': 'test_cron.pdf', - 'datas': data.getvalue().encode('base64'), - }) - self.assertEqual(attachment.index_content, _MARKER_PHRASE) - attachment._ocr_cron() - self.assertEqual(attachment.index_content.strip(), 'Hello world') - # and for an unreadable image, we expect an empty string - self.env['ir.config_parameter'].set_param( - 'document_ocr.synchronous', 'True') - data = StringIO() - test_image = Image.new('1', (200, 30)) - test_image.save(data, 'palm') - attachment = self.env['ir.attachment'].create({ - 'name': 'testattachment', - 'datas_fname': 'test_err.palm'}) - result = attachment._index( - data.getvalue(), 'test.palm', None) - self.assertEqual(result, '') diff --git a/document_ocr/views/ir_attachment_view.xml b/document_ocr/views/ir_attachment_view.xml deleted file mode 100644 index ed171d61..00000000 --- a/document_ocr/views/ir_attachment_view.xml +++ /dev/null @@ -1,43 +0,0 @@ - - - - - ir.attachment - - - - 1 - - - - - - - - - - - - - - ir.attachment - - - - - - - - - ir.attachment - - - - - - - - - - -