8.0 document rtf index (#110)

* [ENH] Add module document_rtf_index.

* [FIX] Improvements after review.

- Only load indexer when module is installed;
- Protect non standard imports;
- Standardized README.rst.
This commit is contained in:
Ronald Portier 2016-11-14 09:25:13 +01:00 committed by Holger Brunn
parent da0c01345b
commit bae6af55d3
11 changed files with 15221 additions and 0 deletions

1
.gitattributes vendored Normal file
View File

@ -0,0 +1 @@
*.rtf -diff

View File

@ -35,6 +35,7 @@ install:
- git clone --depth=1 https://github.com/OCA/maintainer-quality-tools.git ${HOME}/maintainer-quality-tools
- export PATH=${HOME}/maintainer-quality-tools/travis:${PATH}
- pip install --upgrade paramiko
- pip install --upgrade pyth
- travis_install_nightly
script:

View File

@ -0,0 +1,58 @@
.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg
:target: http://www.gnu.org/licenses/agpl-3.0-standalone.html
:alt: License: AGPL-3
Index rtf documents
===================
Indexing rtf documents can take a long time, especially when they contain
images. This module extracts only the text content of rtf documents and
indexes that text.
Usage
=====
Just installing the module will register the rtf indexer. No further user
action or configuration required.
.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas
:alt: Try me on Runbot
:target: https://runbot.odoo-community.org/runbot/knowledge/8.0
.. repo_id is available in https://github.com/OCA/maintainer-tools/blob/master/tools/repos_with_ids.txt
.. branch is "8.0" for example
Bug Tracker
===========
Bugs are tracked on
`GitHub Issues <https://github.com/OCA/knowledge/issues>`_.
In case of trouble, please check there if your issue has already been
reported. If you spotted it first, help us smash it by providing
detailed and welcome feedback.
Credits
=======
Contributors
------------
* Ronald Portier <ronald@therp.nl>
* Icon courtesy of https://www.picol.org (refresh.svg) and
https://github.com/odoo/odoo/blob/8.0/addons/knowledge/static/description/icon.png
Maintainer
----------
.. image:: https://odoo-community.org/logo.png
:alt: Odoo Community Association
:target: https://odoo-community.org
This module is maintained by the OCA.
OCA, or the Odoo Community Association, is a nonprofit organization whose
mission is to support the collaborative development of Odoo features and
promote its widespread use.
To contribute to this module, please visit https://odoo-community.org.

View File

@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import models

View File

@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
{
    # Module identification.
    "name": "Index rtf documents",
    "version": "8.0.1.0.0",
    "author": "Therp BV, Odoo Community Association (OCA)",
    "license": "AGPL-3",
    "category": "Knowledge Management",
    "summary": "Index rtf documents",
    # Addons that must be installed before this one.
    "depends": ["document"],
    # This module ships no data files.
    "data": [],
    "auto_install": False,
    "installable": True,
    "application": False,
    # Python packages that must be importable for this module.
    "external_dependencies": {
        "python": ["pyth"],
    },
}

View File

@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import ir_attachment

View File

@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
import logging
from openerp import api, models
_logger = logging.getLogger(__name__)
class IrAttachment(models.Model):
    """Extend ``ir.attachment`` to register an indexer for RTF files."""

    _inherit = 'ir.attachment'

    @api.noguess
    def _register_hook(self, cr):
        """Only register our indexer if module is installed.

        Patches ``PlaintextWriter.paragraph`` so images are skipped and
        registers an ``RtfDoc`` indexer with the document module's
        ``cntIndex``. When the ``pyth`` library cannot be imported, a
        warning is logged and no indexer is registered.

        :param cr: database cursor, passed through to the parent hook.
        :return: whatever the parent ``_register_hook`` returns.
        """
        # Keep the hook chain intact for other ir.attachment extensions.
        result = super(IrAttachment, self)._register_hook(cr)

        import StringIO
        try:
            from pyth import document
            from pyth.plugins.rtf15.reader import Rtf15Reader
            from pyth.plugins.plaintext.writer import PlaintextWriter
        except ImportError:
            _logger.warning("pyth not found, RTF indexing disabled.")
            return result
        from openerp.addons.document.content_index import indexer, cntIndex
        from openerp.addons.document.std_index import _to_unicode

        def improved_paragraph(self, paragraph, prefix=""):
            """Override method to insert image ignoring code.

            Writes the text of ``paragraph`` to ``self.target`` line by
            line, indented by ``self.indent`` spaces. ``prefix`` is written
            in front of the first line; following lines get a blank prefix.
            """
            content = []
            for text in paragraph.content:
                # Begin patch =========\
                # Images carry no indexable text, so leave them out.
                if isinstance(text, document.Image):
                    continue
                # End patch ===========/
                content.append(u"".join(text.content))
            content = u"".join(content).encode("utf-8")

            for line in content.split("\n"):
                self.target.write("  " * self.indent)
                self.target.write(prefix)
                self.target.write(line)
                self.target.write("\n")
                if prefix:
                    # Only the first line carries the real prefix.
                    prefix = "  "

        PlaintextWriter.paragraph = improved_paragraph

        class RtfDoc(indexer):
            """Index Rich Text Format (RTF) files."""

            def _getMimeTypes(self):
                """Return the mime types handled by this indexer."""
                return [
                    'application/rtf',
                    'application/x-rtf',
                    'text/rtf',
                    'text/richtext',
                ]

            def _getExtensions(self):
                """Return the file extensions handled by this indexer."""
                return [
                    '.rtf',
                ]

            def _doIndexContent(self, content):
                """Just get text contents of rtf file."""
                stream = StringIO.StringIO(content)
                try:
                    # Expected to yield a pyth.document.Document.
                    rtf_document = Rtf15Reader.read(stream)
                finally:
                    # Release the buffer even when parsing fails.
                    stream.close()
                # Expected to yield a cStringIO.StringO holding plain text.
                plain_text = PlaintextWriter.write(rtf_document)
                return _to_unicode(plain_text.getvalue())

        cntIndex.register(RtfDoc())
        return result

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import test_rtf_index

View File

@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from openerp.tests.common import TransactionCase
from openerp.modules.module import get_module_resource
class TestIndexRtf(TransactionCase):
    """Check that rtf attachments are indexed on their text content."""

    def test_index_rtf(self):
        """Test if the indexer indexes just the text in rtf documents."""
        attachment_model = self.env['ir.attachment']
        # Force loading of indexer (normally _register_hook runs after tests)
        attachment_model._register_hook(self.env.cr)
        # we do this to avoid error messages about word files in demo data
        attachment_model.search([]).unlink()
        # Now take rather large rtf test file, with only few actual words:
        rtf_path = get_module_resource(
            'document_rtf_index',
            'test_files',
            'test_with_cat_image.rtf'
        )
        # Read through a context manager so the file handle is always closed.
        with open(rtf_path, 'rb') as rtf_handle:
            rtf_file = rtf_handle.read().encode('base64')
        att1 = self.env['ir.attachment'].create({
            'name': 'test_with_cat_image.rtf',
            'datas_fname': 'test_with_cat_image.rtf',
            'datas': rtf_file,
        })
        self.assertEqual(att1.file_type, 'application/rtf')
        self.assertEqual(att1.index_content[:16], 'Hello rtf world!')