[ENH] Add module document_rtf_index.

This commit is contained in:
Ronald Portier 2016-11-03 16:36:24 +01:00
parent da0c01345b
commit 2f7bf15718
No known key found for this signature in database
GPG Key ID: A181F8124D7101D3
8 changed files with 15179 additions and 0 deletions

View File

@ -0,0 +1,37 @@
Index rtf documents
===================
Indexing rtf documents can take a long time, especially when they contain
images. This module extracts only the text content of rtf documents and
indexes that text.
Usage
=====
Just installing the module will register the rtf indexer. No further user
action or configuration is required.
Credits
=======
Contributors
------------
* Ronald Portier <ronald@therp.nl>
* Icon courtesy of http://www.picol.org (refresh.svg) and
https://github.com/odoo/odoo/blob/8.0/addons/knowledge/static/description/icon.png
Maintainer
----------
.. image:: http://odoo-community.org/logo.png
:alt: Odoo Community Association
:target: http://odoo-community.org
This module is maintained by the OCA.
OCA, or the Odoo Community Association, is a nonprofit organization whose
mission is to support the collaborative development of Odoo features and
promote its widespread use.
To contribute to this module, please visit http://odoo-community.org.

View File

@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import std_index

View File

@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
# Addon manifest: a single dict literal describing the module.
{
    "name": "Index rtf documents",
    "version": "8.0.1.0.0",
    "author": "Therp BV, Odoo Community Association (OCA)",
    "license": "AGPL-3",
    "category": "Knowledge Management",
    "summary": "Index rtf documents",
    # The indexer code imports from openerp.addons.document, so that addon
    # must be present.
    "depends": [
        'document',
    ],
    # No data files are listed for this module.
    "data": [],
    "auto_install": False,
    "installable": True,
    "application": False,
    # The rtf reader and plaintext writer used by the indexer are imported
    # from the 'pyth' python package.
    "external_dependencies": {
        'python': [
            'pyth',
        ],
    },
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

View File

@ -0,0 +1,61 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
import StringIO
from pyth import document
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.plaintext.writer import PlaintextWriter
from openerp.addons.document.content_index import indexer, cntIndex
from openerp.addons.document.std_index import _to_unicode
def improved_paragraph(self, paragraph, prefix=""):
    """Write one paragraph as plain text, leaving out embedded images.

    Meant to be used as the ``paragraph`` method of ``PlaintextWriter``:
    it joins the text of every item in the paragraph, skips image items,
    and writes the result line by line to ``self.target``.

    :param paragraph: object whose ``content`` is an iterable of items;
        every non-image item must itself have a ``content`` that can be
        joined into a unicode string.
    :param prefix: string written in front of the first output line. When
        it is non-empty, the following lines get a single space instead.
    """
    fragments = []
    for item in paragraph.content:
        # Begin patch =========\
        # isinstance() (rather than an exact class comparison) makes sure
        # subclasses of Image are skipped as well.
        if isinstance(item, document.Image):
            continue
        # End patch ===========/
        fragments.append(u"".join(item.content))
    content = u"".join(fragments).encode("utf-8")
    for line in content.split("\n"):
        self.target.write(" " * self.indent)
        self.target.write(prefix)
        self.target.write(line)
        self.target.write("\n")
        # Only the first line carries the real prefix.
        if prefix:
            prefix = " "
# Monkeypatch: replace the paragraph method on the PlaintextWriter class
# itself with the image-ignoring version defined above, so every use of
# PlaintextWriter after this module is imported gets the patched method.
PlaintextWriter.paragraph = improved_paragraph
class RtfDoc(indexer):
    """Content indexer that reduces RTF files to their plain text."""

    def _getMimeTypes(self):
        """Return the mime types this indexer handles."""
        return [
            'application/rtf', 'application/x-rtf',
            'text/rtf', 'text/richtext',
        ]

    def _getExtensions(self):
        """Return the file extensions this indexer handles."""
        return ['.rtf']

    def _doIndexContent(self, content):
        """Return the text of the rtf data in ``content`` as unicode."""
        rtf_stream = StringIO.StringIO(content)
        # The reader yields a pyth.document.Document.
        parsed_document = Rtf15Reader.read(rtf_stream)
        rtf_stream.close()
        # The writer hands back a string buffer; take its full value.
        plain_text = PlaintextWriter.write(parsed_document).getvalue()
        return _to_unicode(plain_text)
# Register an RtfDoc instance with the content index at import time, which
# is why just loading this module is enough to enable rtf indexing.
cntIndex.register(RtfDoc())

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
from . import test_rtf_index

View File

@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
# © 2016 Therp BV <http://therp.nl>
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
import base64
from openerp.tests.common import TransactionCase
from openerp.modules.module import get_module_resource
class TestIndexRtf(TransactionCase):
    """Check that rtf attachments are indexed on their text only."""

    def test_index_rtf(self):
        """Test if the indexer indexes just the text in rtf documents."""
        # we do this to avoid error messages about word files in demo data
        self.env['ir.attachment'].search([]).unlink()
        # Now take rather large rtf test file, with only few actual words:
        rtf_path = get_module_resource(
            'document_rtf_index',
            'test_files',
            'test_with_cat_image.rtf'
        )
        # Read through a context manager so the file handle is always
        # closed, and encode with the base64 module imported by this file.
        with open(rtf_path, 'rb') as rtf_handle:
            rtf_file = base64.b64encode(rtf_handle.read())
        att1 = self.env['ir.attachment'].create({
            'name': 'test_with_cat_image.rtf',
            'datas_fname': 'test_with_cat_image.rtf',
            'datas': rtf_file,
        })
        self.assertEqual(att1.file_type, 'application/rtf')
        self.assertEqual(att1.index_content[:16], 'Hello rtf world!')