Commit e404d186 authored by Clément Schreiner's avatar Clément Schreiner
Browse files

Use a simple regexp to extract words from a text, instead of the previous stupid clean_word method.

parent 9b96745c
......@@ -29,6 +29,7 @@
# OTHER DEALINGS IN THE SOFTWARE.
import xapian
import re
"""
Classes for accessing apt-xapian-index
......@@ -46,20 +47,14 @@ class XapianQuery(object):
self.terms = []
@classmethod
def _clean_word(cls, word):
"""
Remove unwanted characters from a word and make it lowercase.
"""
return word.strip(':;,-.*\n').replace('<br />', '').lower()
@classmethod
def tokenize(cls, text):
    """
    Takes a string and returns a list of words ready for a xapian
    query.

    The text is lowercased and every maximal run of word characters
    (letters, digits and underscore) becomes one item of the result.
    Punctuation and whitespace only act as separators, so they never
    show up in the list and no empty strings are produced. An empty
    text gives an empty list.
    """
    # A single regexp pass replaces the former split-on-space approach,
    # which returned early and left this code unreachable.
    regexp = r'\w+'
    return re.findall(regexp, text.lower())
def add_words(self, word_list):
"""
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment