def process(self, element):
    """Return every unordered pair of distinct significant words in an element.

    The text is taken from ``element.properties['text'].string_value``. It is
    split into runs of ASCII letters and apostrophes, lower-cased,
    de-duplicated, stripped of stopwords and single letters, and sorted.
    Every 2-combination of the remaining words is returned.

    Args:
        element: Object exposing a ``properties`` mapping whose ``'text'``
            entry (if present) has a ``string_value`` attribute.
            NOTE(review): looks like a Datastore-style entity -- confirm
            against the caller.

    Returns:
        A list of ``(word_a, word_b)`` tuples in lexicographic order, with
        ``word_a < word_b``. The list is empty when the text is missing,
        empty, or contains fewer than two significant words.
    """
    import itertools

    content_value = element.properties.get('text', None)
    # A missing property, a falsy property value, or a ``string_value`` of
    # None all degrade to empty text instead of raising in re.findall.
    text_line = (content_value.string_value or '') if content_value else ''

    words = {x.lower() for x in re.findall(r'[A-Za-z\']+', text_line)}

    # You can add more stopwords if you want. These words are not included
    # in the analysis.
    stopwords = {
        'a', 'amp', 'an', 'and', 'are', 'as', 'at', 'be', 'been',
        'but', 'by', 'co', 'do', 'for', 'has', 'have', 'he', 'her', 'his',
        'https', 'if', 'in', 'is', 'it', 'me', 'my', 'no', 'not', 'of', 'on',
        'or', 'rt', 's', 'she', 'so', 't', 'than', 'that', 'the', 'they',
        'this', 'to', 'us', 'was', 'we', 'what', 'with', 'you', 'your',
        'who', 'when', 'via',
    }
    # Every single lowercase ASCII letter ('a'..'z') is a stopword as well.
    stopwords.update(map(chr, range(97, 123)))

    # Sorting first makes both the pair order and the order inside each
    # pair deterministic.
    pruned_words = sorted(words - stopwords)
    return list(itertools.combinations(pruned_words, 2))
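# Comment list  (presumably a web-page navigation label left over from extraction; not code)
# Table of contents  (presumably a web-page navigation label left over from extraction; not code)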