pipe.py 文件源码-python代码片段

pipe.py 文件源码

python

阅读 33 收藏 0 点赞 0 评论 0

项目：gae-dataflow 作者: amygdala 项目源码文件源码

def process(self, element):
    """Return every unordered pair of distinct, non-stopword words in an element.

    The text is read from ``element.properties['text'].string_value``.  A
    missing (or falsy) ``text`` property is treated as empty text.

    Args:
      element: object with a dict-like ``properties`` attribute whose optional
        ``'text'`` entry exposes a ``string_value`` string.

    Returns:
      A list of 2-tuples ``(word_a, word_b)`` of lower-cased words with
      ``word_a < word_b``, in lexicographic order.  The list is empty when
      fewer than two words survive stopword filtering.
    """
    import itertools

    content_value = element.properties.get('text', None)
    text_line = content_value.string_value if content_value else ''

    # Tokens are runs of letters and apostrophes.  Apostrophes are kept inside
    # a word (contractions such as "don't") but stripped from its ends, where
    # they are quotation marks: otherwise "'the'" would slip past the stopword
    # filter and be counted separately from "the".
    words = {token.strip("'").lower()
             for token in re.findall(r'[A-Za-z\']+', text_line)}
    # A token made only of apostrophes strips down to the empty string.
    words.discard('')

    # You can add more stopwords if you want.  These words are not included
    # in the analysis.
    stopwords = {
        'a', 'amp', 'an', 'and', 'are', 'as', 'at', 'be', 'been',
        'but', 'by', 'co', 'do', 'for', 'has', 'have', 'he', 'her', 'his',
        'https', 'if', 'in', 'is', 'it', 'me', 'my', 'no', 'not', 'of', 'on',
        'or', 'rt', 's', 'she', 'so', 't', 'than', 'that', 'the', 'they',
        'this', 'to', 'us', 'was', 'we', 'what', 'with', 'you', 'your',
        'who', 'when', 'via'}
    # Every single letter 'a'..'z' is a stopword as well.
    stopwords.update(map(chr, range(97, 123)))

    # Sorting first makes each pair canonical: (a, b) with a < b, never (b, a).
    pruned_words = sorted(words - stopwords)
    return list(itertools.combinations(pruned_words, 2))