python类PorterStemmer()的实例源码

porter.py 文件源码 项目:Price-Comparator 作者: Thejas-1 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def __repr__(self):
        """Return the fixed display string used for every stemmer instance."""
        return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function
knock72.py 文件源码 项目:100knock2016 作者: tmu-nlp 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def mk_feature():
    """
    Build a stem-frequency table from the file ``sentiment.txt``.

    Every line is split on single spaces. The first field is skipped and each
    remaining token is stemmed with the Porter stemmer. Stems that appear in
    the stop list (English stop words plus a few punctuation and whitespace
    tokens) are not counted. Finally, stems counted fewer than 5 times are
    removed.

    Returns:
        A ``defaultdict`` mapping each retained stem to its count; missing
        keys default to 0.
    """
    # A set gives constant-time membership tests inside the per-token loop.
    stoplist = set(stopwords.words("english"))
    stoplist.update([",", ".", "!", "?", ";", ":", "\n", "\t", "(", ")", " ", ""])
    stemmer = stem.PorterStemmer()
    d = defaultdict(int)

    # The context manager closes the file even if stemming raises.
    with open("sentiment.txt", "r") as infile:
        for line in infile:
            for item in line.strip("\n").split(" ")[1:]:
                item = stemmer.stem(item)
                if item not in stoplist:
                    d[item] += 1

    # Collect the rare stems first: a dict must not change size while iterated.
    rare = [key for key, value in d.items() if value < 5]
    for key in rare:
        del d[key]
    return d
porter.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def __repr__(self):
        """Return the fixed display string used for every stemmer instance."""
        return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function
porter.py 文件源码 项目:neighborhood_mood_aws 作者: jarrellmark 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def __repr__(self):
        """Return the fixed display string used for every stemmer instance."""
        return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function
porter.py 文件源码 项目:hate-to-hugs 作者: sdoran35 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def demo():
    """
    Run the Porter stemmer over a Penn Treebank sample and print the outcome.

    The words of the first three treebank files are stemmed; the original
    words and then the stems are printed, each word-wrapped and introduced
    by a banner line.
    """

    from nltk.corpus import treebank
    from nltk import stem

    porter = stem.PorterStemmer()

    # Gather every word of the first three files, in corpus order.
    words = [
        word
        for fileid in treebank.fileids()[:3]
        for (word, tag) in treebank.tagged_words(fileid)
    ]
    stems = [porter.stem(word) for word in words]

    def wrapped(tokens):
        # Join the tokens with spaces, then break the text into lines.
        return re.sub(r"(.{,70})\s", r'\1\n', ' '.join(tokens)+' ').rstrip()

    def banner(title):
        # Centre the title in a 70-column rule made of asterisks.
        return title.center(70).replace(' ', '*').replace('-', ' ')

    results = wrapped(stems)
    original = wrapped(words)

    # Print the results.
    print(banner('-Original-'))
    print(original)
    print(banner('-Results-'))
    print(results)
    print('*'*70)
knock72.py 文件源码 项目:100knock2017 作者: tmu-nlp 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def make_feature(f, flag):
    """
    Accumulate per-stem feature weights from a tab-separated labelled file.

    Each line is split on a tab into an integer label and a text. The text is
    split on whitespace, words rejected by ``stop_word_check`` are skipped,
    and the remaining words are Porter-stemmed. The line index is printed as
    a progress trace.

    Args:
        f: NOTE(review): never read. The name is immediately rebound to the
            handle opened from ``file_name``, which is resolved from an
            enclosing scope not visible here. Confirm whether ``f`` was
            meant to be the path.
        flag: 0 adds 1 per word occurrence regardless of label. 1 adds 1 for
            words on label ``1`` lines and subtracts 1 for words on label
            ``-1`` lines. Any other value counts nothing.

    Returns:
        A ``defaultdict(int)`` mapping stem to accumulated weight.
    """
    # One stemmer serves every word; building a new one per word was
    # loop-invariant work.
    stemmer = stem.PorterStemmer()
    with open(file_name) as f:
        feature = defaultdict(int)
        for i,line in enumerate(f):
            print(i)
            y,x = line.split('\t')
            y = int(y)
            words = x.split()
            for word in words:
                # Explicit comparison kept so a helper returning None is
                # still treated as "do not use this word".
                if stop_word_check(word) == False:
                    word = stemmer.stem(word)
                    if flag == 0:
                        feature[word] += 1
                    elif flag == 1:
                        if y == 1:
                            feature[word] += 1
                        elif y == -1:
                            feature[word] -= 1
    return feature
porter.py 文件源码 项目:FancyWord 作者: EastonLee 项目源码 文件源码 阅读 33 收藏 0 点赞 0 评论 0
def __repr__(self):
        """Return the fixed display string used for every stemmer instance."""
        return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function
porter.py 文件源码 项目:beepboop 作者: nicolehe 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def __repr__(self):
        """Return the fixed display string used for every stemmer instance."""
        return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function
porter.py 文件源码 项目:kind2anki 作者: prz3m 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def __repr__(self):
        """Return the fixed display string used for every stemmer instance."""
        return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function
porter.py 文件源码 项目:but_sentiment 作者: MixedEmotions 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def __repr__(self):
        """Return the fixed display string used for every stemmer instance."""
        return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function
word_stemmer.py 文件源码 项目:allennlp 作者: allenai 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def __init__(self):
        """Create the wrapper and attach a fresh ``NltkPorterStemmer`` as ``self.stemmer``."""
        self.stemmer = NltkPorterStemmer()
porter.py 文件源码 项目:Price-Comparator 作者: Thejas-1 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.

    The words of the first three treebank files are stemmed; the original
    words and then the stems are printed, each word-wrapped and introduced
    by a banner line.
    """

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    # ``fileids()`` is the corpus-reader accessor; the older ``files()``
    # spelling is no longer provided by current NLTK releases.
    for item in treebank.fileids()[:3]:
        for (word, _tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip()

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*'*70)

##--NLTK--
lyrics_tokenization.py 文件源码 项目:LyricsMoodClassifier 作者: valeriaalampi 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def simple_stemming(tokens):
    """Return a new list holding the Porter stem of every token, in input order."""
    porter = PorterStemmer()
    stems = []
    for token in tokens:
        stems.append(porter.stem(token))
    return stems
knock73.py 文件源码 项目:100knock2016 作者: tmu-nlp 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def feature_making(sentence):
    """
    Stem each word of ``sentence`` and keep the stems that ``stop`` rejects.

    A stem is kept only when ``stop(stem) == False``; the stems are returned
    as a list in input order.
    """
    porter = stem.PorterStemmer()
    # The lazy generator keeps the stem-then-check order for every word.
    stems = (porter.stem(token) for token in sentence)
    return [candidate for candidate in stems if stop(candidate) == False]
knock72.py 文件源码 项目:100knock2016 作者: tmu-nlp 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def extract_feature(sentence):
    """
    Count Porter stems of the whitespace-separated words in ``sentence``.

    Words for which ``include_stopword`` is truthy are skipped before
    stemming. Returns a ``defaultdict`` whose missing keys read as 0.
    """
    porter = stem.PorterStemmer()
    counts = defaultdict(lambda: 0)
    for token in sentence.split():
        if include_stopword(token):
            continue
        counts[porter.stem(token)] += 1
    return counts
knock72.py 文件源码 项目:100knock2016 作者: tmu-nlp 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def getFeature(word_list):
    """
    Count Lancaster stems of the words in ``word_list``.

    Words for which ``isStopWords`` is truthy are skipped. The counts are
    returned as a plain ``dict`` mapping stem to frequency.
    """
    lancaster = stem.LancasterStemmer()
    counts = defaultdict(lambda: 0)
    kept_words = (word for word in word_list if not isStopWords(word))
    for word in kept_words:
        counts[lancaster.stem(word)] += 1
    return dict(counts)
knock76.py 文件源码 项目:100knock2016 作者: tmu-nlp 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def mk_label(line,d,thre):
    """
    Score one line with stem weights and format a labelled prediction.

    The line is stripped of newlines and split on single spaces. Every field,
    including the first one, is Porter-stemmed and its weight ``d[stem]`` is
    added to the score. The score is passed through the logistic function to
    obtain ``p_pos``.

    Args:
        line: space-separated text line.
        d: mapping from stem to weight. It is indexed with ``d[stem]``, so a
            plain dict raises ``KeyError`` for a stem it does not contain.
        thre: threshold; ``p_pos`` strictly above it yields the label ``+1``,
            otherwise ``-1``.

    Returns:
        A string made of the first field of ``line``, the label and
        ``p_pos``, joined by tab characters.
    """
    import math
    from nltk import stem
    stemmer=stem.PorterStemmer()

    score=0
    for item in line.strip("\n").split(" "):
        score+=d[stemmer.stem(item)]
    try:
        e=math.exp(score)
        p_pos=e/(1+e)
    except OverflowError:
        # math.exp raises for very large positive scores; the logistic
        # value there is 1.0 to double precision.
        p_pos=1.0
    label="+1" if p_pos > thre else "-1"
    return(line.split(" ")[0]+"\t"+label+"\t"+str(p_pos))
summarizer.py 文件源码 项目:PySummarizer 作者: musikalkemist 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def _preprocess(self, text):
        """ Split text into sentences and return one bag-of-words list per
            sentence.

            The sentences are stored on self._sents. Each sentence is
            lower-cased and tokenized, tokens found in self._stopwords are
            dropped, and the remaining tokens are Porter-stemmed."""

        porter = PorterStemmer()
        self._sents = sent_tokenize(text)

        bags = []
        for sentence in self._sents:
            tokens = word_tokenize(sentence.lower())
            # stop-word filtering happens on the raw token, before stemming
            bags.append([porter.stem(token) for token in tokens
                if token not in self._stopwords])
        return bags
porter.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 36 收藏 0 点赞 0 评论 0
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.

    The words of the first three treebank files are stemmed; the original
    words and then the stems are printed, each word-wrapped and introduced
    by a banner line.
    """

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    # ``fileids()`` is the corpus-reader accessor; the older ``files()``
    # spelling is no longer provided by current NLTK releases.
    for item in treebank.fileids()[:3]:
        for (word, _tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip()

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*'*70)

##--NLTK--
utils.py 文件源码 项目:tRECS 作者: TeeOhh 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def stemmer(text):
    """
    Stem every space-separated word of ``text`` and return the re-joined string.

    ``text`` is converted with ``str()`` and decoded as UTF-8 through the
    Python 2 ``unicode`` builtin, split on single spaces, and each piece is
    Porter-stemmed and converted back with ``str()``.
    """
    porter = PorterStemmer()
    decoded = unicode(str(text), 'utf-8')
    pieces = []
    for token in decoded.split(" "):
        pieces.append(str(porter.stem(token)))
    return " ".join(pieces)
porter.py 文件源码 项目:neighborhood_mood_aws 作者: jarrellmark 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.

    The words of the first three treebank files are stemmed; the original
    words and then the stems are printed, each word-wrapped and introduced
    by a banner line.
    """

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    # ``fileids()`` is the corpus-reader accessor; the older ``files()``
    # spelling is no longer provided by current NLTK releases.
    for item in treebank.fileids()[:3]:
        for (word, _tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip()

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*'*70)

##--NLTK--
porter.py 文件源码 项目:hate-to-hugs 作者: sdoran35 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def __init__(self, mode=NLTK_EXTENSIONS):
        """
        Validate and store the stemming mode and set up the lookup tables.

        ``mode`` must be one of the class constants NLTK_EXTENSIONS,
        MARTIN_EXTENSIONS or ORIGINAL_ALGORITHM; anything else raises
        ValueError. In NLTK_EXTENSIONS mode, ``self.pool`` is filled with a
        map from each irregular word form to its stem. ``self.vowels`` is
        always set.
        """
        allowed_modes = (
            self.NLTK_EXTENSIONS,
            self.MARTIN_EXTENSIONS,
            self.ORIGINAL_ALGORITHM,
        )
        if mode not in allowed_modes:
            raise ValueError(
                "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, "
                "PorterStemmer.MARTIN_EXTENSIONS, or "
                "PorterStemmer.ORIGINAL_ALGORITHM"
            )

        self.mode = mode
        self.vowels = frozenset(['a', 'e', 'i', 'o', 'u'])

        # The irregular-form pool is only built for the NLTK extensions.
        if self.mode != self.NLTK_EXTENSIONS:
            return

        # Short table of irregular forms, keyed by stem; per the original
        # note it reflects the errors drawn to Martin Porter's attention
        # over a 20 year period.
        stem_to_forms = {
            "sky" :     ["sky", "skies"],
            "die" :     ["dying"],
            "lie" :     ["lying"],
            "tie" :     ["tying"],
            "news" :    ["news"],
            "inning" :  ["innings", "inning"],
            "outing" :  ["outings", "outing"],
            "canning" : ["cannings", "canning"],
            "howe" :    ["howe"],
            "proceed" : ["proceed"],
            "exceed"  : ["exceed"],
            "succeed" : ["succeed"],
        }

        # Invert the table so a word form looks up its stem directly.
        self.pool = {
            form: base
            for base, forms in stem_to_forms.items()
            for form in forms
        }
porter.py 文件源码 项目:hate-to-hugs 作者: sdoran35 项目源码 文件源码 阅读 31 收藏 0 点赞 0 评论 0
def __repr__(self):
        """Return the fixed display string used for every stemmer instance."""
        return '<PorterStemmer>'
knock73.py 文件源码 项目:100knock2017 作者: tmu-nlp 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def feature(sentence):
    """
    Stem each word of ``sentence`` and keep the stems that ``stopwords`` rejects.

    A stem is kept only when ``stopwords(stem) == False``; the stems are
    returned as a list in input order.
    """
    porter = PorterStemmer()
    # The lazy generator keeps the stem-then-check order for every word.
    stems = (porter.stem(token) for token in sentence)
    return [candidate for candidate in stems if stopwords(candidate) == False]
knock72.py 文件源码 项目:100knock2017 作者: tmu-nlp 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def make_sentence_feature(sen, ids):
    """
    Turn a sentence into a count vector indexed by stem id.

    Args:
        sen: sentence string; it is split on whitespace.
        ids: mapping from stem to vector index. Its length fixes the vector
            length, and stems missing from it are ignored.

    Returns:
        A list of ``len(ids)`` integers where position ``ids[stem]`` holds
        the number of words in ``sen`` that stem to ``stem``.
    """
    # One stemmer serves the whole sentence; building a new one per word
    # was loop-invariant work.
    stemmer = stem.PorterStemmer()
    sentence_feature = [0] * len(ids)
    for word in sen.split():
        word = stemmer.stem(word)
        if word in ids:
            sentence_feature[ids[word]] += 1
    return sentence_feature
knock52.py 文件源码 项目:100knock2017 作者: tmu-nlp 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def stemming_porter():
    """
    Yield ``(word, stem)`` pairs for every word produced by ``separate_word()``.

    The stem is computed with the Porter stemmer.
    """
    # One stemmer serves the whole stream; building a new one per word was
    # loop-invariant work.
    stemmer = stem.PorterStemmer()
    for word_stem in separate_word():
        yield (word_stem, stemmer.stem(word_stem))
porter.py 文件源码 项目:FancyWord 作者: EastonLee 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.

    The words of the first three treebank files are stemmed; the original
    words and then the stems are printed, each word-wrapped and introduced
    by a banner line.
    """

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    # ``fileids()`` is the corpus-reader accessor; the older ``files()``
    # spelling is no longer provided by current NLTK releases.
    for item in treebank.fileids()[:3]:
        for (word, _tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip()

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*'*70)

##--NLTK--
porter.py 文件源码 项目:beepboop 作者: nicolehe 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.

    The words of the first three treebank files are stemmed; the original
    words and then the stems are printed, each word-wrapped and introduced
    by a banner line.
    """

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    # ``fileids()`` is the corpus-reader accessor; the older ``files()``
    # spelling is no longer provided by current NLTK releases.
    for item in treebank.fileids()[:3]:
        for (word, _tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip()

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*'*70)

##--NLTK--
tagger.py 文件源码 项目:teem-tag 作者: P2Pvalue 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def __init__(self, stemmer=None):
        '''
        @param stemmer: an object or module with a 'stem' method; when it is
                        omitted or falsy, an NLTK PorterStemmer is created

        @returns: a new L{Stemmer} object
        '''

        if stemmer:
            self.stemmer = stemmer
            return

        # Imported lazily so NLTK is only required when no stemmer is supplied.
        from nltk.stem import PorterStemmer
        self.stemmer = PorterStemmer()
porter.py 文件源码 项目:kind2anki 作者: prz3m 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.

    The words of the first three treebank files are stemmed; the original
    words and then the stems are printed, each word-wrapped and introduced
    by a banner line.
    """

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    # ``fileids()`` is the corpus-reader accessor; the older ``files()``
    # spelling is no longer provided by current NLTK releases.
    for item in treebank.fileids()[:3]:
        for (word, _tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip()

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*'*70)

##--NLTK--


问题


面经


文章

微信
公众号

扫码关注公众号