def test_stems(self):
    import nltk
    stemmer = nltk.PorterStemmer(nltk.PorterStemmer.MARTIN_EXTENSIONS)
    stops = frozenset(nltk.corpus.stopwords.words('english'))
    tests = [("foo bar", ['foo', 'bar']),
             ("foo $1.23 is the bar", ['foo', 'bar']),
             ("a b c d", []),  # assume single char stems are useless
             ("ab cd", ['ab', 'cd']),
             ("-1.23 1.23 foo", ['foo']),
             ("-123 foo 123", ['foo']),
             ("8:12 12:34am foo", ['foo']),
             ("ab. foo, then bar", ['ab', 'foo', 'bar']),
             ("crying infants", ["cry", "infant"]),
             ("drop 12 all 3.45 the 0.123 numbers", ['drop', 'number'])]
    for test, exp in tests:
        obs = list(stems(stops, stemmer, test))
        self.assertEqual(obs, exp)
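The `stems` helper exercised by this test is not included in the excerpt; a minimal sketch that satisfies the cases above (keep alphabetic tokens only, drop stopwords, and discard single-character stems) could look like the following. The implementation details are an assumption, not the project's actual code:

import re

def stems(stops, stemmer, text):
    # hypothetical helper: yield stems of useful words, skipping stopwords,
    # numeric/time tokens (no alphabetic characters) and single-character stems
    for word in re.findall(r"[a-zA-Z]+", str(text).lower()):
        if word in stops:
            continue
        stem = stemmer.stem(word)
        if len(stem) > 1:
            yield stem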
Python PorterStemmer() usage examples (source code)
import re
import nltk
from nltk.stem.porter import PorterStemmer

def clean_text(raw_text, filtered_word_types):
    """Clean raw text for bag-of-words model"""
    # Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text)
    # Convert to lower case, split into individual words
    words = letters_only.lower().split()
    # Stem words
    stemmer = PorterStemmer()
    stemmed_words = list(map(stemmer.stem, words))
    # Remove stop words if requested
    if filtered_word_types is not None:
        tagged_text = nltk.pos_tag(stemmed_words)
        stemmed_words = [w for w, wtype in tagged_text if wtype not in filtered_word_types]
    # Join back together
    return " ".join(stemmed_words)
Source: data_preparation_tools.py, from project corpus-to-graph-ml by CatalystCode
def stem_text(sent, context=None):
    processed_tokens = []
    tokens = nltk.word_tokenize(sent)
    porter = nltk.PorterStemmer()
    for t in tokens:
        t = porter.stem(t)
        processed_tokens.append(t)
    return " ".join(processed_tokens)
# Split to train and test sample sets:
def stemming(sentence):
    st = nltk.PorterStemmer()
    words = [st.stem(word.lower()) for word in
             re.sub(r"[\.\,\!\?;\:\(\)\[\]\'\"]$", '', sentence.rstrip()).split()]
    words = [word for word in words if word not in stopwords.words('english')]
    return words
def __init__(self, lower: bool = True, stemmer="port"):
    self.lower = lower
    self.stemmer = stemmer
    if stemmer == "port":
        self._stemmer = PorterStemmer()
        self._stem = self._stemmer.stem
    elif stemmer == "wordnet":
        self._stemmer = WordNetLemmatizer()
        self._stem = self._stemmer.lemmatize
    else:
        raise ValueError(stemmer)
    # stemming is slow, so we cache words as we go
    self.normalize_cache = {}
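Only the constructor is shown above; a minimal `normalize` method that uses the cache (a hypothetical addition, not part of the original class) could look like:

def normalize(self, word: str) -> str:
    # lower-case if requested, then stem/lemmatize, caching results per word
    key = word.lower() if self.lower else word
    if key not in self.normalize_cache:
        self.normalize_cache[key] = self._stem(key)
    return self.normalize_cache[key]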
def df_to_stems(df):
    """Convert a DataFrame to stem -> index associations

    Parameters
    ----------
    df : pd.DataFrame
        A pandas DataFrame to index

    Returns
    -------
    dict
        {stem: {set of indices}}
    """
    from collections import defaultdict
    import functools
    import nltk

    # not using nltk default as we want this to be portable so that, for
    # instance, a javascript library can query
    stemmer = nltk.PorterStemmer(nltk.PorterStemmer.MARTIN_EXTENSIONS)
    stops = frozenset(nltk.corpus.stopwords.words('english'))
    stem_f = functools.partial(stems, stops, stemmer)

    d = defaultdict(set)
    for sample, row in df.iterrows():
        for value in row.values:
            for stem in stem_f(value):
                d[stem].add(sample)
    return dict(d)
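A small usage sketch (assuming pandas and the `stems` helper sketched earlier are available; the sample data is made up):

import pandas as pd

df = pd.DataFrame({'description': ['crying infants', 'foo bar'],
                   'body_site': ['skin of the hand', 'oral cavity']},
                  index=['sample-1', 'sample-2'])
index = df_to_stems(df)
# e.g. index.get('infant') should contain 'sample-1' and index.get('bar') 'sample-2'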
def str_stemmer(s):
    return " ".join([nltk.PorterStemmer().stem(word) for word in s.lower().split()])
def setmword(word):
    return PorterStemmer().stem(word)
def tweetMeaning(self, term):
    self.dbout = self.searcher(term)
    with open("data/words.json") as filedata:
        self.wordList = json.load(filedata)
    threading.Thread(target=self.dis.spinner, args=("Analysing Tweets ",)).start()
    self.tweetList = []
    for self.i in self.dbout:
        self.procounter = 0
        self.negcounter = 0
        self.neucounter = 0
        for self.word in nltk.word_tokenize(self.i["tweet"]):
            # print("Analysing word: " + self.word)
            try:
                if nltk.PorterStemmer().stem(self.word) in self.wordList["good"]:
                    # print("Found good word")
                    self.procounter += 1
                elif nltk.PorterStemmer().stem(self.word) in self.wordList["bad"]:
                    # print("Found bad word")
                    self.negcounter += 1
                # elif nltk.PorterStemmer().stem(self.word) in self.wordList["swear"]:
                #     print("Found bad word")
                #     self.negcounter += 1
                else:
                    self.neucounter += 1
            except IndexError:
                print("Ignoring tweet:", self.i["tweet"])
        self.view = "unknown"
        if self.procounter > self.negcounter:
            self.view = "pro"
        if self.negcounter > self.procounter:
            self.view = "neg"
        self.tweetDict = {
            "id": self.i["_id"],
            "tweet": self.i["tweet"],
            "procount": self.procounter,
            "negcount": self.negcounter,
            # "view": "pro" if self.procounter > self.negcounter else "neg"
            "view": self.view
        }
        self.tweetList.append(self.tweetDict)
    self.dis.stop()
    return self.tweetList
# This method gets the poll data from the JSON file it is
# stored in, then adds the values up to get a total.
Source: mmr_summarizer.py, from project Text_Summarization-MMR_and_LexRank by syedhope
def processFile(file_name):
    # read file from provided folder path
    f = open(file_name, 'r')
    text_0 = f.read()

    # extract content in TEXT tag and remove tags
    text_1 = re.search(r"<TEXT>.*</TEXT>", text_0, re.DOTALL)
    text_1 = re.sub("<TEXT>\n", "", text_1.group(0))
    text_1 = re.sub("\n</TEXT>", "", text_1)

    # replace all types of quotations by normal quotes
    text_1 = re.sub("\n", " ", text_1)
    text_1 = re.sub("\"", "\"", text_1)
    text_1 = re.sub("''", "\"", text_1)
    text_1 = re.sub("``", "\"", text_1)
    text_1 = re.sub(" +", " ", text_1)

    # segment data into a list of sentences
    sentence_token = nltk.data.load('tokenizers/punkt/english.pickle')
    lines = sentence_token.tokenize(text_1.strip())

    # setting the stemmer
    sentences = []
    porter = nltk.PorterStemmer()

    # modelling each sentence in file as sentence object
    for line in lines:
        # original words of the sentence before stemming
        originalWords = line[:]
        line = line.strip().lower()

        # word tokenization
        sent = nltk.word_tokenize(line)

        # stemming words
        stemmedSent = [porter.stem(word) for word in sent]
        stemmedSent = list(filter(lambda x: x != '.' and x != '`' and x != ',' and x != '?' and x != "'"
                                  and x != '!' and x != '"' and x != "''" and x != "'s", stemmedSent))

        # list of sentence objects
        if stemmedSent:
            sentences.append(sentence.sentence(file_name, stemmedSent, originalWords))
    return sentences
#---------------------------------------------------------------------------------
# Description : Function to find the term frequencies of the words in the
#               sentences present in the provided document cluster
# Parameters  : sentences, sentences of the document cluster
# Return      : dictionary of word, term frequency score
#---------------------------------------------------------------------------------
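The term-frequency function described by this comment block is not included in the excerpt; a minimal sketch of what it might look like (the name `TFs` and the `getStemmedWords()` accessor on the sentence objects are assumptions) follows:

def TFs(sentences):
    # dictionary of word -> raw term frequency across all sentences in the cluster
    tfs = {}
    for sent in sentences:
        for word in sent.getStemmedWords():
            tfs[word] = tfs.get(word, 0) + 1
    return tfs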
def processFile(self, file_path_and_name):
    try:
        f = open(file_path_and_name, 'r')
        text = f.read()

        # soup = BeautifulSoup(text, "html.parser")
        # text = soup.getText()
        # text = re.sub("APW19981212.0848", "", text)
        # text = re.sub("APW19981129.0668", "", text)
        # text = re.sub("NEWSWIRE", "", text)

        text_1 = re.search(r"<TEXT>.*</TEXT>", text, re.DOTALL)
        text_1 = re.sub("<TEXT>\n", "", text_1.group(0))
        text_1 = re.sub("\n</TEXT>", "", text_1)

        # replace all types of quotations by normal quotes
        text_1 = re.sub("\n", " ", text_1)
        text_1 = re.sub(" +", " ", text_1)
        # text_1 = re.sub("\'\'", "\"", text_1)
        # text_1 = re.sub("\`\`", "\"", text_1)

        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        lines = sent_tokenizer.tokenize(text_1.strip())
        text_1 = lines

        sentences = []
        porter = nltk.PorterStemmer()

        for sent in lines:
            OG_sent = sent[:]
            sent = sent.strip().lower()
            line = nltk.word_tokenize(sent)

            stemmed_sentence = [porter.stem(word) for word in line]
            stemmed_sentence = list(filter(lambda x: x != '.' and x != '`' and x != ',' and x != '?' and x != "'"
                                           and x != '!' and x != '"' and x != "''" and x != "'s", stemmed_sentence))
            if stemmed_sentence:
                sentences.append(sentence(file_path_and_name, stemmed_sentence, OG_sent))
        return sentences

    except IOError:
        print('Oops! File not found', file_path_and_name)
        return [sentence(file_path_and_name, [], [])]
def get_features(df_features):
    # now = datetime.datetime.now()
    # print(now.strftime('%Y-%m-%d %H:%M:%S'))
    # print("matchnouns")
    # df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    # df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    # # df_features['z_noun_match'] = df_features.apply(lambda r: sum([1 for w in r.question1_nouns if w in r.question2_nouns]), axis=1)  # takes long
    # df_features['z_noun_match'] = df_features.apply(lambda r: tfidf_word_match_share(r.question1_nouns, r.question2_nouns), axis=1)

    # now = datetime.datetime.now()
    # print(now.strftime('%Y-%m-%d %H:%M:%S'))
    # print("matchverb")
    # df_features['question1_verbs'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[0] == 'V' and t[1] == 'B'])
    # df_features['question2_verbs'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[0] == 'V' and t[1] == 'B'])
    # # df_features['z_verb_match'] = df_features.apply(lambda r: sum([1 for w in r.question1_verbs if w in r.question2_verbs]), axis=1)  # takes long
    # df_features['z_verb_match'] = df_features.apply(lambda r: tfidf_word_match_share(r.question1_verbs, r.question2_verbs), axis=1)

    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print("stem_tfidf")
    df_features['q1_stem'] = df_features.question1.map(lambda x: [w for w in nltk.PorterStemmer().stem(str(x).lower()).split(' ')])
    df_features['q2_stem'] = df_features.question2.map(lambda x: [w for w in nltk.PorterStemmer().stem(str(x).lower()).split(' ')])
    # df_features['z_adj_match'] = df_features.apply(lambda r: sum([1 for w in r.question1_adjs if w in r.question2_adjs]), axis=1)  # takes long
    df_features['z_stem_tfidf'] = df_features.apply(lambda r: tfidf_word_match_share(r.q1_stem, r.q2_stem), axis=1)

    now = datetime.datetime.now()
    # print(now.strftime('%Y-%m-%d %H:%M:%S'))
    # print('w2v tfidf...')
    # df_features['z_tfidf_w2v'] = df_features.apply(lambda r: tfidf_word_match_share(r.question1.tolist(), r.question2.tolist()), axis=1)

    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('nouns...')
    df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    df_features['z_noun_match'] = df_features.apply(lambda r: sum([1 for w in r.question1_nouns if w in r.question2_nouns]), axis=1)  # takes long

    print('lengths...')
    df_features['z_len1'] = df_features.question1.map(lambda x: len(str(x)))
    df_features['z_len2'] = df_features.question2.map(lambda x: len(str(x)))
    df_features['z_word_len1'] = df_features.question1.map(lambda x: len(str(x).split()))
    df_features['z_word_len2'] = df_features.question2.map(lambda x: len(str(x).split()))

    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('difflib...')
    df_features['z_match_ratio'] = df_features.apply(lambda r: diff_ratios(r.question1, r.question2), axis=1)  # takes long

    print('word match...')
    df_features['z_word_match'] = df_features.apply(word_match_share, axis=1, raw=True)

    print('tfidf...')
    df_features['z_tfidf_sum1'] = df_features.question1.map(lambda x: np.sum(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_sum2'] = df_features.question2.map(lambda x: np.sum(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_mean1'] = df_features.question1.map(lambda x: np.mean(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_mean2'] = df_features.question2.map(lambda x: np.mean(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_len1'] = df_features.question1.map(lambda x: len(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_len2'] = df_features.question2.map(lambda x: len(tfidf.transform([str(x)]).data))
    return df_features.fillna(0.0)
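`get_features` depends on helpers defined elsewhere in the project (`tfidf_word_match_share`, `word_match_share`, `diff_ratios`, and a fitted `tfidf` vectorizer). As an illustration, `diff_ratios` is commonly built on difflib; the sketch below is a guess at the intent, not the project's actual code:

import difflib

def diff_ratios(text_a, text_b):
    # similarity ratio between two question strings via difflib's SequenceMatcher
    matcher = difflib.SequenceMatcher()
    matcher.set_seq1(str(text_a).lower())
    matcher.set_seq2(str(text_b).lower())
    return matcher.ratio()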