Python stem() usage examples (source code)

utils.py (project: tRECS, author: TeeOhh)
def master_clean(df, column, html, email, punc, non_ascii, stopwords, number, remove_nonenglish, stemorlem):
    if punc:
        df[column] = df[column].apply(remove_punc).to_frame()
    if html:
        df[column] = df[column].apply(remove_html).to_frame()
    if email:
        df[column] = df[column].apply(remove_email).to_frame()
    if non_ascii:
        df[column] = df[column].apply(remove_non_ascii).to_frame()
    if stopwords:
        df[column] = df[column].apply(remove_stop).to_frame()
    if number:
        df[column] = df[column].apply(remove_numbers).to_frame()
    if remove_nonenglish:
        # the non-English filter helper is not shown in this snippet; the name below is assumed by analogy with the other remove_* helpers
        df[column] = df[column].apply(remove_non_english).to_frame()
    if stemorlem == 'stem':
        df[column] = df[column].apply(stemmer).to_frame()
    elif stemorlem == 'lem':
        df[column] = df[column].apply(lemmatizer).to_frame()

    return df
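For orientation, a minimal usage sketch of master_clean follows; it is not self-contained, since it assumes the remove_* and stemmer/lemmatizer helpers defined elsewhere in this utils.py are importable, and the DataFrame contents are invented.

# Usage sketch (assumes the remove_*/stemmer helpers from this utils.py are importable; data is made up).
import pandas as pd

df = pd.DataFrame({'description': ['Meetings are <b>fun</b>!!!',
                                   'Email me at a@b.com, thanks 123']})
cleaned = master_clean(df, 'description',
                       html=True, email=True, punc=True, non_ascii=True,
                       stopwords=True, number=True, remove_nonenglish=False,
                       stemorlem='stem')
print(cleaned['description'].tolist())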
final_crime_result.py (project: goal, author: victorskl)
import nltk
from nltk.corpus import stopwords
from string import punctuation


def preprocess(content):
    word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()

    words_set = []
    for tweet in content:
        words_set += word_tokenizer.tokenize(tweet['twitter_content'])
    words_set = list(set(words_set))

    stop_words = stopwords.words('english')
    non_words = list(punctuation)
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    # keep only alphabetic words that are neither punctuation nor stop words
    formatted_twitter_words_set = []
    for word in words_set:
        if word.isalpha() and (word not in non_words) and (word not in stop_words):
            formatted_twitter_words_set.append(lemmatizer.lemmatize(word))

    nltk_words_set = list(set(nltk.corpus.words.words()))
    # full training set: lemmatized tweet vocabulary plus the NLTK English word list
    training_set = formatted_twitter_words_set + nltk_words_set
    return training_set
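preprocess expects a list of dicts carrying a 'twitter_content' field and needs the NLTK stopwords, words and wordnet corpora downloaded. A small usage sketch with invented tweets:

# Usage sketch (sample tweets are invented; requires nltk.download of 'stopwords', 'words', 'wordnet').
sample = [
    {'twitter_content': 'Police reported a robbery near the station'},
    {'twitter_content': 'Traffic is terrible downtown today'},
]
training_set = preprocess(sample)   # lemmatized tweet vocabulary plus the NLTK English word list
print(len(training_set))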
crime.py (project: goal, author: victorskl)
import nltk
from nltk.corpus import stopwords
from string import punctuation


def preprocess(content):
    word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()

    words_set = []
    for tweet in content:
        words_set += word_tokenizer.tokenize(tweet['twitter_content'])
    words_set = list(set(words_set))

    stop_words = stopwords.words('english')
    non_words = list(punctuation)
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    # keep only alphabetic words that are neither punctuation nor stop words
    formatted_twitter_words_set = []
    for word in words_set:
        if word.isalpha() and (word not in non_words) and (word not in stop_words):
            formatted_twitter_words_set.append(lemmatizer.lemmatize(word))

    nltk_words_set = list(set(nltk.corpus.words.words()))
    # full training set: lemmatized tweet vocabulary plus the NLTK English word list
    training_set = formatted_twitter_words_set + nltk_words_set
    return training_set
process_both.py (project: disaster-mitigation, author: varun-manjunath)
def resource_similarity_score_via_word_net_1(need_res_set, offer_tweet_list):
    if len(need_res_set) == 0:
        return 0
    value = 0
    # stem every token of every offer tweet, skipping stems listed in the module-level out_stem_list
    offer_res_list = []
    for tweet in offer_tweet_list:
        for token in tweet.split():
            if stemmer.stem(token.lower()) not in out_stem_list:
                offer_res_list.append(stemmer.stem(token.lower()))

    # average the similarity of each needed resource against the offer tokens,
    # counting only matches above the 0.6 threshold
    for word in need_res_set:
        temp = get_similarity_score_1(word, offer_res_list)
        if temp > 0.6:
            value = value + temp

    return value / len(need_res_set)
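stemmer, out_stem_list and get_similarity_score_1 are module-level objects defined elsewhere in process_both.py and are not shown here. Purely as an illustration of the kind of scorer the loop above expects, a hypothetical WordNet-based stand-in (not the project's implementation) could look like this:

# Hypothetical stand-in for get_similarity_score_1 (NOT the project's code):
# best WordNet path similarity between a word and a list of candidate words.
from nltk.corpus import wordnet as wn

def get_similarity_score_1(word, candidates):
    best = 0.0
    for cand in candidates:
        for s1 in wn.synsets(word):
            for s2 in wn.synsets(cand):
                sim = s1.path_similarity(s2)
                if sim is not None and sim > best:
                    best = sim
    return best

print(get_similarity_score_1('water', ['food', 'drink', 'blanket']))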
tagger.py (project: teem-tag, author: P2Pvalue)
def __init__(self, string, stem=None, rating=1.0, proper=False,
                 terminal=False):
        '''
        @param string:   the actual representation of the tag
        @param stem:     the internal (usually stemmed) representation;
                         tags with the same stem are regarded as equal
        @param rating:   a measure of the tag's relevance in the interval [0,1]
        @param proper:   whether the tag is a proper noun
        @param terminal: set to True if the tag is at the end of a phrase
                         (or anyway it cannot be logically merged to the
                         following one)

        @returns: a new L{Tag} object
        '''

        self.string  = string
        self.stem = stem or string
        self.rating = rating
        self.proper = proper
        self.terminal = terminal
tagger.py (project: teem-tag, author: P2Pvalue)
def __init__(self, tail, head=None):
        '''
        @param tail: the L{Tag} object to add to the first part (head)
        @param head: the (possibly absent) L{MultiTag} to be extended

        @returns: a new L{MultiTag} object
        '''

        if not head:
            Tag.__init__(self, tail.string, tail.stem, tail.rating,
                         tail.proper, tail.terminal)
            self.size = 1
            self.subratings = [self.rating]
        else:
            self.string = ' '.join([head.string, tail.string])
            self.stem = ' '.join([head.stem, tail.stem])
            self.size = head.size + 1

            self.proper = (head.proper and tail.proper)
            self.terminal = tail.terminal

            self.subratings = head.subratings + [tail.rating]
            self.rating = self.combined_rating()
__chatcheck.py (project: PYSHA, author: shafaypro)
def respond(sentences):
    tokenized_sentence = sent_tokenize(sentences)
    stop_words = set(stopwords.words("english"))  # stop words from the locally installed NLTK data (only used by the commented-out filtering below)
    if len(tokenized_sentence) > 1:  # more than one sentence
        # for sentence in tokenized_sentence:
        #     words = word_tokenize(sentence)  # each word is tokenized
        pos_tagged = parts_of_speechtag(sentences)
        print(tuple(pos_tagged))
        # filtered_words = [w for w in words if w not in stop_words]  # remove the stop words
        # portStemer_object = PorterStemmer()
        # filtered_steam_words = [portStemer_object.stem(w) for w in filtered_words]
        # return filtered_steam_words
    else:
        pos_tagged = parts_of_speechtag(sentences)
        print(type(pos_tagged))
        # words = word_tokenize(sentences)
        # filtered_words = [w for w in words if w not in stop_words]
        # portStemer_object = PorterStemmer()
        # filtered_steam_words = [portStemer_object.stem(w) for w in filtered_words]
        # return filtered_steam_words
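The stop-word filtering and Porter stemming that the commented-out lines describe can be reproduced on their own; a self-contained sketch of that pipeline (assuming the usual NLTK data has been downloaded):

# Stand-alone sketch of the commented-out filter-and-stem pipeline above.
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def filter_and_stem(sentence):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    words = word_tokenize(sentence)
    filtered_words = [w for w in words if w.lower() not in stop_words]
    return [stemmer.stem(w) for w in filtered_words]

print(filter_and_stem('The meetings were running longer than expected'))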
nlp_utils.py (project: search_relevance, author: rmanak)
def stem(self,word,pos=u'n'):
        return self.lemmatize(word,pos)


########  Wrapper for all  of the popular stemmers ###########
nlp_utils.py (project: search_relevance, author: rmanak)
def __init__(self,stemmer_type):
        self.stemmer_type = stemmer_type
        if (self.stemmer_type == 'porter'):
            self.stemmer = nltk.stem.PorterStemmer()
        elif (self.stemmer_type == 'snowball'):
            self.stemmer = nltk.stem.SnowballStemmer('english')
        elif (self.stemmer_type == 'lemmatize'):
            self.stemmer = WordNetStemmer()
        else:
            raise NameError("'"+stemmer_type +"'" + " not supported")
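The branches simply map the stemmer_type string onto NLTK's Porter and Snowball stemmers and onto the WordNet-based wrapper defined earlier in nlp_utils.py. A quick, self-contained comparison of what those backends produce:

# Compare the backends the wrapper dispatches to (requires nltk.download('wordnet')).
import nltk
from nltk.stem import WordNetLemmatizer

porter = nltk.stem.PorterStemmer()
snowball = nltk.stem.SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

for word in ['meetings', 'studies', 'running']:
    print(word, porter.stem(word), snowball.stem(word), lemmatizer.lemmatize(word))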



######## Simple wordreplacer object using a dictionary  ############
nlp_utils.py (project: search_relevance, author: rmanak)
def normalize(self, text):
        return [self.stemmer.stem(token) 
                for token in self.tokenizer.tokenize(text.lower()) 
                if token not in self.stop_words]

######### defining a default normalizer ##########
nlp_utils.py (project: search_relevance, author: rmanak)
def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))
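This is the usual pattern of overriding build_analyzer so that every token from the base analyzer is passed through a module-level stemmer. A self-contained sketch of how such a subclass is typically defined and used; the class name and stemmer choice here are illustrative, not necessarily the project's:

# Illustrative stemming TF-IDF vectorizer (names are examples, not the project's).
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

stemmer = nltk.stem.SnowballStemmer('english')

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))

vec = StemmedTfidfVectorizer()
X = vec.fit_transform(['running shoes for runners', 'a runner runs daily'])
print(vec.get_feature_names_out())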



########## Stemmer + CountVectorizer wrapper #############
nlp_utils.py (project: search_relevance, author: rmanak)
def build_analyzer(self):
        analyzer = super(CountVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))


########## Defaults TF-IDF & Count Vectorizers ########


#======== TF-IDF Vectorizer =========#
PreSignature.py (project: PPRE, author: MaoYuwei)
def Stem(self):
        # stem the words of each signature line
        fin = open('../file/pos_signature.txt', 'r')
        fout = open('../file/stem_signature.txt', 'w+')
        s = nltk.stem.SnowballStemmer('english')
        while True:
            line = fin.readline()
            if line:
                if '***' in line:
                    fout.write(line)
                elif '---------' in line:
                    fout.write(line)
                else:
                    num, line = line.split(':', 1)
                    line = self.RemSingleWord(line)  # drop single words
                    line = self.CleanStopWords(line)  # drop stop words
                    line = self.CleanLines(line)  # clean the line
                    line = line.split()
                    word_list = []
                    for w in line:
                        w = s.stem(w)
                        word_list.append(w)
                    line = ' '.join(word_list)
                    fout.write(num + ':' + line + '\n')
            else:
                break
documents.py (project: MOQA, author: pprakhar30)
def __init__(self, itemId, questionType, answerType, question, answer, V, WordIDMap):

        self.itemId         = itemId
        self.questionType   = questionType
        self.answerType     = answerType
        self.question       = question
        self.answer         = answer
        self.Question       = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(question) if stemmer.stem(word) in WordIDMap]
        self.Answer         = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(answer) if stemmer.stem(word) in WordIDMap]
        self.qFeature       = {}
        self.aFeature       = {}
        self.create_QAFeature()
documents.py (project: MOQA, author: pprakhar30)
def __init__(self, itemId, Review, V, WordIDMap, ReviewObj):

        self.itemId     = itemId
        self.sent   = Review
        self.rObj   = ReviewObj
        self.Sent   = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(Review) if stemmer.stem(word) in WordIDMap]
        self.sFeature   = {}
html2es.py (project: texta, author: texta-tk)
def get_lemma_sentences(sentences):
    lemma_sentences = []
    for s in sentences:
        words = [w for w in nltk.word_tokenize(s) if w]
        w_s = [stemmer.stem(w) for w in words]
        l_s = ' '.join(w_s)
        lemma_sentences.append(l_s)
    return lemma_sentences
comment_processing.py (project: hoot, author: CatalystOfNostalgia)
def tokenizeDocument(document):
    # remove punctuation (otherwise we have a bunch of empty tokens at the end)
    translate_table = dict((ord(char), " ") for char in string.punctuation)
    document = document.translate(translate_table)
    # tokenize
    tokenized_doc = nltk.word_tokenize(document)
    # stem
    snowball = stem.snowball.EnglishStemmer()
    tokenized_doc = [snowball.stem(word) for word in tokenized_doc]
    # remove stop words
    tokenized_doc = [word for word in tokenized_doc if word not in stopwords.words('english')]
    return tokenized_doc

# given the dictionary, return an array of all the tokenized comments
utils.py (project: tRECS, author: TeeOhh)
def stemmer(text):
    '''Description: This function takes in the string of descriptions and returns a string with all words stemmed.
       Parameters: String of descriptions
       Output: String with all words stemmed (e.g. "meeting" and "meetings" map to the same stem)'''
    stemmer = PorterStemmer()
    lis = unicode(str(text), 'utf-8').split(" ")  # Python 2: decode to unicode before stemming
    stemmed_words = [str(stemmer.stem(word)) for word in lis]

    return " ".join(stemmed_words)
ner_similarity.py (project: QuestionAnswerNLP, author: debjyoti385)
def extract_keywords(text):
    tokens = [i.lower() for i in nltk.word_tokenize(text) if i not in stop_words]
    pos_tagged_tokens = nltk.pos_tag(tokens)
    result = []
    for token in pos_tagged_tokens:
        # print token
        if token[1] in POS_KEYS:
            result.append(token[0])

    return [ps.stem(w) for w in result]
sourceContentSelector.py (project: QuestionAnswerNLP, author: debjyoti385)
def getKeywords(question):
  tagged = nltk.tag.pos_tag(question)
  tagged = [pair for pair in tagged if pair[1] in key_POS and pair[0].lower() not in aux]
  return {ps.stem(tag[0]) for tag in tagged}

# Given a question, return a list of each sentence in the article
# with a score attached to it
sourceContentSelector.py (project: QuestionAnswerNLP, author: debjyoti385)
def score(question, sentence):
    score = 0
    sentence = list(map(ps.stem, sentence))  # list() so the len() call below also works on Python 3
    keywords = getKeywords(question)
    question = list(map(ps.stem, question))
    score += proximity(keywords, sentence)
    question_ngrams = count_ngrams(question, MAX_NGRAMS, True)
    sentence_ngrams = count_ngrams(sentence, MAX_NGRAMS, True)
    precision, recall = bleu_score(question_ngrams, len(question), sentence_ngrams, len(sentence), 5)
    # guard against division by zero when there is no n-gram overlap
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
    score += 2 * f1
    return score

# Finds the shortest window in the target sentence
# in which all keywords appear, and assigns a score.
negation_detection.py (project: negation-detection, author: gkotsis)
def _stem_(s):
    from nltk.stem.lancaster import LancasterStemmer
    rs = LancasterStemmer()
    rs = rs.stem(s)
    return rs
negation_detection.py (project: negation-detection, author: gkotsis)
def _lemma_(token):

    if isinstance(token, str):
        return _stem_(token)
    if isinstance(token, unicode):
        return _stem_(token)
    from nltk.corpus import wordnet

    def get_wordnet_pos(treebank_tag):

        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    p = get_wordnet_pos(token.pos()[0][1])
    if p!=wordnet.VERB:
        return _stem_(token[0])
    rs = wordnet_lemmatizer.lemmatize(token[0], pos=p)
    return rs
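_lemma_ only lemmatizes verbs and falls back to the Lancaster stemmer for everything else; the pos argument matters because WordNetLemmatizer treats words as nouns by default. A small self-contained illustration:

# POS-aware lemmatization: the pos argument changes the result.
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

wnl = WordNetLemmatizer()
print(wnl.lemmatize('running'))                    # treated as a noun -> 'running'
print(wnl.lemmatize('running', pos=wordnet.VERB))  # treated as a verb -> 'run'
print(wnl.lemmatize('better', pos=wordnet.ADJ))    # adjective -> 'good'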
sentiment.py (project: fake_news, author: bmassman)
def stem_text(text):
    from nltk.stem import LancasterStemmer
    ls = LancasterStemmer()
    tokens = tokenize_text(text)
    filtered_tokens = [ls.stem(token) for token in tokens]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
load.py (project: graph-based-semi-supervised-learning, author: deerishi)
def __call__(self, doc):
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]
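A callable class like this is meant to be passed to scikit-learn's tokenizer hook. A self-contained sketch of that wiring (class name and stemmer choice are illustrative):

# Illustrative: plugging a stemming tokenizer into CountVectorizer.
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

class StemTokenizer(object):
    def __init__(self):
        self.stemmer = nltk.stem.PorterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]

vec = CountVectorizer(tokenizer=StemTokenizer())
X = vec.fit_transform(['running shoes', 'a runner runs'])
print(vec.get_feature_names_out())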
generate_pattern_mining.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def stem_str(sen):
    sen = text.re.sub('[^a-zA-Z0-9]', ' ', sen)
    sen = nltk.word_tokenize(sen.lower())
    sen = map(snowball_stemmer.stem, sen)
    sen = map(wordnet_lemmatizer.lemmatize, sen)
    return (' '.join(sen)).lower()
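stem_str relies on module-level snowball_stemmer and wordnet_lemmatizer objects and reaches re through an imported text module. A self-contained version of the same normalization with explicit imports (illustrative setup, not the project's exact globals):

# Self-contained version of the same normalization (illustrative setup).
import re
import nltk
from nltk.stem import SnowballStemmer, WordNetLemmatizer

snowball_stemmer = SnowballStemmer('english')
wordnet_lemmatizer = WordNetLemmatizer()

def stem_str_standalone(sen):
    sen = re.sub('[^a-zA-Z0-9]', ' ', sen)
    sen = nltk.word_tokenize(sen.lower())
    sen = map(snowball_stemmer.stem, sen)
    sen = map(wordnet_lemmatizer.lemmatize, sen)
    return ' '.join(sen)

print(stem_str_standalone('How do I improve my coding-skills quickly?'))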
pattern_mining.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def stem_str(sen):
    sen = text.re.sub('[^a-zA-Z0-9]', ' ', sen)
    sen = nltk.word_tokenize(sen.lower())
    sen = map(snowball_stemmer.stem, sen)
    sen = map(wordnet_lemmatizer.lemmatize, sen)
    return (' '.join(sen)).lower()
mi.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def stem_str(sen):
    sen = text.re.sub('[^a-zA-Z0-9]', ' ', sen)
    sen = nltk.word_tokenize(sen.lower())
    sen = map(snowball_stemmer.stem, sen)
    sen = map(wordnet_lemmatizer.lemmatize, sen)
    return (' '.join(sen)).lower()

