def master_clean(df, column, html, email, punc, non_ascii, stopwords, number, remove_nonenglish, stemorlem):
    # Apply the selected cleaning steps, in order, to one text column of a DataFrame.
    if punc:
        df[column] = df[column].apply(remove_punc)
    if html:
        df[column] = df[column].apply(remove_html)
    if email:
        df[column] = df[column].apply(remove_email)
    if non_ascii:
        df[column] = df[column].apply(remove_non_ascii)
    if stopwords:
        df[column] = df[column].apply(remove_stop)
    if number:
        df[column] = df[column].apply(remove_numbers)
    if remove_nonenglish:
        # assumes a remove_non_english helper analogous to remove_punc above
        df[column] = df[column].apply(remove_non_english)
    if stemorlem == 'stem':
        df[column] = df[column].apply(stemmer)
    elif stemorlem == 'lem':
        df[column] = df[column].apply(lemmatizer)
    return df
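A minimal usage sketch for master_clean, assuming the remove_* helpers and the stemmer/lemmatizer functions it references are defined elsewhere in the same module (the 'review' column and sample rows are illustrative):

import pandas as pd

# Hypothetical input; 'review' is an illustrative column name.
raw = pd.DataFrame({'review': ['<p>Great product!!!</p>',
                               'Contact me at foo@bar.com about order 123']})

cleaned = master_clean(raw, 'review',
                       html=True, email=True, punc=True, non_ascii=True,
                       stopwords=True, number=True, remove_nonenglish=False,
                       stemorlem='stem')
print(cleaned['review'].tolist())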
Python stem() — example source code
def preprocess(content):
    word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
    words_set = []
    for twitter in content:
        words_set += word_tokenizer.tokenize(twitter['twitter_content'])
    words_set = list(set(words_set))
    stop_words = stopwords.words('english')
    non_words = list(punctuation)
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    # keep only alphabetic words that are neither punctuation nor stop words
    formatted_twitter_words_set = []
    for word in words_set:
        if word.isalpha() and (word not in non_words) and (word not in stop_words):
            formatted_twitter_words_set.append(lemmatizer.lemmatize(word))
    nltk_words_set = list(set(nltk.corpus.words.words()))
    # whole training set: tweet vocabulary plus the NLTK word list
    training_set = formatted_twitter_words_set + nltk_words_set
    return training_set
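preprocess leans on several module-level imports and expects a list of dicts carrying a 'twitter_content' key; a hedged, self-contained way to drive it (the sample tweets are made up):

import nltk
from nltk.corpus import stopwords
from string import punctuation

tweets = [{'twitter_content': 'Loving the new phone, battery lasts forever!'},
          {'twitter_content': 'Worst delivery experience ever...'}]
vocabulary = preprocess(tweets)
print(len(vocabulary))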
def resource_similarity_score_via_word_net_1(need_res_set, offer_tweet_list):
    if len(need_res_set) == 0:
        return 0
    value = 0
    offer_res_list = []
    for i in offer_tweet_list:
        for j in i.split():
            if stemmer.stem(j.lower()) not in out_stem_list:
                offer_res_list.append(stemmer.stem(j.lower()))
    for word in need_res_set:
        temp = get_similarity_score_1(word, offer_res_list)
        if temp > 0.6:
            value = value + temp
    return value / len(need_res_set)
def __init__(self, string, stem=None, rating=1.0, proper=False,
             terminal=False):
    '''
    @param string:   the actual representation of the tag
    @param stem:     the internal (usually stemmed) representation;
                     tags with the same stem are regarded as equal
    @param rating:   a measure of the tag's relevance in the interval [0,1]
    @param proper:   whether the tag is a proper noun
    @param terminal: set to True if the tag is at the end of a phrase
                     (or anyway it cannot be logically merged to the
                     following one)
    @returns: a new L{Tag} object
    '''
    self.string = string
    self.stem = stem or string
    self.rating = rating
    self.proper = proper
    self.terminal = terminal
def __init__(self, tail, head=None):
    '''
    @param tail: the L{Tag} object to add to the first part (head)
    @param head: the (eventually absent) L{MultiTag} to be extended
    @returns: a new L{MultiTag} object
    '''
    if not head:
        Tag.__init__(self, tail.string, tail.stem, tail.rating,
                     tail.proper, tail.terminal)
        self.size = 1
        self.subratings = [self.rating]
    else:
        self.string = ' '.join([head.string, tail.string])
        self.stem = ' '.join([head.stem, tail.stem])
        self.size = head.size + 1
        self.proper = (head.proper and tail.proper)
        self.terminal = tail.terminal
        self.subratings = head.subratings + [tail.rating]
        self.rating = self.combined_rating()
def respond(sentences):
    tokenized_sentence = sent_tokenize(sentences)
    stop_words = set(stopwords.words("english"))  # stop words from the local NLTK corpus
    if len(tokenized_sentence) > 1:  # more than one sentence
        # for sentence in tokenized_sentence:
        #     words = word_tokenize(sentence)  # each word is tokenized
        pos_tagged = parts_of_speechtag(sentences)
        print(tuple(pos_tagged))
        # filtered_words = [w for w in words if w not in stop_words]  # removing the stop words
        # portStemer_object = PorterStemmer()
        # filtered_steam_words = [portStemer_object.stem(w) for w in filtered_words]
        # return filtered_steam_words
    else:
        pos_tagged = parts_of_speechtag(sentences)
        print(type(pos_tagged))
        # words = word_tokenize(sentences)
        # filtered_words = [w for w in words if w not in stop_words]
        # portStemer_object = PorterStemmer()
        # filtered_steam_words = [portStemer_object.stem(w) for w in filtered_words]
        # return filtered_steam_words
def stem(self, word, pos=u'n'):
    return self.lemmatize(word, pos)
######## Wrapper for all of the popular stemmers ###########
def __init__(self, stemmer_type):
    self.stemmer_type = stemmer_type
    if self.stemmer_type == 'porter':
        self.stemmer = nltk.stem.PorterStemmer()
    elif self.stemmer_type == 'snowball':
        self.stemmer = nltk.stem.SnowballStemmer('english')
    elif self.stemmer_type == 'lemmatize':
        self.stemmer = WordNetStemmer()
    else:
        raise NameError("'" + stemmer_type + "' not supported")
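The excerpt does not show the class that owns this __init__ (nor the WordNetStemmer helper it instantiates), so the usage below is purely illustrative, with StemmerWrapper standing in for whatever the class is actually called:

# StemmerWrapper is a placeholder name; the real class name is not shown above.
wrapper = StemmerWrapper('snowball')
print(wrapper.stemmer.stem('running'))   # 'run'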
######## Simple wordreplacer object using a dictionary ############
def normalize(self, text):
    return [self.stemmer.stem(token)
            for token in self.tokenizer.tokenize(text.lower())
            if token not in self.stop_words]
######### defining a default normalizer ##########
def build_analyzer(self):
    analyzer = super(TfidfVectorizer, self).build_analyzer()
    return lambda doc: (stemmer.stem(w) for w in analyzer(doc))
########## Stemmer + CountVectorizer wrapper #############
def build_analyzer(self):
    analyzer = super(CountVectorizer, self).build_analyzer()
    return lambda doc: (stemmer.stem(w) for w in analyzer(doc))
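Both build_analyzer overrides follow the usual scikit-learn pattern of subclassing a vectorizer and stemming every token the stock analyzer produces. A self-contained sketch of that pattern (the class name and the module-level stemmer are assumptions, not part of the excerpts):

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

stemmer = nltk.stem.PorterStemmer()

class StemmedTfidfVectorizer(TfidfVectorizer):
    # Illustrative subclass: wrap the default analyzer so every token is stemmed.
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))

vec = StemmedTfidfVectorizer()
vec.fit_transform(['cats are running', 'a cat runs'])
print(sorted(vec.vocabulary_))   # 'cats'/'cat' and 'running'/'runs' collapse to shared stems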
########## Defaults TF-IDF & Count Vectorizers ########
#======== TF-IDF Vectorizer =========#
def Stem(self):
    # stem every signature line in pos_signature.txt and write the result out
    fin = open('../file/pos_signature.txt', 'r')
    fout = open('../file/stem_signature.txt', 'w+')
    while True:
        line = fin.readline()
        if line:
            if '***' in line:
                fout.write(line)
            elif '---------' in line:
                fout.write(line)
            else:
                num, line = line.split(':', 1)
                line = self.RemSingleWord(line)   # remove single words
                line = self.CleanStopWords(line)  # remove stop words
                line = self.CleanLines(line)      # general line clean-up
                line = line.split()
                word_list = []
                s = nltk.stem.SnowballStemmer('english')
                for w in line:
                    w = s.stem(w)
                    word_list.append(w)
                line = ' '.join(word_list)
                fout.write(num + ':' + line + '\n')
        else:
            break
def __init__(self, itemId, questionType, answerType, question, answer, V, WordIDMap):
    self.itemId = itemId
    self.questionType = questionType
    self.answerType = answerType
    self.question = question
    self.answer = answer
    self.Question = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(question) if stemmer.stem(word) in WordIDMap]
    self.Answer = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(answer) if stemmer.stem(word) in WordIDMap]
    self.qFeature = {}
    self.aFeature = {}
    self.create_QAFeature()
def __init__(self, itemId, Review, V, WordIDMap, ReviewObj):
    self.itemId = itemId
    self.sent = Review
    self.rObj = ReviewObj
    self.Sent = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(Review) if stemmer.stem(word) in WordIDMap]
    self.sFeature = {}
def get_lemma_sentences(sentences):
    lemma_sentences = []
    for s in sentences:
        words = [w for w in nltk.word_tokenize(s) if w]
        w_s = [stemmer.stem(w) for w in words]
        l_s = ' '.join(w_s)
        lemma_sentences.append(l_s)
    return lemma_sentences
def tokenizeDocument(document):
    # remove punctuation (otherwise we have a bunch of empty tokens at the end)
    translate_table = dict((ord(char), " ") for char in string.punctuation)
    document = document.translate(translate_table)
    # tokenize
    tokenized_doc = nltk.word_tokenize(document)
    # stem
    snowball = stem.snowball.EnglishStemmer()
    tokenized_doc = [snowball.stem(word) for word in tokenized_doc]
    # remove stop words
    tokenized_doc = [word for word in tokenized_doc if word not in stopwords.words('english')]
    return tokenized_doc
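tokenizeDocument relies on a few module-level imports that the excerpt omits; under the assumption that they look like the following, it can be exercised directly:

import string
import nltk
from nltk import stem
from nltk.corpus import stopwords

print(tokenizeDocument("Dogs, cats, and other animals: 10 of them were running!"))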
# given the dictionary, return an array of all the tokenized comments
def stemmer(text):
    '''Description: takes in the string of descriptions and returns the string with every word stemmed
    Parameters: string of descriptions
    Output: string with all words stemmed (e.g. "meeting" and "meetings" map to the same stem)'''
    stemmer = PorterStemmer()
    lis = unicode(str(text), 'utf-8').split(" ")
    stemmed_words = [str(stemmer.stem(word)) for word in lis]
    return " ".join(stemmed_words)
def extract_keywords(text):
    tokens = [i.lower() for i in nltk.word_tokenize(text) if i not in stop_words]
    pos_tagged_tokens = nltk.pos_tag(tokens)
    result = []
    for token in pos_tagged_tokens:
        # print token
        if token[1] in POS_KEYS:
            result.append(token[0])
    return [ps.stem(w) for w in result]
def getKeywords(question):
    tagged = nltk.tag.pos_tag(question)
    tagged = [pair for pair in tagged if pair[1] in key_POS and pair[0].lower() not in aux]
    return {ps.stem(tag[0]) for tag in tagged}
# Given a question, return a list of each sentence in the article
# with a score attached to it
def score(question, sentence):
    score = 0
    sentence = map(ps.stem, sentence)
    keywords = getKeywords(question)
    question = map(ps.stem, question)
    score += proximity(keywords, sentence)
    question_ngrams = count_ngrams(question, MAX_NGRAMS, True)
    sentence_ngrams = count_ngrams(sentence, MAX_NGRAMS, True)
    precision, recall = bleu_score(question_ngrams, len(question), sentence_ngrams, len(sentence), 5)
    f1 = (2 * precision * recall) / (precision + recall)
    score += 2 * f1
    return score
# Finds the shortest window in the target sentence
# in which all keywords appear, and assigns a score.
def _stem_(s):
    from nltk.stem.lancaster import LancasterStemmer
    rs = LancasterStemmer()
    rs = rs.stem(s)
    return rs
def _lemma_(token):
    if isinstance(token, str):
        return _stem_(token)
    if isinstance(token, unicode):
        return _stem_(token)
    from nltk.corpus import wordnet

    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    p = get_wordnet_pos(token.pos()[0][1])
    if p != wordnet.VERB:
        return _stem_(token[0])
    rs = wordnet_lemmatizer.lemmatize(token[0], pos=p)
    return rs
def stem_text(text):
    from nltk.stem import LancasterStemmer
    ls = LancasterStemmer()
    tokens = tokenize_text(text)
    filtered_tokens = [ls.stem(token) for token in tokens]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
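stem_text depends on a tokenize_text helper that is not part of the excerpt; a hedged stand-in makes it runnable:

import nltk

def tokenize_text(text):
    # illustrative stand-in for the project's own tokenizer
    return nltk.word_tokenize(text)

print(stem_text('The children were happily jumping over the fences'))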
def __call__(self, doc):
    return [self.stemmer.stem(t) for t in word_tokenize(doc)]
generate_pattern_mining.py — project: kaggle-quora-solution-8th, author: qqgeogor
def stem_str(sen):
    sen = text.re.sub('[^a-zA-Z0-9]', ' ', sen)
    sen = nltk.word_tokenize(sen.lower())
    sen = map(snowball_stemmer.stem, sen)
    sen = map(wordnet_lemmatizer.lemmatize, sen)
    return (' '.join(sen)).lower()
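stem_str expects module-level globals (text, snowball_stemmer, wordnet_lemmatizer) that the excerpt omits; the stand-ins below are assumptions chosen only to make the sketch self-contained (in the original project, text appears to be a module that re-exports re):

import re
import types
import nltk

text = types.SimpleNamespace(re=re)  # stand-in for the module the snippet calls `text`
snowball_stemmer = nltk.stem.SnowballStemmer('english')
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

print(stem_str('Dogs were running 3 times faster!'))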