def test_stems(self):
    import nltk
    stemmer = nltk.PorterStemmer(nltk.PorterStemmer.MARTIN_EXTENSIONS)
    stops = frozenset(nltk.corpus.stopwords.words('english'))
    tests = [("foo bar", ['foo', 'bar']),
             ("foo $1.23 is the bar", ['foo', 'bar']),
             ("a b c d", []),  # assume single char stems are useless
             ("ab cd", ['ab', 'cd']),
             ("-1.23 1.23 foo", ['foo']),
             ("-123 foo 123", ['foo']),
             ("8:12 12:34am foo", ['foo']),
             ("ab. foo, then bar", ['ab', 'foo', 'bar']),
             ("crying infants", ["cry", "infant"]),
             ("drop 12 all 3.45 the 0.123 numbers", ['drop', 'number'])]
    for test, exp in tests:
        obs = list(stems(stops, stemmer, test))
        self.assertEqual(obs, exp)
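The `stems` helper exercised by this test is not included in the excerpt; a minimal sketch that satisfies the cases above (keep alphabetic tokens only, drop stopwords, and discard single-character stems) could look like the following. The implementation details are an assumption, not the project's actual code:

import re

def stems(stops, stemmer, text):
    # hypothetical helper: yield stems of useful words, skipping stopwords,
    # numeric/time tokens (no alphabetic characters) and single-character stems
    for word in re.findall(r"[a-zA-Z]+", str(text).lower()):
        if word in stops:
            continue
        stem = stemmer.stem(word)
        if len(stem) > 1:
            yield stem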
Python PorterStemmer() usage examples (source code)
import re
import nltk
from nltk.stem.porter import PorterStemmer

def clean_text(raw_text, filtered_word_types):
    """Clean raw text for bag-of-words model"""
    # Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text)
    # Convert to lower case, split into individual words
    words = letters_only.lower().split()
    # Stem words
    stemmer = PorterStemmer()
    stemmed_words = list(map(stemmer.stem, words))
    # Remove stop words if requested
    if filtered_word_types is not None:
        tagged_text = nltk.pos_tag(stemmed_words)
        stemmed_words = [w for w, wtype in tagged_text if wtype not in filtered_word_types]
    # Join back together
    return " ".join(stemmed_words)
Source: data_preparation_tools.py, from project corpus-to-graph-ml by CatalystCode
def stem_text(sent, context=None):
    processed_tokens = []
    tokens = nltk.word_tokenize(sent)
    porter = nltk.PorterStemmer()
    for t in tokens:
        t = porter.stem(t)
        processed_tokens.append(t)
    return " ".join(processed_tokens)
# Split to train and test sample sets:
def stemming(sentence):
    st = nltk.PorterStemmer()
    words = [st.stem(word.lower()) for word in
             re.sub(r"[\.\,\!\?;\:\(\)\[\]\'\"]$", '', sentence.rstrip()).split()]
    words = [word for word in words if word not in stopwords.words('english')]
    return words
def __init__(self, lower: bool = True, stemmer="port"):
    self.lower = lower
    self.stemmer = stemmer
    if stemmer == "port":
        self._stemmer = PorterStemmer()
        self._stem = self._stemmer.stem
    elif stemmer == "wordnet":
        self._stemmer = WordNetLemmatizer()
        self._stem = self._stemmer.lemmatize
    else:
        raise ValueError(stemmer)
    # stemming is slow, so we cache words as we go
    self.normalize_cache = {}
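Only the constructor is shown above; a minimal `normalize` method that uses the cache (a hypothetical addition, not part of the original class) could look like:

def normalize(self, word: str) -> str:
    # lower-case if requested, then stem/lemmatize, caching results per word
    key = word.lower() if self.lower else word
    if key not in self.normalize_cache:
        self.normalize_cache[key] = self._stem(key)
    return self.normalize_cache[key]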
def df_to_stems(df):
    """Convert a DataFrame to stem -> index associations

    Parameters
    ----------
    df : pd.DataFrame
        A pandas DataFrame to index

    Returns
    -------
    dict
        {stem: {set of indices}}
    """
    from collections import defaultdict
    import functools
    import nltk

    # not using nltk default as we want this to be portable so that, for
    # instance, a javascript library can query
    stemmer = nltk.PorterStemmer(nltk.PorterStemmer.MARTIN_EXTENSIONS)
    stops = frozenset(nltk.corpus.stopwords.words('english'))
    stem_f = functools.partial(stems, stops, stemmer)

    d = defaultdict(set)
    for sample, row in df.iterrows():
        for value in row.values:
            for stem in stem_f(value):
                d[stem].add(sample)
    return dict(d)
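A small usage sketch (assuming pandas and the `stems` helper sketched earlier are available; the sample data is made up):

import pandas as pd

df = pd.DataFrame({'description': ['crying infants', 'foo bar'],
                   'body_site': ['skin of the hand', 'oral cavity']},
                  index=['sample-1', 'sample-2'])
index = df_to_stems(df)
# e.g. index.get('infant') should contain 'sample-1' and index.get('bar') 'sample-2'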
def str_stemmer(s):
    return " ".join([nltk.PorterStemmer().stem(word) for word in s.lower().split()])
def setmword(word):
    return PorterStemmer().stem(word)
def tweetMeaning(self, term):
    self.dbout = self.searcher(term)
    with open("data/words.json") as filedata:
        self.wordList = json.load(filedata)
    threading.Thread(target=self.dis.spinner, args=("Analysing Tweets ",)).start()
    self.tweetList = []
    for self.i in self.dbout:
        self.procounter = 0
        self.negcounter = 0
        self.neucounter = 0
        for self.word in nltk.word_tokenize(self.i["tweet"]):
            # print("Analysing word: " + self.word)
            try:
                if nltk.PorterStemmer().stem(self.word) in self.wordList["good"]:
                    # print("Found good word")
                    self.procounter += 1
                elif nltk.PorterStemmer().stem(self.word) in self.wordList["bad"]:
                    # print("Found bad word")
                    self.negcounter += 1
                # elif nltk.PorterStemmer().stem(self.word) in self.wordList["swear"]:
                #     print("Found bad word")
                #     self.negcounter += 1
                else:
                    self.neucounter += 1
            except IndexError:
                print("Ignoring tweet:", self.i["tweet"])
        self.view = "unknown"
        if self.procounter > self.negcounter:
            self.view = "pro"
        if self.negcounter > self.procounter:
            self.view = "neg"
        self.tweetDict = {
            "id": self.i["_id"],
            "tweet": self.i["tweet"],
            "procount": self.procounter,
            "negcount": self.negcounter,
            # "view": "pro" if self.procounter > self.negcounter else "neg"
            "view": self.view
        }
        self.tweetList.append(self.tweetDict)
    self.dis.stop()
    return self.tweetList
# This method gets the poll data from the JSON file it is
# stored in, then adds the values up to get a total.
Source: mmr_summarizer.py, from project Text_Summarization-MMR_and_LexRank by syedhope
def processFile(file_name):
    # read file from provided folder path
    f = open(file_name, 'r')
    text_0 = f.read()

    # extract content in TEXT tag and remove tags
    text_1 = re.search(r"<TEXT>.*</TEXT>", text_0, re.DOTALL)
    text_1 = re.sub("<TEXT>\n", "", text_1.group(0))
    text_1 = re.sub("\n</TEXT>", "", text_1)

    # replace all types of quotations by normal quotes
    text_1 = re.sub("\n", " ", text_1)
    text_1 = re.sub("\"", "\"", text_1)
    text_1 = re.sub("''", "\"", text_1)
    text_1 = re.sub("``", "\"", text_1)
    text_1 = re.sub(" +", " ", text_1)

    # segment data into a list of sentences
    sentence_token = nltk.data.load('tokenizers/punkt/english.pickle')
    lines = sentence_token.tokenize(text_1.strip())

    # setting the stemmer
    sentences = []
    porter = nltk.PorterStemmer()

    # modelling each sentence in file as sentence object
    for line in lines:
        # original words of the sentence before stemming
        originalWords = line[:]
        line = line.strip().lower()

        # word tokenization
        sent = nltk.word_tokenize(line)

        # stemming words
        stemmedSent = [porter.stem(word) for word in sent]
        stemmedSent = list(filter(lambda x: x != '.' and x != '`' and x != ',' and x != '?' and x != "'"
                                  and x != '!' and x != '"' and x != "''" and x != "'s", stemmedSent))

        # list of sentence objects
        if stemmedSent:
            sentences.append(sentence.sentence(file_name, stemmedSent, originalWords))
    return sentences
#---------------------------------------------------------------------------------
# Description : Function to find the term frequencies of the words in the
#               sentences present in the provided document cluster
# Parameters  : sentences, sentences of the document cluster
# Return      : dictionary of word, term frequency score
#---------------------------------------------------------------------------------
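The term-frequency function described by this comment block is not included in the excerpt; a minimal sketch of what it might look like (the name `TFs` and the `getStemmedWords()` accessor on the sentence objects are assumptions) follows:

def TFs(sentences):
    # dictionary of word -> raw term frequency across all sentences in the cluster
    tfs = {}
    for sent in sentences:
        for word in sent.getStemmedWords():
            tfs[word] = tfs.get(word, 0) + 1
    return tfs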
def processFile(self, file_path_and_name):
    try:
        f = open(file_path_and_name, 'r')
        text = f.read()

        # soup = BeautifulSoup(text, "html.parser")
        # text = soup.getText()
        # text = re.sub("APW19981212.0848", "", text)
        # text = re.sub("APW19981129.0668", "", text)
        # text = re.sub("NEWSWIRE", "", text)

        text_1 = re.search(r"<TEXT>.*</TEXT>", text, re.DOTALL)
        text_1 = re.sub("<TEXT>\n", "", text_1.group(0))
        text_1 = re.sub("\n</TEXT>", "", text_1)

        # replace all types of quotations by normal quotes
        text_1 = re.sub("\n", " ", text_1)
        text_1 = re.sub(" +", " ", text_1)
        # text_1 = re.sub("\'\'", "\"", text_1)
        # text_1 = re.sub("\`\`", "\"", text_1)

        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        lines = sent_tokenizer.tokenize(text_1.strip())
        text_1 = lines

        sentences = []
        porter = nltk.PorterStemmer()

        for sent in lines:
            OG_sent = sent[:]
            sent = sent.strip().lower()
            line = nltk.word_tokenize(sent)

            stemmed_sentence = [porter.stem(word) for word in line]
            stemmed_sentence = list(filter(lambda x: x != '.' and x != '`' and x != ',' and x != '?' and x != "'"
                                           and x != '!' and x != '"' and x != "''" and x != "'s", stemmed_sentence))
            if stemmed_sentence:
                sentences.append(sentence(file_path_and_name, stemmed_sentence, OG_sent))
        return sentences

    except IOError:
        print('Oops! File not found', file_path_and_name)
        return [sentence(file_path_and_name, [], [])]
def get_features(df_features):
    # now = datetime.datetime.now()
    # print(now.strftime('%Y-%m-%d %H:%M:%S'))
    # print("matchnouns")
    # df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    # df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    # # df_features['z_noun_match'] = df_features.apply(lambda r: sum([1 for w in r.question1_nouns if w in r.question2_nouns]), axis=1)  # takes long
    # df_features['z_noun_match'] = df_features.apply(lambda r: tfidf_word_match_share(r.question1_nouns, r.question2_nouns), axis=1)

    # now = datetime.datetime.now()
    # print(now.strftime('%Y-%m-%d %H:%M:%S'))
    # print("matchverb")
    # df_features['question1_verbs'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[0] == 'V' and t[1] == 'B'])
    # df_features['question2_verbs'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[0] == 'V' and t[1] == 'B'])
    # # df_features['z_verb_match'] = df_features.apply(lambda r: sum([1 for w in r.question1_verbs if w in r.question2_verbs]), axis=1)  # takes long
    # df_features['z_verb_match'] = df_features.apply(lambda r: tfidf_word_match_share(r.question1_verbs, r.question2_verbs), axis=1)

    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print("stem_tfidf")
    df_features['q1_stem'] = df_features.question1.map(lambda x: [w for w in nltk.PorterStemmer().stem(str(x).lower()).split(' ')])
    df_features['q2_stem'] = df_features.question2.map(lambda x: [w for w in nltk.PorterStemmer().stem(str(x).lower()).split(' ')])
    # df_features['z_adj_match'] = df_features.apply(lambda r: sum([1 for w in r.question1_adjs if w in r.question2_adjs]), axis=1)  # takes long
    df_features['z_stem_tfidf'] = df_features.apply(lambda r: tfidf_word_match_share(r.q1_stem, r.q2_stem), axis=1)

    now = datetime.datetime.now()
    # print(now.strftime('%Y-%m-%d %H:%M:%S'))
    # print('w2v tfidf...')
    # df_features['z_tfidf_w2v'] = df_features.apply(lambda r: tfidf_word_match_share(r.question1.tolist(), r.question2.tolist()), axis=1)

    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('nouns...')
    df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    df_features['z_noun_match'] = df_features.apply(lambda r: sum([1 for w in r.question1_nouns if w in r.question2_nouns]), axis=1)  # takes long

    print('lengths...')
    df_features['z_len1'] = df_features.question1.map(lambda x: len(str(x)))
    df_features['z_len2'] = df_features.question2.map(lambda x: len(str(x)))
    df_features['z_word_len1'] = df_features.question1.map(lambda x: len(str(x).split()))
    df_features['z_word_len2'] = df_features.question2.map(lambda x: len(str(x).split()))

    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('difflib...')
    df_features['z_match_ratio'] = df_features.apply(lambda r: diff_ratios(r.question1, r.question2), axis=1)  # takes long

    print('word match...')
    df_features['z_word_match'] = df_features.apply(word_match_share, axis=1, raw=True)

    print('tfidf...')
    df_features['z_tfidf_sum1'] = df_features.question1.map(lambda x: np.sum(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_sum2'] = df_features.question2.map(lambda x: np.sum(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_mean1'] = df_features.question1.map(lambda x: np.mean(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_mean2'] = df_features.question2.map(lambda x: np.mean(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_len1'] = df_features.question1.map(lambda x: len(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_len2'] = df_features.question2.map(lambda x: len(tfidf.transform([str(x)]).data))
    return df_features.fillna(0.0)
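`get_features` depends on helpers defined elsewhere in the project (`tfidf_word_match_share`, `word_match_share`, `diff_ratios`, and a fitted `tfidf` vectorizer). As an illustration, `diff_ratios` is commonly built on difflib; the sketch below is a guess at the intent, not the project's actual code:

import difflib

def diff_ratios(text_a, text_b):
    # similarity ratio between two question strings via difflib's SequenceMatcher
    matcher = difflib.SequenceMatcher()
    matcher.set_seq1(str(text_a).lower())
    matcher.set_seq2(str(text_b).lower())
    return matcher.ratio()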