def get_list_1(need_tweet_list):
    need_res_set = []
    for i in need_tweet_list:
        for j in i.split():
            if stemmer.stem(j.lower()) not in out_stem_list:
                need_res_set.append(j.lower())
    return list(set(need_res_set))
def get_list_2(need_tweet_list):
    need_res_set = []
    for i in need_tweet_list:
        for j in i.split():
            if stem2.stem(j.lower()) not in lanc_stem_list:
                need_res_set.append(j.lower())
    return list(set(need_res_set))
def get_set_1(need_tweet_list):
    need_res_set = set()
    for i in need_tweet_list:
        for j in i.split():
            if stemmer.stem(j.lower()) not in out_stem_list:
                need_res_set.add(stemmer.stem(j.lower()))
    return need_res_set
def resource_similarity_score_via_exact_word_match_1(need_res_set, offer_tweet_list):
    if len(need_res_set) == 0:
        return 0
    offer_res_set = set()
    for i in offer_tweet_list:
        for j in i.split():
            if j not in out_stem_list:
                offer_res_set.add(stemmer.stem(j.lower()))
    return len(offer_res_set & need_res_set) / len(need_res_set)
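A hedged usage sketch for the four snippets above: they rely on module-level stemmer, stem2, out_stem_list and lanc_stem_list objects that are not shown here, so the setup below is an assumption (NLTK stemmers over stemmed English stopwords), not the original project's code.

# assumed setup, not part of the original source
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.corpus import stopwords

stemmer = PorterStemmer()
stem2 = LancasterStemmer()
out_stem_list = {stemmer.stem(w) for w in stopwords.words('english')}
lanc_stem_list = {stem2.stem(w) for w in stopwords.words('english')}

need = get_set_1(["need drinking water and food urgently"])
offer = ["we can offer bottled water and canned food"]
print(resource_similarity_score_via_exact_word_match_1(need, offer))
# fraction of needed stems that also appear (stemmed) in the offer tweets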
def __eq__(self, other):
    return self.stem == other.stem

def __hash__(self):
    return hash(self.stem)
def __init__(self, stemmer=None):
    '''
    @param stemmer: an object or module with a 'stem' method (defaults to
        NLTK's PorterStemmer)
    @returns: a new L{Stemmer} object
    '''
    if not stemmer:
        from nltk.stem import PorterStemmer
        stemmer = PorterStemmer()
    self.stemmer = stemmer
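A minimal sketch of what the no-argument default above falls back to (NLTK's PorterStemmer; the enclosing class, which the docstring suggests is called Stemmer, is not shown here):

from nltk.stem import PorterStemmer

default = PorterStemmer()
print(default.stem("Cats"))     # -> 'cat' (NLTK's stem() lowercases by default)
print(default.stem("running"))  # -> 'run'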
def rate_tags(self, tags):
    '''
    @param tags: a list of tags to be assigned a rating
    '''
    term_count = collections.Counter(tags)
    for t in tags:
        # rating of a single tag is term frequency * weight
        t.rating = term_count[t] / len(tags) * self.weights.get(t.stem, 1.0)
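A hedged illustration of the rating formula above. The Tag class and the weight table here are minimal stand-ins (assumptions; the original project's Tag carries more fields), reusing the __eq__/__hash__ shown earlier so equal stems share one counter:

import collections

class Tag:
    def __init__(self, stem):
        self.stem = stem
        self.rating = 0.0
    def __eq__(self, other):
        return self.stem == other.stem
    def __hash__(self):
        return hash(self.stem)

tags = [Tag('python'), Tag('python'), Tag('code')]
weights = {'python': 1.5}            # assumed weight table
term_count = collections.Counter(tags)
for t in tags:
    # term frequency * weight, as in rate_tags above
    t.rating = term_count[t] / len(tags) * weights.get(t.stem, 1.0)
print(tags[0].rating)  # 2/3 * 1.5 = 1.0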
def review_to_words(review):
    if isinstance(review, float):
        review = str(review).encode("utf-8")
    letters_only = re.sub(r"\W+", " ", review, flags=re.UNICODE)
    words = letters_only.lower().split()
    # nltk.data.path.append('./nltk_data/')
    # stops = set(nltk.corpus.stopwords.words("portuguese"))
    meaningful_words = words  # [w for w in words if not w in stops]
    # stemmer = RSLPStemmer()
    meaningful_stemmed = meaningful_words  # [stemmer.stem(w) for w in meaningful_words]
    return " ".join(meaningful_stemmed)
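A hedged usage sketch for review_to_words (it assumes re is imported at module level; the stopword and stemming steps are commented out above, so only punctuation removal and lowercasing take effect):

import re

print(review_to_words("Ótimo produto, chegou rápido!!!"))
# -> roughly 'ótimo produto chegou rápido'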
Chapter 05_KNN n Naive Bayes.py (project: Statistics-for-Machine-Learning, author: PacktPublishing)
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
    tokens = [word for sent in nltk.sent_tokenize(text2)
              for word in nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word) >= 3]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    tagged_corpus = pos_tag(tokens)
    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])
    return pre_proc_text
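A hedged usage sketch for preprocessing(). It assumes the module-level imports the snippet relies on (string, nltk, stopwords, PorterStemmer, pos_tag, WordNetLemmatizer) and that the required NLTK data packages (punkt, stopwords, the POS tagger, wordnet) have been downloaded:

import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

print(preprocessing("The cats were running quickly through the gardens!"))
# -> roughly 'cat run quickli garden' (stopwords dropped, stemmed, then lemmatized)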
def clean_terms(terms, stopwords=None, lemmatize=None, stem=None, only_N_J=None):
    if stopwords is not None:
        terms = [t for t in terms if t not in stopwords]
    if only_N_J is not None:  # keep only nouns and adjectives (POS whitelist in the module-level 'tags')
        tagged = nltk.pos_tag(terms)
        terms = [t for t, pos in tagged if pos in tags]
    if lemmatize is not None:
        lem = WordNetLemmatizer()
        terms = [lem.lemmatize(t) for t in terms]
    if stem is not None:
        stem = PorterStemmer()
        terms = [stem.stem(t) for t in terms]
    return terms
def extract_terms_from_file(file_location, stopwords=None, lemmatize=None, stem=None, only_N_J=None):
    with open(file_location, 'r', encoding='iso-8859-1') as doc:
        terms = []
        for line in doc:
            terms.extend(re.compile(r'\w+').findall(line.lower()))
        # terms = re.compile(r'\w+').findall(doc
        #                                    .read()
        #                                    .replace('\n', '')
        #                                    .lower())
    return clean_terms(terms, stopwords, lemmatize, stem, only_N_J)
def extract_terms_from_sentence(sentence, stopwords=None, lemmatize=None, stem=None, only_N_J=None):
    terms = re.compile(r'\w+').findall(sentence.lower())
    return clean_terms(terms, stopwords, lemmatize, stem, only_N_J)
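A hedged usage sketch for the two extractors above (they assume module-level re, nltk, WordNetLemmatizer and PorterStemmer imports, plus a 'tags' POS whitelist when only_N_J is used; the call below leaves only_N_J unset):

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

terms = extract_terms_from_sentence("Dogs were barking loudly in the park",
                                    stopwords=set(stopwords.words('english')),
                                    lemmatize=True, stem=True)
print(terms)  # -> roughly ['dog', 'bark', 'loudli', 'park']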
def addToSentenceScore(question, sentence):
    score = 0
    questionSet = set()
    for item in question.split():
        questionSet.add(morpher.stem(item.replace("?", "")))
    sentenceSet = set()
    for item in sentence.split():
        sentenceSet.add(morpher.stem(item.replace("?", "")))
    jaccard = float(len(questionSet.intersection(sentenceSet))) / float(len(questionSet.union(sentenceSet)))
    common = ' '.join(sentenceSet.intersection(questionSet))
    tagCommon = nltk.pos_tag(nltk.word_tokenize(common))
    if tagCommon:
        for item in tagCommon:
            if 'VB' in item[1]:
                score += 6
            else:
                score += 3
    # Add sentence and score to a hashmap
    sentenceScore[sentence] = score + (jaccard * 10)
    return score
# PARSER TO TOKENIZE, REMOVE STOP WORDS, MORPHOLOGY, ADD TO SET
def parser(line):
    tokLine = nltk.word_tokenize(line)
    keywords = list(set(tokLine) - set(stopwords))
    lineSet = set()
    for item in keywords:
        lineSet.add(morpher.stem(item.replace("?", "")))
    return lineSet
# WORD MATCH
def worMatch(question, sentence):
    score = 0
    questionSet = set()
    for item in question.split():
        questionSet.add(morpher.stem(item.replace("?", "")))
    sentenceSet = set()
    for item in sentence.split():
        sentenceSet.add(morpher.stem(item.replace("?", "")))
    jaccard = float(len(questionSet.intersection(sentenceSet))) / float(len(questionSet.union(sentenceSet)))
    common = ' '.join(sentenceSet & questionSet)
    tagCommon = nltk.pos_tag(nltk.word_tokenize(common))
    if tagCommon:
        for item in tagCommon:
            if 'VB' in item[1]:
                score += 6
            else:
                score += 3
    return score + (jaccard * 10)
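A hedged setup/usage sketch: addToSentenceScore, parser and worMatch above depend on module-level morpher, stopwords and sentenceScore objects that are not shown, and on NLTK's tokenizer and POS tagger data. The bindings below are assumptions, not the original project's code.

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords as nltk_stopwords

morpher = PorterStemmer()                     # assumed stemmer
stopwords = nltk_stopwords.words('english')   # assumed stopword list
sentenceScore = {}                            # assumed sentence -> score map

print(worMatch("Who discovered penicillin?",
               "Penicillin was discovered by Alexander Fleming."))
# higher values mean more stemmed-word overlap; shared verbs are weighted more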
# GET INPUT FILE NAME
def steam_words(self, word):
    ps_obj = PorterStemmer()  # create the Porter stemmer
    steamed_word = ps_obj.stem(word)
    return steamed_word  # return the stemmed word to the main file
# Natural language: displaying sentences.
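A short hedged note on steam_words above: despite the "steam" spelling in its name, it simply applies NLTK's Porter stemmer, e.g.:

from nltk.stem import PorterStemmer

print(PorterStemmer().stem("running"))    # -> 'run'
print(PorterStemmer().stem("happiness"))  # -> 'happi'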
def CleanReVerb(self):
    fin_seed = open('../file/seed_ReVerb.txt', 'r')
    fout_seed = open('../file/seed_ReVerb_clean.txt', 'w+')
    fin_signature = open('../file/signature_ReVerb.txt', 'r')
    fout_signature = open('../file/signature_ReVerb_clean.txt', 'w+')
    while True:
        line = fin_seed.readline()
        if line:
            if '***' in line:
                fout_seed.write(line)
            else:
                mark, line = line.split(':', 1)
                line = self.CleanStopWords(line)  # remove stop words
                # stem each remaining word
                line = line.split()
                word_list = []
                s = nltk.stem.SnowballStemmer('english')
                for w in line:
                    w = s.stem(w)
                    word_list.append(w)
                if len(word_list) > 0:
                    line = ' '.join(word_list)
                    fout_seed.write(mark + ':' + line + '\n')
        else:
            break
    while True:
        line = fin_signature.readline()
        if line:
            if '***' in line:
                fout_signature.write(line)
            else:
                mark, line = line.split(':', 1)
                line = self.CleanStopWords(line)  # remove stop words
                # stem each remaining word
                line = line.split()
                word_list = []
                s = nltk.stem.SnowballStemmer('english')
                for w in line:
                    w = s.stem(w)
                    word_list.append(w)
                if len(word_list) > 0:
                    line = ' '.join(word_list)
                    fout_signature.write(mark + ':' + line + '\n')
        else:
            break
    fin_seed.close()
    fout_seed.close()
    fin_signature.close()
    fout_signature.close()
def __call__(self, tags):
    '''
    @param tags: a list of (preferably stemmed) tags
    @returns: a list of unique (multi)tags sorted by relevance
    '''
    # print tags
    self.rate_tags(tags)
    multitags = self.create_multitags(tags)
    # keep most frequent version of each tag
    clusters = collections.defaultdict(collections.Counter)
    proper = collections.defaultdict(int)
    ratings = collections.defaultdict(float)
    for t in multitags:
        clusters[t][t.string] += 1
        if t.proper:
            proper[t] += 1
            ratings[t] = max(ratings[t], t.rating)
    term_count = collections.Counter(multitags)
    for t, cnt in term_count.iteritems():
        t.string = clusters[t].most_common(1)[0][0]
        proper_freq = proper[t] / cnt
        if proper_freq >= 0.5:
            t.proper = True
            t.rating = ratings[t]
    # purge duplicates, one-character tags and stopwords
    unique_tags = set(t for t in term_count
                      if len(t.string) > 1 and t.rating > 0.0)
    # remove redundant tags
    for t, cnt in term_count.iteritems():
        words = t.stem.split()
        for l in xrange(1, len(words)):
            for i in xrange(len(words) - l + 1):
                s = Tag(' '.join(words[i:i + l]))
                relative_freq = cnt / term_count[s]
                if ((relative_freq == 1.0 and t.proper) or
                        (relative_freq >= 0.5 and t.rating > 0.0)):
                    unique_tags.discard(s)
                else:
                    unique_tags.discard(t)
    # print unique_tags
    return sorted(unique_tags)
def docs_to_networkx(dataset, cats, window_size=2, vocabulary_creation=True):
    ds = './datasets/%s/' % dataset
    Gs = []
    labels = []
    type_ = 2
    vocab_creation = vocabulary_creation
    words = []  # for vocabulary

    for doc in os.listdir(ds):
        if 'train.txt' in doc:
            type_ = 1

    if type_ == 1:
        if os.path.exists("ds/vocab.txt"):
            vocab_creation = False
        with open(ds + '/train.txt', 'r', encoding='iso-8859-1') as doc:
            dc = 1
            for line in doc:
                label = line[0]
                labels.append(label)
                terms = extract_terms_from_sentence(line[1:],
                                                    stopwords=stopwords.words('english'),
                                                    lemmatize=True,
                                                    stem=True,
                                                    only_N_J=True)
                if vocab_creation:
                    words.extend(terms)
                graph = terms_to_graph(terms, window_size)
                G = graph_to_networkx(graph, name=label + '_' + str(dc))
                # G = nx.convert_node_labels_to_integers(G, first_label=1, label_attribute='label')
                nx.set_node_attributes(G, 'label', dict(zip(G.nodes(), G.nodes())))
                Gs.append(G)
                dc += 1
    else:
        if os.path.exists("ds/vocab.txt"):
            vocab_creation = False
        for cat in cats.keys():
            for doc in os.listdir(ds + cat):
                terms = extract_terms_from_file(ds + cat + '/' + doc,
                                                stopwords=stopwords.words('english'),
                                                lemmatize=True,
                                                stem=True,
                                                only_N_J=True)
                if vocab_creation:
                    words.extend(terms)
                graph = terms_to_graph(terms, window_size)
                G = graph_to_networkx(graph, name=cat + doc.split('.')[0])
                # G = nx.convert_node_labels_to_integers(G, first_label=1, label_attribute='label')
                nx.set_node_attributes(G, name='label', values=dict(zip(G.nodes(), G.nodes())))
                Gs.append(G)
                labels.append(cats[cat])

    if vocab_creation:
        vocab = dict(Counter(words))
        create_vocabulary_file(fname, vocab)

    return Gs, labels
# needs fix or discard