import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocessing(text):
    # accept bytes or str input
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    # lowercase first so that capitalized stopwords are also caught
    tokens = [word.lower() for word in tokens]
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # remove words shorter than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
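A minimal usage sketch (my own example, not from the original file; it assumes the NLTK 'punkt', 'stopwords', and 'wordnet' data have been downloaded):

# Hypothetical call; the exact output depends on the installed NLTK data.
print(preprocessing("The cats were chasing the mice in the gardens."))
# roughly: "cat chasing mouse garden"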
Example usages of Python's WordNetLemmatizer() class, collected from open-source projects
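Before the project snippets, a minimal reminder of the underlying NLTK API (my own sketch; it assumes the 'wordnet' corpus has been downloaded via nltk.download('wordnet')):

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
print(wnl.lemmatize("corpora"))           # 'corpus' (default POS is noun)
print(wnl.lemmatize("running", pos="v"))  # 'run' (the pos argument changes the result)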
readdata.py — project: Natural-Language-Processing-Python-and-NLTK, author: PacktPublishing
def test_ranker(options):
lemmatizer = WordNetLemmatizer()
words, answers, candidate_lfs = load_lf_test(options.data_dir)
r = ranker.LogLinear(options.word_dim, options.embedding_file, options.stopwords_file)
assert(os.path.exists(options.ranker_model_dir))
r.load_model(options.ranker_model_dir)
result_file = os.path.join(options.result_dir, 'test')
rf = open(result_file, 'w')
print ('testing...')
for word, answer, lf in iter_lf_test(words, answers, candidate_lfs):
lemma = [lemmatizer.lemmatize(w) for w in word]
selected = r.test(word, lemma, lf)
write_file(rf, selected[0], answer, selected[1])
rf.close()
print (getResults(result_file))
generate_ngram_pos_link.py — project: kaggle-quora-solution-8th, author: qqgeogor
def getPOSLinks(text):
wordnet_lemmatizer = WordNetLemmatizer()
text = nltk.word_tokenize(text)
pos = nltk.pos_tag(text)
links = []
link = []
active = False
for w in pos:
part = w[1]
word = w[0]
if(not active and (part[:2] == "DT" or part == "WP" or part == "VB" or part == "IN")):
active = True
if(active):
link.append(wordnet_lemmatizer.lemmatize(word))
#extract main body
if(active and (part == "PRP" or part[:2] == "NN" or part == "." )):
active = False
links.append(" ".join(link))
link = []
return links
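A hypothetical call (not from the original project; it assumes nltk plus the 'punkt', 'averaged_perceptron_tagger', and 'wordnet' data, and that WordNetLemmatizer has been imported):

# Each returned "link" starts at a determiner/WH-pronoun/verb/preposition and
# ends at the first pronoun, noun, or sentence-final period that follows.
print(getPOSLinks("What is the name of the river ?"))
# roughly: ['What is the name', 'of the river']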
def wordnet_lemmatize(word, pos='n'):
global _nltk_wordnet_lemmatizer
try:
_nltk_wordnet_lemmatizer
except NameError:
_nltk_wordnet_lemmatizer = WordNetLemmatizer()
return _nltk_wordnet_lemmatizer.lemmatize(word, penn2morphy(pos))
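The penn2morphy helper is not shown in this snippet; a plausible stand-in (my assumption, not the original implementation) maps Penn Treebank tags onto the single-letter POS codes accepted by WordNetLemmatizer.lemmatize():

def penn2morphy(penntag, default='n'):
    # Assumed helper, not part of the original file: 'NN*' -> noun, 'JJ*' -> adjective,
    # 'VB*' -> verb, 'RB*' -> adverb; anything else falls back to the default POS.
    mapping = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r'}
    return mapping.get(penntag[:2], default)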
def split_ingr(x):
wnl=WordNetLemmatizer()
cleanlist=[]
lst = x.strip('[]').split(',')
cleanlist=[' '.join(wnl.lemmatize(word.lower()) for word in word_tokenize(re.sub('[^a-zA-Z]',' ',item))) for item in lst]
return cleanlist
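A hypothetical call (assumes re, nltk's word_tokenize, and WordNetLemmatizer are imported at module level); the function expects a stringified list of ingredient phrases, e.g. as read from a CSV cell:

print(split_ingr("['2 cups diced Tomatoes', '3 cloves garlic']"))
# roughly: ['cup diced tomato', 'clove garlic']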
#remove low-information words from ingredients, could use more
def __wordnet_lemmatizer(self):
    """Initializes WordNetLemmatizer.

    Returns:
        None; the lemmatizer is stored on self.lemmatizer.
    """
    self.lemmatizer = WordNetLemmatizer()
    # Call lemmatize once so NLTK's lazily loaded WordNet corpus is read up front
    _ = self.lemmatizer.lemmatize('start')
def __init__(self):
self.WN_TAGS = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}
self.wnl = WordNetLemmatizer()
self.dictionary = enchant.Dict('en')
self.inflengine = inflect.engine()
def __init__(self):
self.WN_TAGS = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}
self.wnl = WordNetLemmatizer()
self.dictionary = enchant.Dict('en')
self.lookup_table = {}
def __tokenizeWholeCorpora(self, pathToCorpora):
    print('Start tokenizing the corpora: %s' % pathToCorpora)
    punct = re.compile('[%s]' % re.escape(string.punctuation))
    wnl = WordNetLemmatizer()
    doc_count = 0
    train_set = []
    doc_mapping = {}
    link_mapping = {}
    for f in glob(pathToCorpora + '/*'):
        with open(f, 'r') as filereader:
            article = filereader.readlines()
        try:
            link = article[0]
            title = article[1]
            text = article[2].lower()
        except IndexError:
            continue
        # Skip documents shorter than min_length
        if len(text) < self.min_length:
            continue
        text = punct.sub("", text)          # remove all punctuation
        tokens = nltk.word_tokenize(text)   # tokenize the whole text
        # Lemmatize every word and keep it only if it is not a stopword
        train_set.append([wnl.lemmatize(word) for word in tokens if word not in self.stopword])
        # Build the doc and link mappings
        doc_mapping[doc_count] = title
        link_mapping[doc_count] = link
        doc_count += 1
        if doc_count % 10000 == 0:
            print('Have processed %i documents' % doc_count)
    print('Finished tokenizing the corpora: %s' % pathToCorpora)
    return doc_count, train_set, doc_mapping, link_mapping
def preprocess( docs, stopwords, min_df = 3, min_term_length = 2, ngram_range = (1,1), apply_tfidf = True, apply_norm = True, lemmatize = False ):
"""
Preprocess a list containing text documents stored as strings.
"""
token_pattern = re.compile(r"\b\w\w+\b", re.U)
if lemmatize:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
def normalize( x ):
x = x.lower()
if lemmatize:
return wnl.lemmatize(x)
return x
def custom_tokenizer( s ):
return [normalize(x) for x in token_pattern.findall(s) if (len(x) >= min_term_length and x[0].isalpha() ) ]
# Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
if apply_norm:
norm_function = "l2"
else:
norm_function = None
tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode", tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function, min_df = min_df, ngram_range = ngram_range)
X = tfidf.fit_transform(docs)
terms = []
# store the vocabulary map
v = tfidf.vocabulary_
for i in range(len(v)):
terms.append("")
for term in v.keys():
terms[ v[term] ] = term
return (X,terms)
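A hypothetical call (assumes `from sklearn.feature_extraction.text import TfidfVectorizer` plus the NLTK WordNet data); min_df is lowered because the toy corpus is tiny:

docs = ["Cats chase mice.", "Dogs chase cats.", "Mice eat cheese, cats eat mice."]
X, terms = preprocess(docs, stopwords=["the", "a", "of"], min_df=1, lemmatize=True)
print(X.shape)   # (3, number_of_distinct_terms)
print(terms)     # roughly ['cat', 'chase', 'cheese', 'dog', 'eat', 'mouse']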
def __init__(self):
self.model = WordNetLemmatizer()
moods_dictionary_creation.py — project: LyricsMoodClassifier, author: valeriaalampi
def get_lemma(word):
l = WordNetLemmatizer()
return l.lemmatize(word)
lyrics_tokenization.py — project: LyricsMoodClassifier, author: valeriaalampi
def simple_lemmatizing(tokens):
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(w) for w in tokens]
return lemmatized_tokens
def __lemmatize(self, lemma):
    """
    Internal method that applies nltk.stem.WordNetLemmatizer() to the (word, pos) pair lemma.
    """
    string, tag = lemma
    if tag in ('a', 'n', 'r', 'v'):
        wnl = WordNetLemmatizer()
        string = wnl.lemmatize(string, tag)
    return (string, tag)
######################################################################
# POSITIONING.
def tokenizer(document):
    """
    input: a string
    output: a list of strings
    Converts a string into tokens by performing the following steps:
    1. eliminates non-alphabetical characters
    2. converts to lower case
    3. splits into tokens
    4. lemmatizes each token using nltk.stem.WordNetLemmatizer (via the module-level `lemmatizer` callable)
    """
    text = re.sub('[^a-zA-Z]', ' ', document)
    tokens = text.lower().split()
    tokens = [lemmatizer(tkn) for tkn in tokens]
    return tokens
def __wn_lemmatize(self, lemma):
    """
    Lemmatize lemma using nltk.stem.WordNetLemmatizer(). Always
    returns a (string, pos) pair. Lemmatizes even when the tag
    isn't helpful, by ignoring it for stemming.
    """
    string, tag = lemma
    wnl = WordNetLemmatizer()
    if tag in ('a', 'n', 'r', 'v'):
        string = wnl.lemmatize(string, tag)
    else:
        string = wnl.lemmatize(string)
    return (string, tag)
remove_stopwords_nltk.py — project: review-classification, author: vishnupriyam
def clean_review(review,stopwords):
result = ""
lemmatizer = WordNetLemmatizer()
for word in review:
#converts the word to its lemma form
word = lemmatizer.lemmatize(word)
#adds the word to the resultant review only if its not a stopword
if word not in stopwords:
#removes all non-alphabet characters
word = re.sub('[^A-Za-z ]','',word)
if(len(word) != 0):
result += word+" "
return result
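A hypothetical call: `review` is expected to be an iterable of lowercase tokens and `stopwords` a collection of stopword strings.

print(clean_review(["the", "movies", "were", "amazing", "!"], {"the", "were"}))
# roughly: "movie amazing "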
def lemmatizer(text):
    '''Description: takes a string of descriptions and returns the string with every word lemmatized.
    Parameters: string of descriptions
    Output: string with all words lemmatized (e.g. "meeting" stays "meeting" as a noun but becomes "meet" as a verb)'''
    lemmatizer = WordNetLemmatizer()
    lis = str(text).split(" ")
    lemm_words = [lemmatizer.lemmatize(word) for word in lis]
    return " ".join(lemm_words)
def _lemma_(token):
    # Plain strings go straight to the stemmer (the original Python 2 code
    # also checked for `unicode`, which no longer exists in Python 3).
    if isinstance(token, str):
        return _stem_(token)
    from nltk.corpus import wordnet

    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    p = get_wordnet_pos(token.pos()[0][1])
    if p != wordnet.VERB:
        return _stem_(token[0])
    return wordnet_lemmatizer.lemmatize(token[0], pos=p)
def train_ranker(options):
lemmatizer = WordNetLemmatizer()
words, answers, good_lfs, bad_lfs = load_lf_train(options.data_dir)
r = ranker.LogLinear(options.word_dim, options.embedding_file, options.stopwords_file)
trainer = optimizers[options.optimizer](r.model)
sents = 0
total_loss = 0.0
train_size = len(words)
i = 0
for epoch in range(options.epochs):
for word, answer, good_lf, bad_lf in iter_lf_train(words, answers, good_lfs, bad_lfs):
if len(good_lf) == 0:
continue
lemma = [lemmatizer.lemmatize(w) for w in word]
loss = r.train(word, lemma, good_lf, bad_lf)
sents += 1
if loss is not None:
total_loss += loss.scalar_value()
loss.backward()
trainer.update()
e = float(i) / train_size
if i % options.print_every == 0:
print('epoch {}: loss per sentence: {}'.format(e, total_loss / sents))
sents = 0
total_loss = 0.0
i += 1
print ('saving model...')
save_as = '%s/epoch%03d.ranker' % (options.result_dir, epoch)
r.save_model(save_as)
def find_match_word(hash_content, wordlist):
split_words = []
while len(hash_content) !=0:
#return the index of the matched word
word, index = check_match(hash_content,wordlist)
split_words.append(word)
#remove the matched words from the original tokens
hash_content = hash_content[len(hash_content)*(-1):index]
return split_words
#use WordNetLemmatizer to lemmatize the word
def text_clean(filename):
    '''
    Input: file path of a script.
    Output: list of all words in the script, lowercased and lemmatized.
    '''
    wnl = WordNetLemmatizer()
    with open(filename, 'r', encoding='utf-8', errors='ignore') as infile:
        word_list = [word for line in infile for word in line.split()]
    lemma_list = [wnl.lemmatize(word.lower()) for word in word_list]
    return lemma_list
def preprocess(raw):
    # Initialize tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # Initialize lemmatizer
    lemma = WordNetLemmatizer()
    # Create English stop word list
    en_stop = get_stop_words('en')
    # Decode Wiki markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)
    # Clean and tokenize document string
    text = text.lower()
    tokens = tokenizer.tokenize(text)
    # Remove stop words from tokens
    tokens = [i for i in tokens if i not in en_stop]
    # Lemmatize tokens
    tokens = [lemma.lemmatize(i) for i in tokens]
    # Remove non-alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]
    # Drop tokens shorter than three characters
    tokens = [i for i in tokens if len(i) > 2]
    return tokens
def preprocess_imageclef(raw):
    # Initialize tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # Initialize lemmatizer
    lemma = WordNetLemmatizer()
    # Create English stop word list
    en_stop = get_stop_words('en')
    # Decode Wiki markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)
    # Clean and tokenize document string
    text = text.lower()
    tokens = tokenizer.tokenize(text)
    # Remove stop words from tokens
    tokens = [i for i in tokens if i not in en_stop]
    # Lemmatize tokens
    tokens = [lemma.lemmatize(i) for i in tokens]
    # Remove non-alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]
    # Drop tokens shorter than three characters
    tokens = [i for i in tokens if len(i) > 2]
    return (tokens, text)
def preprocess_wikidata(raw):
    # Initialize tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # Initialize lemmatizer
    lemma = WordNetLemmatizer()
    # Create English stop word list
    en_stop = get_stop_words('en')
    # Decode Wiki markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)
    # Clean and tokenize the document string, keeping only the part before the first '../img/' link
    text = text.lower().split('../img/')[0]
    tokens = tokenizer.tokenize(text)
    # Remove stop words from tokens
    tokens = [i for i in tokens if i not in en_stop]
    # Lemmatize tokens
    tokens = [lemma.lemmatize(i) for i in tokens]
    # Remove non-alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]
    # Drop tokens shorter than three characters
    tokens = [i for i in tokens if len(i) > 2]
    return (tokens, text)
Chapter 05_KNN n Naive Bayes.py — project: Statistics-for-Machine-Learning, author: PacktPublishing
def preprocessing(text):
text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
tokens = [word for sent in nltk.sent_tokenize(text2) for word in
nltk.word_tokenize(sent)]
tokens = [word.lower() for word in tokens]
stopwds = stopwords.words('english')
tokens = [token for token in tokens if token not in stopwds]
tokens = [word for word in tokens if len(word)>=3]
stemmer = PorterStemmer()
tokens = [stemmer.stem(word) for word in tokens]
tagged_corpus = pos_tag(tokens)
Noun_tags = ['NN','NNP','NNPS','NNS']
Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']
lemmatizer = WordNetLemmatizer()
def prat_lemmatize(token,tag):
if tag in Noun_tags:
return lemmatizer.lemmatize(token,'n')
elif tag in Verb_tags:
return lemmatizer.lemmatize(token,'v')
else:
return lemmatizer.lemmatize(token,'n')
pre_proc_text = " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])
return pre_proc_text
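A hypothetical call (assumes nltk, string, stopwords, pos_tag, PorterStemmer, and WordNetLemmatizer are imported as in the rest of the chapter, and that the relevant NLTK data are downloaded):

print(preprocessing("The children were playing games in the park!"))
# roughly: "child play game park" (exact output depends on the POS tagger)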
def clean_terms(terms, stopwords=None, lemmatize=None, stem=None, only_N_J=None):
    if stopwords is not None:
        terms = [t for t in terms if t not in stopwords]
    if only_N_J is not None:  # keep only terms whose POS tag is in the module-level `tags` collection
        tagged = nltk.pos_tag(terms)
        terms = [t for t, pos in tagged if pos in tags]
    if lemmatize is not None:
        lem = WordNetLemmatizer()
        terms = [lem.lemmatize(t) for t in terms]
    if stem is not None:
        stem = PorterStemmer()
        terms = [stem.stem(t) for t in terms]
    return terms
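A hypothetical call (assumes nltk, WordNetLemmatizer, and PorterStemmer are imported; the only_N_J branch additionally relies on a module-level `tags` collection that is not shown here):

print(clean_terms(["studies", "showed", "better", "results"], lemmatize=True))
# roughly: ['study', 'showed', 'better', 'result']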
def __init__(self):
"""
Initialize members:
question_dist - generalized-question distribution of the assigned extraction
location.
"""
self.question_dist = defaultdict(lambda : defaultdict(lambda : 0))
self.lmtzr = WordNetLemmatizer()
def lemmatize(text):
lemmatizer = WordNetLemmatizer()
return ' '.join(lemmatizer.lemmatize(word) for word in text.split())
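A hypothetical call; with the default (noun) POS, only noun inflections change:

print(lemmatize("the cats were chasing mice"))
# roughly: "the cat were chasing mouse"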