def generate_vocabulary(self, review_summary_file):
    """
    :param review_summary_file: path to a CSV file of (review, summary) pairs
    :return: None; builds the word <-> index vocabulary maps
    """
    self.rev_sum_pair = pd.read_csv(review_summary_file, header=0).values
    for review, summary in self.rev_sum_pair:
        rev_lst = wordpunct_tokenize(review)
        sum_lst = wordpunct_tokenize(summary)
        self.__add_list_to_dict(rev_lst)
        self.__add_list_to_dict(sum_lst)
    # Store the "" empty string as the last word of the vocabulary
    empty_idx = len(self.map)
    self.map[""] = empty_idx
    self.revmap[empty_idx] = ""
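For orientation, here is a minimal standalone sketch (toy dicts standing in for self.map and self.revmap; the names are illustrative) of how the two maps round-trip between tokens and integer ids:

from nltk.tokenize import wordpunct_tokenize

# toy vocabulary in the same shape as self.map / self.revmap
word_to_id = {"good": 0, "movie": 1, "!": 2, "": 3}
id_to_word = {i: w for w, i in word_to_id.items()}

ids = [word_to_id[t] for t in wordpunct_tokenize("good movie!")]   # [0, 1, 2]
restored = [id_to_word[i] for i in ids]                            # ['good', 'movie', '!']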
Python wordpunct_tokenize() usage examples
def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    if isinstance(text, bytes):
        text = text.decode(encoding='UTF-8', errors='ignore')
    # replace punctuation with spaces, then split into word/punctuation tokens
    stemmer = EnglishStemmer() if stem else None
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ', text)):
        if not token.isdigit() and token not in stop_words:
            if stemmer:
                try:
                    w = stemmer.stem(token)
                except Exception:
                    w = token
            else:
                w = token
            words.append(w)
    return words
    # Equivalent one-liner:
    # return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
    #     re.sub('[%s]' % re.escape(string.punctuation), ' ', text)) if
    #     not token.isdigit() and token not in stop_words]
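For reference, wordpunct_tokenize splits text with the regex \w+|[^\w\s]+, so runs of punctuation become their own tokens; a quick illustrative call (expected output shown as a comment):

from nltk.tokenize import wordpunct_tokenize

print(wordpunct_tokenize("Don't panic, it's 2024!"))
# ['Don', "'", 't', 'panic', ',', 'it', "'", 's', '2024', '!']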
def tokenize(directory):
    full_content = ''
    for _file in os.listdir(directory):
        #disp_count = 5
        with open(directory + _file, 'r') as f:
            contents = f.readlines()
            for item in contents:
                try:
                    sentence = item.split('\t')[1].strip()
                    full_content += sentence
                except IndexError:
                    continue
                # if np.random.binomial(1,0.1):
                #     print sentence
                #     time.sleep(2)
                #     disp_count -= 1
                #     if not disp_count:
                #         print '*'*100
                #         break
                # else:
                #     print '#'
    return wordpunct_tokenize(full_content.lower())
def load_unannotated_file(filepath='test.txt', nb_instances=None, tokenized_input=False):
    if tokenized_input:
        instances = []
        for line in codecs.open(filepath, 'r', 'utf8'):
            line = line.strip()
            if line:
                instances.append(line)
                if nb_instances:
                    nb_instances -= 1
                    if nb_instances <= 0:
                        break
        return instances
    else:
        from nltk.tokenize import wordpunct_tokenize
        W = re.compile(r'\s+')
        with codecs.open(filepath, 'r', 'utf8') as f:
            # collapse whitespace runs to single spaces before tokenizing
            text = W.sub(' ', f.read())
        tokens = wordpunct_tokenize(text)
        if nb_instances:
            return tokens[:nb_instances]
        else:
            return tokens
def _extract_tokens(self, file_text):
    """Extract tokens from a file and return a Counter dictionary.

    This method is designed specifically so that it can be overridden
    easily while maintaining _get_file_tokens and _get_dir_tokens.
    """
    token_dict = collections.Counter()
    # does a simple word and punctuation tokenization on the text
    tokens = wordpunct_tokenize(file_text)
    for token in tokens:
        token_dict[token] += 1
    return token_dict
def _extract_tokens(self, file_text):
    """Extract tokens from a Babel file and return a Counter dictionary."""
    token_dict = collections.Counter()
    # capture the text that follows each [start.end] time-stamp line
    regex = re.compile(r'\[\d*\.\d*\]\n(.*)')
    matches = regex.findall(file_text)
    tokens = set()
    for match in matches:
        wp_tokenized = wordpunct_tokenize(match)
        tokens.update(wp_tokenized)
    for token in tokens:
        token_dict[token] += 1
    return token_dict
def _get_revision_word_dist(self, page_title, revid):
    """Return the (cached) normalized word distribution for one revision of a page."""
    revids_to_word_dist = self.ctitle_to_revids_to_word_dist[page_title]
    if revid in revids_to_word_dist:
        return revids_to_word_dist[revid]
    text = self._get_revision_text(page_title, revid)
    text = [word.lower() for word in wordpunct_tokenize(text)
            if word.lower() not in STOPWORDS and word.lower() not in PUNCTUATION]
    pdist = StatsCounter(text).normalize()
    revids_to_word_dist[revid] = pdist
    return pdist
def tokenize(text):
    """
    :param text: a paragraph string
    :return: a tuple (list of words, number of words)
    """
    try:
        try:
            txt = unicode(text, 'utf-8')  # py2
        except NameError:
            txt = text  # py3
        words = wordpunct_tokenize(txt)
        length = len(words)
    except TypeError:
        words, length = ['NA'], 0
    return words, length
def augment(texts, dic_thes):
    if prm.aug < 2:
        return texts
    out = []
    for text in texts:
        words_orig = wordpunct_tokenize(text)
        # define how many words will be replaced; for now, cap at 10% of the words
        maxrep = max(2, int(0.1 * len(words_orig)))
        for j in range(prm.aug):
            words = list(words_orig)  # copy
            for k in range(randint(1, maxrep)):
                idx = randint(0, len(words) - 1)
                word = words[idx]
                if word in dic_thes:
                    # choose the synonym based on a geometric distribution
                    synonym = min(np.random.geometric(0.5), len(dic_thes[word]) - 1)
                    #print 'fp',fp,"word", word,"synonym",dic_thes[word][synonym]
                    words[idx] = dic_thes[word][synonym]
            out.append(" ".join(words))
    return out
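The geometric draw above biases replacement toward the first thesaurus entries; a small self-contained illustration of just that selection step (the thesaurus dict below is made up):

import numpy as np

dic_thes = {"big": ["large", "huge", "enormous", "gigantic"]}  # toy thesaurus
word = "big"
# np.random.geometric(0.5) yields 1, 2, 3, ... with halving probability,
# so (as in the code above) index 0 is effectively skipped and the draw is
# clamped to the last available synonym
synonym = min(np.random.geometric(0.5), len(dic_thes[word]) - 1)
print(dic_thes[word][synonym])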
def __init__(self, lines):
    self.lookup = {}
    self.max_len = 0
    ensure_package_path()
    from nltk.tokenize import wordpunct_tokenize as tokenize
    for line in lines:
        word_data = json.loads(line)
        # capture both positive and negative, choose one at scoring time
        pos_score, neg_score = word_data['pos'], word_data['neg']
        terms = [word_data['word']]
        # TODO: make the sentiment scorer configurable
        if 'word_ar' in word_data:
            terms.append(word_data['word_ar'])
        if 'word_ur' in word_data:
            terms.append(word_data['word_ur'])
        for term in terms:
            # if a score already exists for a term, keep the least neutral value
            existing_scores = (0., 0.)
            if term in self.lookup:
                existing_scores = self.lookup[term]
            self.lookup[term] = (max(pos_score, existing_scores[0]), max(neg_score, existing_scores[1]))
            # update the maximum token length to check
            self.max_len = max(len(tokenize(term)), self.max_len)
def extract_keywords(sentence, keywords):
    # check if there are keywords for the sentence language
    language = sentence['Language']
    if language in keywords:
        languageKeywords = keywords[language]
        keywordMatches = []
        if languageKeywords is not None:
            message = sentence['Sentence']
            # check each keyword's regex against the raw sentence text
            for keyword in sorted(languageKeywords):
                keywordRegex = languageKeywords[keyword]
                if keywordRegex.search(message):
                    # if match, add keyword canonical form to list
                    keywordMatches.append(keyword)
        sentence['Keywords'] = keywordMatches
    return sentence
def parseDocument(doc, vocab):
    """Map a document onto the vocabulary; returns parallel lists of word ids and counts."""
    doc = doc.lower()
    tokens = wordpunct_tokenize(doc)
    dictionary = dict()
    for word in tokens:
        if word in vocab:
            wordtk = vocab[word]
            if wordtk not in dictionary:
                dictionary[wordtk] = 1
            else:
                dictionary[wordtk] += 1
    # return plain lists (dict views are not subscriptable in Python 3)
    return list(dictionary.keys()), list(dictionary.values())
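A quick usage sketch, assuming the function above and its wordpunct_tokenize import are in scope (the vocabulary dict is a toy example; ordering follows first occurrence on CPython 3.7+):

vocab = {"the": 0, "cat": 1, "sat": 2}   # toy word -> id mapping
ids, counts = parseDocument("The cat sat on the mat. The cat.", vocab)
print(ids, counts)   # [0, 1, 2] [3, 2, 1]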
def __generate_tensor(self, is_review, reverse=False):
    """
    :param is_review: True to build the review tensor, False for the summary tensor
    :param reverse: if True, reverse each word-index sequence (backward form)
    :return: a 2-D numpy array of shape (num_pairs, seq_length)
    """
    seq_length = self.review_max_words if is_review else self.summary_max_words
    total_rev_summary_pairs = self.rev_sum_pair.shape[0]
    data_tensor = np.zeros([total_rev_summary_pairs, seq_length])
    sample = self.rev_sum_pair[:, 0] if is_review else self.rev_sum_pair[:, 1]
    for index, entry in enumerate(sample.tolist()):
        index_lst = np.array([self.map[word.lower()] for word in wordpunct_tokenize(entry)])
        # reverse if we want the backward form
        if reverse:
            index_lst = index_lst[::-1]
        # pad with zeros up to seq_length, or truncate
        if len(index_lst) <= seq_length:
            index_lst = np.lib.pad(index_lst, (0, seq_length - index_lst.size), 'constant', constant_values=(0, 0))
        else:
            index_lst = index_lst[0:seq_length]
        data_tensor[index] = index_lst
    return data_tensor
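The pad-or-truncate step is the part that forces variable-length index lists into fixed-width tensor rows; a standalone NumPy sketch of just that step:

import numpy as np

seq_length = 6
index_lst = np.array([4, 8, 15])

if len(index_lst) <= seq_length:
    # right-pad with zeros up to seq_length
    row = np.pad(index_lst, (0, seq_length - index_lst.size), 'constant', constant_values=0)
else:
    # keep only the first seq_length indices
    row = index_lst[:seq_length]

print(row)   # [ 4  8 15  0  0  0]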
def tiny_tokenize_xml(text, stem=False, stop_words=[]):
    # strip non-ASCII characters, replace punctuation with spaces, then tokenize
    text = text.encode('ascii', errors='ignore').decode('ascii')
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    return [EnglishStemmer().stem(token) if stem else token
            for token in wordpunct_tokenize(text)
            if not token.isdigit() and token not in stop_words]
def top_tokens(text):
    freq_dict = defaultdict(int)
    tokens = wordpunct_tokenize(text)
    for token in tokens:
        freq_dict[token] += 1
    return sorted(freq_dict, key=freq_dict.get, reverse=True)
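The same ranking can be written with collections.Counter; a minimal equivalent sketch:

from collections import Counter
from nltk.tokenize import wordpunct_tokenize

text = "to be or not to be"
ranked = [tok for tok, _ in Counter(wordpunct_tokenize(text)).most_common()]
print(ranked)   # ['to', 'be', 'or', 'not']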
def wikipediaAction(message):
    """Makes the appropriate calls to the wikipedia API to answer wiki queries.

    Args:
        message: An incoming text message
    Returns:
        A message indicating what action was taken with the wikipedia API
    """
    # tokenize input
    tokens = tokenize.wordpunct_tokenize(message)
    # filter stopwords; additionally, remove 'wiki' or 'wikipedia'
    tokens_filtered = remove_stopwords(tokens)
    tokens_filtered = [token for token in tokens_filtered if token != 'wiki' and token != 'wikipedia']
    # join filtered message
    message = ' '.join(tokens_filtered)
    # for debugging/testing
    print("(Highly) processed input: ", message)
    # Get the wikipedia summary for the request
    try:
        summary = wikipedia.summary(message, sentences=1)
        url = wikipedia.page(message).url
        answer = summary + "\nSee more here: " + url
        if len(answer) > 500:
            answer = answer[0:500] + "\nSee wikipedia for more..."
    except Exception:
        # handle all errors
        answer = "Request was not found using Wikipedia. Be more specific?"
    return answer
analytics_platform_util.py (project: fabric8-analytics-stack-analysis, author: fabric8-analytics)
def create_tags_for_package(package_name):
    """Create tags for a package based on its name."""
    stop_words = set(['org', 'com', 'io', 'ch', 'cn'])
    tags = set([tag.lower() for tag in wordpunct_tokenize(package_name) if
                tag not in string.punctuation and tag not in stop_words])
    return list(tags)[:MAX_TAG_COUNT]
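An illustrative call, assuming the module-level imports used above (string, wordpunct_tokenize) and, say, MAX_TAG_COUNT = 5 (set ordering may vary):

# wordpunct_tokenize("org.apache.commons-lang3")
# -> ['org', '.', 'apache', '.', 'commons', '-', 'lang3']
# dropping punctuation tokens and the 'org' stop word leaves:
print(create_tags_for_package("org.apache.commons-lang3"))
# e.g. ['apache', 'commons', 'lang3']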
def analyze_false(validData, validDataNumbers, validLabels, model):
    """Compare mean sequence lengths of correctly and incorrectly classified positive examples."""
    predictions = np.squeeze((model.predict(validDataNumbers) > 0.5).astype('int32'))
    c1_inds = np.where(validLabels == 1)[0]                   # indices of the positive class
    pos_inds = np.where((predictions + validLabels) == 2)[0]  # true positives
    neg_inds = np.setdiff1d(c1_inds, pos_inds)                # misclassified positives
    seq_lengths = np.zeros((validData.shape[0]))
    for ind, row in np.ndenumerate(validData):
        seq_lengths[ind] = len(wordpunct_tokenize(row.lower().strip()))
    mean_true_length = np.mean(seq_lengths[pos_inds])
    mean_false_length = np.mean(seq_lengths[neg_inds])
    return mean_false_length, mean_true_length
def tokenize(directory, exclude_files):
    full_content = ''
    for _file in os.listdir(directory):
        #disp_count = 5
        if exclude_files and (_file in exclude_files):
            continue
        with open(directory + _file, 'r') as f:
            contents = f.readlines()
            for item in contents:
                try:
                    sentence = item.split('\t')[1].strip()
                    full_content += sentence
                except IndexError:
                    continue
                # if np.random.binomial(1,0.1):
                #     print sentence
                #     time.sleep(2)
                #     disp_count -= 1
                #     if not disp_count:
                #         print '*'*100
                #         break
                # else:
                #     print '#'
    return wordpunct_tokenize(full_content.lower())
def read_wordpunct_block(stream):
    toks = []
    for i in range(20):  # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
def _extract_tokens(self, file_text):
    """Extract tokens from a file and return a Counter dictionary."""
    token_dict = collections.Counter()
    # matches and removes beginning and end <doc> tags
    regex = re.compile(r'(<doc id.*>|<\/doc>)')
    data = regex.sub('', file_text)
    tokens = wordpunct_tokenize(data)
    for token in tokens:
        token_dict[token] += 1
    return token_dict
def get_words(sents=[]):
    from nltk.tokenize import wordpunct_tokenize
    words = []
    for sent in sents:
        words.append(wordpunct_tokenize(sent))
    return words
# file_name = sys.argv[1]
def tokenize_into_words(sents=[]):
    words = []
    for sent in sents:
        words.append(wordpunct_tokenize(sent))
    return words
def _extract_text_ngram_freqs(self, text):
    """Tokenize the text.

    For each token in the text, extract ngrams of different length (from 1
    to 5) and count how many times each of these ngrams occurs in the text.
    Then return a dictionary of { ngram: frequency }.

    >>> implementation = CavnarTrenkleImpl()
    >>> ngrams = implementation._extract_text_ngram_freqs("HeLLo")
    >>> ngrams == {'h':1, 'e': 1, 'l': 2, 'o': 1, 'he': 1, 'el': 1, 'll': 1, \
        'lo': 1, 'hel': 1, 'ell': 1, 'llo': 1, 'hell': 1, 'ello': 1, 'hello': 1}
    True
    >>> ngrams = implementation._extract_text_ngram_freqs("CIAO")
    >>> ngrams == {'c':1, 'i': 1, 'a': 1, 'o': 1, 'ci': 1, 'ia': 1, 'ao': 1, \
        'cia': 1, 'iao': 1, 'ciao': 1}
    True
    """
    tokens = wordpunct_tokenize(text.lower())  # Force lower case
    # TODO: Delete numbers and punctuation
    # TODO: Should we use nltk twitter tokenizer?
    ngram_freqs = defaultdict(int)
    for token in tokens:
        for n in range(1, 6):  # Use 1-grams to 5-grams
            for ngram in ngrams(token, n):
                ngram_string = ''.join(ngram)
                ngram_freqs[ngram_string] += 1
                # ngram_freqs[ngrams(token, n)] += 1
    return ngram_freqs
def text_to_sentences(self, text, tokenizer, remove_stopwords=False):
    # Function to split a text into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words.
    print("text_to_sentence")
    from nltk.tokenize import sent_tokenize, wordpunct_tokenize
    # decode byte-string input (Python 2 style); leave str untouched
    if isinstance(text, bytes):
        text = text.decode("utf8")
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    #raw_sentences = tokenizer.tokenize(text.strip())
    raw_sentences = sent_tokenize(text.strip())
    print("finish tokenize sentence", len(raw_sentences))
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it;
        # otherwise tokenize it into a list of words
        if len(raw_sentence) > 0:
            print(raw_sentence)
            sentences.append(wordpunct_tokenize(raw_sentence))
            print(wordpunct_tokenize(raw_sentence))
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
def locateWord(word, wordsArr):
    if word in wordsArr:
        return wordsArr.index(word)
    else:
        idxs = [wordsArr.index(w) for w in wordsArr if word in wordpunct_tokenize(w)]
        return idxs[0]
def negSent2JointTrain(negSents, posSentNum):
    neg_training_data = []
    for sentId, (sent_id, sent) in enumerate(negSents):
        wordsIn = wordpunct_tokenize(sent)
        sent = " ".join(wordsIn)
        eventTypeSequence = ["O" for i in range(len(wordsIn))]
        neg_training_data.append((str(sentId + posSentNum), sent, eventTypeSequence))
    return neg_training_data