import speech_recognition as sr  # assumption: the SpeechRecognition package provides this module

def SpeechToText():
    r = sr.Recognizer()  # speech recognizer
    with sr.Microphone() as source:
        print("Say something!")
        audio = r.listen(source)
    message = ""
    try:
        # recognize_google() is called inside the try block so failures are handled below
        message = r.recognize_google(audio)
        print("User: " + message)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))
    return message
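# A minimal usage sketch (assumptions: a working microphone and PyAudio are available,
# and there is network access for the Google recognizer):
if __name__ == "__main__":
    spoken = SpeechToText()
    print("You said: " + spoken)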
# Function to score word importance, used to infer which topic is being asked about most.
# Python words() usage examples
# Source: readdata.py, from the project Natural-Language-Processing-Python-and-NLTK (author: PacktPublishing)
import nltk
from nltk.corpus import stopwords          # requires the NLTK 'stopwords' resource
from nltk.stem import WordNetLemmatizer    # requires the NLTK 'wordnet' resource

def preprocessing(text):
    # accept both bytes and str input (the original assumed Python 2 byte strings)
    if isinstance(text, bytes):
        text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # lowercase first so that capitalized stopwords are also removed
    tokens = [word.lower() for word in tokens]
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # remove words shorter than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
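# Usage sketch (assumption: the NLTK 'punkt', 'stopwords', and 'wordnet' resources have been
# downloaded, e.g. with nltk.download()):
sample = "The striped bats are hanging on their feet for best"
print(preprocessing(sample))
# -> e.g. "striped bat hanging foot best" (exact output depends on the NLTK version)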
from nltk.corpus import reuters  # requires the NLTK 'reuters' corpus

def collection_stats():
    # list of documents
    documents_stat = reuters.fileids()
    print(str(len(documents_stat)) + " documents")
    train_docs_stat = list(filter(lambda doc: doc.startswith("train"), documents_stat))
    print(str(len(train_docs_stat)) + " total training documents")
    test_docs_stat = list(filter(lambda doc: doc.startswith("test"), documents_stat))
    print(str(len(test_docs_stat)) + " total test documents")
    # list of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")
    # get the documents in a category
    category_docs = reuters.fileids("acq")
    # words for a document
    document_id = category_docs[0]
    document_words = reuters.words(document_id)
    print(document_words)
    # print the raw document
    print(reuters.raw(document_id))
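# Usage sketch (assumption: the Reuters corpus must be downloaded once before calling it):
import nltk
nltk.download('reuters')
collection_stats()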
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2).
    :type window_size: int
    """
    # Method of nltk.text.Text; BigramCollocationFinder, BigramAssocMeasures and tokenwrap
    # are imported at module level in nltk.text.
    if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
        self._num = num
        self._window_size = window_size
        # print("Building collocations list")
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
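# Usage sketch with the stock NLTK Text class (assumption: the Gutenberg corpus and the
# stopwords list have been downloaded):
from nltk.corpus import gutenberg
from nltk.text import Text
moby = Text(gutenberg.words('melville-moby_dick.txt'))
moby.collocations(num=10)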
def get_user_to_word_proportion(user_to_text, word):
    """
    Maps each user to the proportion of their words that consist of a specified word.
    """
    user_to_word_proportion = {}
    for user in user_to_text:
        # LanuageModel is a project-specific helper (name kept as in the original source)
        lm = LanuageModel(user_to_text[user])
        n_tokens = len(lm.lowercase_tokens)
        if n_tokens > 0:
            fd = nltk.FreqDist(lm.lowercase_tokens)
            user_to_word_proportion[user] = fd[word] / float(n_tokens)
        else:
            user_to_word_proportion[user] = 0.0
        print('Finished user {}'.format(user))
    return user_to_word_proportion
from numpy.random import choice  # assumption: numpy's weighted choice was intended (it accepts p=)

def generate(cfd, start_word, n):
    word = start_word
    words = []
    for i in range(n):
        words.append(word)
        # word = cfd[word].max()
        fd = cfd[word]
        n_next_words = sum(fd.values())
        if n_next_words > 0:
            probabilities = [fd[w] / float(n_next_words) for w in sorted(fd.keys())]
            word = choice(sorted(fd.keys()), p=probabilities)
        else:
            # pick a random word when the current word has no recorded successors
            # TODO: use unigram probabilities later
            word = choice(list(cfd.keys()))
    sentence = ' '.join(words)
    # TODO: modify above for punctuation
    return sentence
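# Usage sketch: build a bigram ConditionalFreqDist and generate 20 words from it
# (assumptions: numpy is installed and the NLTK Genesis corpus is downloaded).
import nltk
from nltk.corpus import genesis
cfd = nltk.ConditionalFreqDist(nltk.bigrams(genesis.words('english-kjv.txt')))
print(generate(cfd, 'living', 20))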
def rm_stop_words(data, mode="nltk", silent=1):
    """
    Input:
        data is a list, dict/{} or Counter of words
    """
    if silent == 0:
        print("remove stop words ...")
    if mode == "nltk":
        from nltk.corpus import stopwords
        stop_words = set(stopwords.words('english'))
    else:
        print("unknown mode", mode)
        assert 0
    if isinstance(data, list):
        data = [i for i in data if i.lower() not in stop_words]
        return data
    else:
        # dict-like input: drop stopword keys in place
        for word in stop_words:
            if word in data:
                del data[word]
        return data
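# Usage sketch (assumption: the NLTK stopwords corpus is downloaded):
from collections import Counter
print(rm_stop_words(["The", "quick", "brown", "fox"]))   # -> ['quick', 'brown', 'fox']
print(rm_stop_words(Counter({"the": 3, "fox": 1})))      # -> Counter({'fox': 1})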
import numpy as np
from keras.preprocessing.sequence import pad_sequences  # assumption: Keras supplies pad_sequences

def words_to_char_sequence(words_list, tk):
    """Convert a list of word lists to a character-index sequence.
    # Arguments
        words_list: word lists, (sentence_len, word_len)
        tk: a fitted character-level Keras Tokenizer
    # Output shape
        (sentence_len, MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD)
    """
    # TrainConfig is a project-specific configuration object (as in the original source)
    c_seqs = np.zeros((len(words_list),
                       TrainConfig.MAX_SEQUENCE_LENGTH,
                       TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
    for w_i in range(len(words_list)):  # range() replaces the Python 2 xrange()
        words = words_list[w_i]
        fixed_ws = np.zeros((TrainConfig.MAX_SEQUENCE_LENGTH,
                             TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
        ws = tk.texts_to_sequences(words)
        ws = pad_sequences(ws, maxlen=TrainConfig.MAX_CHAR_PER_WORD)
        # truncate sentences longer than MAX_SEQUENCE_LENGTH
        max_word_len = min(TrainConfig.MAX_SEQUENCE_LENGTH, len(words))
        fixed_ws[:max_word_len, :] = ws[:max_word_len, :]
        c_seqs[w_i] = fixed_ws
    return c_seqs
import re
import string
from nltk.tokenize import wordpunct_tokenize
from nltk.stem.snowball import EnglishStemmer

def tiny_tokenize(text, stem=False, stop_words=[]):
    # accept both bytes and str input (the original assumed Python 2 byte strings)
    if isinstance(text, bytes):
        text = text.decode(encoding='UTF-8', errors='ignore')
    stemmer = EnglishStemmer() if stem else None
    words = []
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ', text)):
        if not token.isdigit() and token not in stop_words:
            if stem:
                try:
                    w = stemmer.stem(token)
                except Exception:
                    w = token
            else:
                w = token
            words.append(w)
    return words
# return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
# re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
# not token.isdigit() and not token in stop_words]
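# Usage sketch (assumption: NLTK and its stopwords corpus are installed):
from nltk.corpus import stopwords
print(tiny_tokenize("Cats, dogs & 42 birds!", stem=True, stop_words=set(stopwords.words('english'))))
# -> e.g. ['cat', 'dog', 'bird'] (stemming output may vary across NLTK versions)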
def build_vocab(word_freq, threshold=5, topn=None, start_idx=0):
    """
    threshold only takes effect when topn is None.
    Words are indexed by overall frequency in the dataset.
    """
    word_freq = sorted(word_freq.items(), key=lambda d: d[1], reverse=True)  # .items() replaces Python 2 .iteritems()
    if topn:
        words = list(zip(*word_freq[:topn]))[0]  # zip() returns an iterator in Python 3, so materialize it
        vocab_dict = dict(zip(words, range(start_idx, len(words) + start_idx)))
    else:
        idx = start_idx
        vocab_dict = {}
        for word, freq in word_freq:
            if freq < threshold:
                return vocab_dict
            vocab_dict[word] = idx
            idx += 1
    return vocab_dict
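# Usage sketch: index words by frequency, skipping rare ones.
freqs = {"the": 10, "cat": 4, "sat": 2, "mat": 1}
print(build_vocab(freqs, threshold=2, start_idx=2))   # -> {'the': 2, 'cat': 3, 'sat': 4}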
# The following n-gram helpers are static methods of an NgramUtil class
# (note the NgramUtil.* calls in their fallback branches).
def bigrams(words, join_string, skip=0):
    """
    Input: a list of words, e.g., ["I", "am", "Denny"]
    Output: a list of bigrams, e.g., ["I_am", "am_Denny"]
    """
    assert type(words) == list
    L = len(words)
    if L > 1:
        lst = []
        for i in range(L - 1):
            for k in range(1, skip + 2):
                if i + k < L:
                    lst.append(join_string.join([words[i], words[i + k]]))
    else:
        # set it as unigram
        lst = NgramUtil.unigrams(words)
    return lst
def trigrams(words, join_string, skip=0):
    """
    Input: a list of words, e.g., ["I", "am", "Denny"]
    Output: a list of trigrams, e.g., ["I_am_Denny"]
    """
    assert type(words) == list
    L = len(words)
    if L > 2:
        lst = []
        for i in range(L - 2):
            for k1 in range(1, skip + 2):
                for k2 in range(1, skip + 2):
                    if i + k1 < L and i + k1 + k2 < L:
                        lst.append(join_string.join([words[i], words[i + k1], words[i + k1 + k2]]))
    else:
        # set it as bigram
        lst = NgramUtil.bigrams(words, join_string, skip)
    return lst
def biterms(words, join_string):
    """
    Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
    Output: a list of biterms, e.g., ["I_am", "I_Denny", "I_boy", "am_Denny", "am_boy", "Denny_boy"]
    """
    assert type(words) == list
    L = len(words)
    if L > 1:
        lst = []
        for i in range(L - 1):
            for j in range(i + 1, L):
                lst.append(join_string.join([words[i], words[j]]))
    else:
        # set it as uniterm
        lst = NgramUtil.uniterms(words)
    return lst
def triterms(words, join_string):
    """
    Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
    Output: a list of triterms, e.g., ["I_am_Denny", "I_am_boy", "I_Denny_boy", "am_Denny_boy"]
    """
    assert type(words) == list
    L = len(words)
    if L > 2:
        lst = []
        for i in range(L - 2):        # range() replaces the Python 2 xrange()
            for j in range(i + 1, L - 1):
                for k in range(j + 1, L):
                    lst.append(join_string.join([words[i], words[j], words[k]]))
    else:
        # set it as biterm
        lst = NgramUtil.biterms(words, join_string)
    return lst
def fourterms(words, join_string):
    """
    Input: a list of words, e.g., ["I", "am", "Denny", "boy", "ha"]
    Output: a list of fourterms, e.g., ["I_am_Denny_boy", "I_am_Denny_ha", "I_am_boy_ha", "I_Denny_boy_ha", "am_Denny_boy_ha"]
    """
    assert type(words) == list
    L = len(words)
    if L > 3:
        lst = []
        for i in range(L - 3):        # range() replaces the Python 2 xrange()
            for j in range(i + 1, L - 2):
                for k in range(j + 1, L - 1):
                    for l in range(k + 1, L):
                        lst.append(join_string.join([words[i], words[j], words[k], words[l]]))
    else:
        # set it as triterm
        lst = NgramUtil.triterms(words, join_string)
    return lst
def ngrams(words, ngram, join_string=" "):
    """
    wrapper for ngram
    """
    if ngram == 1:
        return NgramUtil.unigrams(words)
    elif ngram == 2:
        return NgramUtil.bigrams(words, join_string)
    elif ngram == 3:
        return NgramUtil.trigrams(words, join_string)
    elif ngram == 4:
        return NgramUtil.fourgrams(words, join_string)
    elif ngram == 12:
        unigram = NgramUtil.unigrams(words)
        bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
        return unigram + bigram
    elif ngram == 123:
        unigram = NgramUtil.unigrams(words)
        bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
        trigram = [x for x in NgramUtil.trigrams(words, join_string) if len(x.split(join_string)) == 3]
        return unigram + bigram + trigram
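# Usage sketch (assumption: the functions above are @staticmethods of an NgramUtil class
# that also defines unigrams()/uniterms()/fourgrams()):
print(NgramUtil.bigrams(["I", "am", "Denny"], "_"))          # -> ['I_am', 'am_Denny']
print(NgramUtil.biterms(["I", "am", "Denny", "boy"], "_"))   # -> ['I_am', 'I_Denny', 'I_boy', 'am_Denny', 'am_boy', 'Denny_boy']
print(NgramUtil.ngrams(["I", "am", "Denny"], 12))            # unigrams plus space-joined bigrams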
from collections import Counter, OrderedDict

def build_vocabulary(words, max_size):
    # cfg and Progress_bar are project-specific helpers (as in the original source);
    # note that cfg.vocabulary_size, not max_size, caps the vocabulary here.
    vocab_instances = 0
    unique_counts = Counter(words)
    d = dict(unique_counts.most_common(cfg.vocabulary_size - 2))
    vocabulary = OrderedDict(sorted(d.items(), key=lambda t: t[1], reverse=True))
    # start at 2 to leave room for padding & unknown
    pb = Progress_bar(len(d) - 1)
    for i, (key, value) in enumerate(vocabulary.items(), start=2):
        vocab_instances += value
        vocabulary[key] = i
        pb.tick()
    vocabulary[cfg.padding_char] = 0
    vocabulary[cfg.placeholder_char] = 1
    # reverse the vocabulary (for reverse lookup)
    rev_vocabulary = {v: k for k, v in vocabulary.items()}
    vocab = (len(unique_counts), vocab_instances, vocabulary, rev_vocabulary)
    return vocab
def tokenize_text(sample_text):
    # cfg, tknzr, t_table, stopwords and sequence_lengths are module-level objects
    # defined elsewhere in the original project.
    global sequence_lengths
    processed_text = []
    if cfg.remove_punctuation:
        cleaned = sample_text.lower().translate(t_table)
    else:
        cleaned = sample_text
    if cfg.use_casual_tokenizer:
        tokens = tknzr.tokenize(cleaned)
    else:
        tokens = nltk.word_tokenize(cleaned, language='english')
    if cfg.remove_stopwords:
        tokens = [w for w in tokens if w not in stopwords.words('english')]
    sequence_lengths.append(len(tokens))
    processed_text.extend(tokens)
    return processed_text
# Source: NewsAutosummarize.py, from the project Python-Scripts-Repo-on-Data-Science (author: qalhata)
def __init__(self, min_cut=0.1, max_cut=0.9):
    # Indentation changes: we are inside the constructor of the frequency-summarizer class.
    # This method runs each time an object of the class is created (instantiated).
    self._min_cut = min_cut
    self._max_cut = max_cut
    # The two parameters are saved as member variables; the 'self.' prefix marks them as
    # belonging to the instance, and the leading underscore marks them as internal.
    self._stopwords = set(stopwords.words('english') + list(punctuation))
    # This is a list of all common words and punctuation symbols.
# Indentation changes again: we are back in the class body, outside the constructor.
# A variable defined here (outside a member function) but inside the class becomes STATIC:
# it belongs to the class itself rather than to any individual instance (object).
# Source: NewsArticleClass.py, from the project Python-Scripts-Repo-on-Data-Science (author: qalhata)
def extractFeatures(self, article, n, customStopWords=None):
    # The article is passed in as a tuple (text, title).
    text = article[0]    # extract the text
    title = article[1]   # extract the title
    sentences = sent_tokenize(text)   # split the text into sentences
    # split each sentence into words (the original iterated over the wrong variable)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    # calculate word frequencies using the member function defined above
    self._freq = self._compute_frequencies(word_sent, customStopWords)
    if n < 0:
        # a negative n means no feature (word) selection: return all features
        return nlargest(len(self._freq.keys()), self._freq, key=self._freq.get)
    else:
        # otherwise return only the n largest features, i.e. the most important words
        # (important == frequent, once stopwords are removed)
        return nlargest(n, self._freq, key=self._freq.get)
from math import log
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords

def similarity(c1, c2):
    '''Stop words are words like "it" and "the" that carry little meaning on their own,
    so they are removed before comparing the two texts.'''
    stop_words = list(stopwords.words("english"))
    # remove stop words from both texts (lowercase first so capitalized stopwords match)
    c1_cleaned = [x for x in word_tokenize(c1.lower()) if x not in stop_words]
    c2_cleaned = [x for x in word_tokenize(c2.lower()) if x not in stop_words]
    # dedupe() is a project-specific helper that removes duplicate tokens
    c1_words = Counter(dedupe(c1_cleaned))
    c2_words = Counter(dedupe(c2_cleaned))
    total_words = c1_words + c2_words
    similarity_between_words = 0
    for key, val in total_words.items():
        # a combined count greater than 1 means the two articles share this word
        if total_words[key] > 1:
            similarity_between_words += 1
    return similarity_between_words / (log(len(c1_words)) + log(len(c2_words)))
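# Usage sketch (assumptions: NLTK 'punkt' and 'stopwords' are downloaded, and dedupe() is
# available from the surrounding project):
a = "The cat sat on the mat in the sunny garden"
b = "A dog sat on the mat near the garden gate"
print(similarity(a, b))   # larger values mean more shared non-stopword vocabulary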
# Source: feature_construction.py, from the project Automatic-Question-Generation (author: bwanglzu)
def _answer_stop_word_density(self, row):
    """Percentage of tokens in the answer that are stopwords.
    - Args:
        row(pandas.dataframe): input row vector
    - Returns:
        row(pandas.dataframe): output row vector with the new feature
    """
    stop = stopwords.words('english')
    answer = row.Answer
    if answer:
        tokens = answer.split()
        num_tokens = len(tokens)
        stop_word_in_answer = [i for i in tokens if i in stop]
        num_stop_word_in_answer = len(stop_word_in_answer)
        row['ANSWER_STOPWORD_DENSITY'] = float(num_stop_word_in_answer) / num_tokens
        return row
    else:
        row['ANSWER_STOPWORD_DENSITY'] = 0
        return row
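# Usage sketch (assumptions: pandas is available and `fc` is an instance of the feature
# construction class these methods belong to; the column names follow the snippet above):
import pandas as pd
df = pd.DataFrame({'Answer': ['the quick brown fox', '']})
df = df.apply(fc._answer_stop_word_density, axis=1)
print(df['ANSWER_STOPWORD_DENSITY'])   # -> 0.25 and 0.0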
def _answer_quantifier_density(self, row):
    """Percentage of tokens in the answer that are quantifier words.
    - Args:
        row(pandas.dataframe): input row vector
    - Returns:
        row(pandas.dataframe): output row vector with the new feature
    """
    # ling.QUANTIFIER_WORDS is a word list defined elsewhere in the project
    answer = row.Answer
    if answer:
        tokens = answer.split()
        answer_len = len(tokens)
        quantifier_tokens = [i for i in tokens if i in ling.QUANTIFIER_WORDS]
        quantifier_tokens_len = len(quantifier_tokens)
        row['ANSWER_QUANTIFIER_DENSITY'] = float(quantifier_tokens_len) / answer_len
        return row
    else:
        row['ANSWER_QUANTIFIER_DENSITY'] = 0
        return row
def _percentage_capitalized_word_in_answer(self, row):
    """Percentage of capitalized words in the sentence that are in the answer.
    - Args:
        row(pandas.dataframe): input row vector
    - Returns:
        row(pandas.dataframe): output row vector with the new feature
    """
    answer = row.Answer
    sentence = row.Sentence
    if answer is not None and sentence is not None:
        tokens = sentence.split()
        num_tokens = len(tokens)
        # isupper() matches fully upper-cased tokens, as in the original implementation
        cap_tokens = [i for i in tokens if i.isupper()]
        cap_tokens_in_answer = [i for i in cap_tokens if i in answer]
        row['PERCENT_CAPITALIZED_WORDS_IN_ANSWER'] = float(len(cap_tokens_in_answer)) / num_tokens
        return row
    else:
        row['PERCENT_CAPITALIZED_WORDS_IN_ANSWER'] = 0
        return row
import math
from nltk.corpus import stopwords

def get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt):
    """
    Get overlap, idf-weighted overlap, overlap excluding stopwords, and idf-weighted
    overlap excluding stopwords for each sentence pair.
    """
    stoplist = set(stopwords.words('english'))
    num_docs = len(sent_list_1)
    overlap_feats = []
    for s1, s2 in zip(sent_list_1, sent_list_2):
        tokens_a_set, tokens_b_set = set(s1), set(s2)
        intersect = tokens_a_set & tokens_b_set
        overlap = len(intersect) / (len(tokens_a_set) + len(tokens_b_set))
        # math.log replaces the original np.math.log, which newer NumPy versions no longer expose
        idf_intersect = sum(math.log(num_docs / word_to_doc_cnt[w]) for w in intersect)
        idf_weighted_overlap = idf_intersect / (len(tokens_a_set) + len(tokens_b_set))
        tokens_a_set_no_stop = set(w for w in s1 if w not in stoplist)
        tokens_b_set_no_stop = set(w for w in s2 if w not in stoplist)
        intersect_no_stop = tokens_a_set_no_stop & tokens_b_set_no_stop
        overlap_no_stop = len(intersect_no_stop) / (len(tokens_a_set_no_stop) + len(tokens_b_set_no_stop))
        idf_intersect_no_stop = sum(math.log(num_docs / word_to_doc_cnt[w]) for w in intersect_no_stop)
        idf_weighted_overlap_no_stop = idf_intersect_no_stop / (len(tokens_a_set_no_stop) + len(tokens_b_set_no_stop))
        overlap_feats.append([overlap, idf_weighted_overlap, overlap_no_stop, idf_weighted_overlap_no_stop])
    return overlap_feats
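# Usage sketch: sentences are given as token lists; word_to_doc_cnt maps each word to the
# number of sentences containing it (assumption: this is how the counts are built upstream).
from collections import Counter
s1 = [["the", "cat", "sat"], ["dogs", "bark"]]
s2 = [["a", "cat", "slept"], ["dogs", "sleep"]]
word_to_doc_cnt = Counter(w for pair in zip(s1, s2) for sent in pair for w in set(sent))
print(get_pairwise_overlap_features(s1, s2, word_to_doc_cnt))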
def get_similar_documents_for_query(model_id, text):
    """
    Return documents similar to the query, or an empty list if the query has no words
    left after preprocessing.
    :param model_id:
    :param text:
    :return:
    """
    # db_utils, assign_topics_for_query, transform_topics_assignment_from_lda_to_vector and
    # get_similar_documents_by_vector are defined elsewhere in the original project.
    model = db_utils.get_model(model_id)
    topics_assignment = assign_topics_for_query(model_id, text)
    if len(topics_assignment) != 0:
        topics_vector = transform_topics_assignment_from_lda_to_vector(model['number_of_topics'],
                                                                       topics_assignment[0])
        # print(topics_vector)
        return get_similar_documents_by_vector(model_id, topics_vector)
    else:
        return []