def ngram_list(n, word_list, stop_word_list=None):
"""
Generate ngrams of width n, excluding those formed entirely of stop words.
Args:
n (int): width of the ngrams, e.g. 1, 2, 3
word_list (list of str): list of words
stop_word_list (list of str, optional): stop words; any ngram whose tokens are
all in this list is excluded from the result
Returns:
list of str: list of ngrams formed from the given word list, excluding those
whose tokens are all stop words
"""
stop_word_set = set(word.lower() for word in stop_word_list) if stop_word_list else set()  # compared against lower-cased tokens below
all_ngrams = nltk.ngrams(word_list, n)
ngram_list = []
for ngram in all_ngrams:
lowered_ngram_tokens = map(lambda token: token.lower(), ngram)
if any(token not in stop_word_set for token in lowered_ngram_tokens):
ngram_list.append(' '.join(ngram))
return ngram_list
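# A minimal usage sketch for ngram_list, assuming `import nltk` at module level
# (only nltk.ngrams is required); the word list below is illustrative:
#   words = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
#   ngram_list(2, words, stop_word_list=['the', 'over'])
#   # -> 7 bigrams such as 'the quick' and 'lazy dog'; 'over the' is dropped
#   #    because every one of its tokens is a stop word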
def get(self,person_id):
n=2
occurs=[]
grams_arr=[]
sixgrams = ngrams(str_read.split(), n)
for grams in sixgrams:
#print str(grams)
x=NGram.compare('{}'.format(person_id),str(grams))
occurs.append(x)
grams_arr.append(str(grams))
main_fields={'occurs':fields.String,"word":fields.String}
datas={'occurs':"{}".format(max(occurs)*1000),'word':"{}".format(grams_arr[occurs.index(max(occurs))])}
x=marshal(datas,main_fields)
#json.dumps(marshal(datas,main_fields))
return x
def get(self,person_id):
n=2
occurs=[]
grams_arr=[]
sixgrams = ngrams(str_read.split(), n)
for grams in sixgrams:
#print str(grams)
x=NGram.compare('{}'.format(person_id.decode('latin-1')),str(grams))
occurs.append(x)
grams_arr.append(str(grams))
main_fields={'occurs':fields.String,"word":fields.String}
datas={'occurs':"{}".format(max(occurs)*1000),'word':"{}".format(grams_arr[occurs.index(max(occurs))])}
x=marshal(datas,main_fields)
#json.dumps(marshal(datas,main_fields))
return x
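# Hedged note on the two handlers above: NGram.compare comes from the third-party
# `ngram` package and returns a string-similarity score in [0.0, 1.0], while
# str_read, fields and marshal are assumed module-level globals / Flask-RESTful
# imports. A minimal standalone use of the comparison:
#   from ngram import NGram
#   NGram.compare('spam', 'spa')   # ~0.375 with the default settings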
def extract_ngrams2(sentences, stemmer, language, N=2):
'''
Parameter Arguments:
sentences: list of sentences
['New York is a city.', 'It has a huge population.']
N: Length of the n-grams e.g. 1, 2
return: a list of n-grams
[('new', 'york'), ('york', 'is'), ('is', 'a'), ('a', 'city'), ('city', '.'),
('it', 'has'), ('has','a'), ('a', 'huge'), ('huge', 'population') , ('population', '.')]
'''
ngrams_list = []
for sent in sentences:
sent = re.sub(r'-(,?\s)', r'\1', sent)  # drop a trailing hyphen, e.g. "magister- " -> "magister "
ngram_items = list(ngrams(sent2stokens(sent, stemmer, language), N))
for i, ngram in enumerate(ngram_items):
ngram_str = ' '.join(ngram)
ngrams_list.append(ngram_str)
return ngrams_list
def extract_nuggets(sentences, nugget_type, language):
'''
Parameter Arguments:
sentences: list of sentences
['New York is a city.', 'It has a huge population.']
return: a list of noun phrases, events, named_entities
[('new', 'york'), ('york', 'is'), ('a', 'city'),
('it', 'has'), ('has','a'), ('a', 'huge'), ('huge', 'population') , ('population', '.')]
'''
nugget_list = []
for sent in sentences:
if nugget_type == 'n-grams':
nugget_items = list(ngrams(sent2stokens(sent, language), 2))
if nugget_type == 'NP':
nugget_items = get_phrases(sent, 'NP')
if nugget_type == 'Phrases':
nugget_items = get_phrases(sent, 'Phrases')
if nugget_type == 'NE':
nugget_items = get_phrases(sent, 'NE')
for nugget in nugget_items:
nugget_list.append(' '.join(nugget))
return nugget_list
def add_sentences(self, sentences):
"""
@type sentences: list[Sentence]
"""
counter = self.counter
G = self.G
for sent in sentences:
counter.update(ngrams(sent.tokens, self.N))
G.add_nodes_from(sent.tokens)
updated_edges = []
for v in counter.elements():
s = v[0]
t = v[1]
c = counter[v]
updated_edges.append((s, t, c))
G.add_weighted_edges_from(updated_edges)
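# A standalone sketch of the same idea, assuming networkx and nltk are installed;
# add_sentences above expects Sentence objects with a .tokens attribute, so the
# plain token list below stands in for one sentence.
from collections import Counter
import networkx as nx
from nltk import ngrams as nltk_ngrams

tokens = ['new', 'york', 'is', 'a', 'city']
counter = Counter(nltk_ngrams(tokens, 2))   # count adjacent token pairs (N=2)
G = nx.Graph()
G.add_nodes_from(tokens)
# every bigram (s, t) becomes an edge weighted by its co-occurrence count
G.add_weighted_edges_from((s, t, c) for (s, t), c in counter.items())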
def words2ngrams(sep, num, tokens):
'''Convert word tokens into ngrams. An ngram is a sequence of n word tokens.
Punctuation is treated as a separate token.'''
content = read_tokens(tokens)
ngrams = list(nltk.ngrams(content, num))
write_csv(ngrams, str(sep))
def text2ngrams(sep, num, text):
'''Tokenize plain text into ngrams. An ngram is a sequence of n word tokens.
Punctuation is treated as a separate token.'''
content = '\n'.join([open(f).read() for f in text])
try:
tokens = nltk.word_tokenize(content)
ngrams = list(nltk.ngrams(tokens, num))
write_csv(ngrams, str(sep))
except LookupError as err:
click.echo(message="Error with tokenization", nl=True)
click.echo(message="Have you run \"textkit download\"?", nl=True)
click.echo(message="\nOriginal Error:", nl=True)
click.echo(err)
def __init__(self, body, author='Anonymous'):
# accumulators
hashtags = []
# Now process cleaned up text with NLTK
words = []
bigrams = []
trigrams = []
quadgrams = []
sentences = []
words = word_tokenize(body)
sentences.extend(sent_tokenize(body))
# Strip whitespace from each sentence
sentences = [sentence.strip() for sentence in sentences]
# word-level n-grams from the token list (ngrams over the raw string would yield character n-grams)
bigrams = ngrams(words, 2)
trigrams = ngrams(words, 3)
quadgrams = ngrams(words, 4)
self.body = body
self.words = words
self.bigrams = bigrams
self.trigrams = trigrams
self.quadgrams = quadgrams
self.sentences = sentences
self.hashtags = hashtags
self.author = author
#TODO: Create "hashtags" from arbitrary number of rarest words
def build_ngrams(tokens, low, high):
LOGGER.debug("Building ngrams from %d to %d" % (low, high))
assert low <= high
assert low > 0
grams = {}
for n in range(low, high + 1):
grams[n] = [g for g in ngrams(tokens, n)]
return grams
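# Hedged usage sketch (assumes `from nltk import ngrams` and a module-level LOGGER):
#   grams = build_ngrams(['the', 'cat', 'sat'], 1, 2)
#   grams[1]  ->  [('the',), ('cat',), ('sat',)]
#   grams[2]  ->  [('the', 'cat'), ('cat', 'sat')]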
def build_pos_ngrams(tagged, low, high):
LOGGER.debug("Building POS ngrams from %d to %d" % (low, high))
assert low <= high
assert low > 0
pos_tokens = []
pos_words = defaultdict(list)
for word, pos in tagged:
pos_tokens.append(pos)
pos_words[pos].append(word)
grams = {}
for n in range(low, high + 1):
grams[n] = [g for g in ngrams(pos_tokens, n)]
return grams, pos_words
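# Hedged usage sketch; the POS tags shown are typical nltk.pos_tag output and may
# vary with the tagger model:
#   tagged = [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]
#   grams, pos_words = build_pos_ngrams(tagged, 1, 2)
#   grams[2]   ->  [('DT', 'NN'), ('NN', 'VBD')]
#   pos_words  ->  {'DT': ['the'], 'NN': ['cat'], 'VBD': ['sat']}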
def ngrams_extract(string):
if random.random() < SAMPLE_RATE:
print '[*]',string
grams = list(ngrams(string, 2)) + list(ngrams(string, 3)) + list(ngrams(string, 4)) + list(ngrams(string, 5))
SIZE = 1024
vec = zeros((SIZE,))
for t in grams:
vec[hash(t)%SIZE]+=1
return log(vec+1.0)
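# A Python 3 re-sketch of the hashing-trick featurizer above, assuming numpy is
# available; note that Python salts hash() per process, so a stable digest
# (e.g. hashlib) would be preferable if the vectors must be reproducible.
import numpy as np
from nltk import ngrams

def char_ngram_vector(string, size=1024):
    grams = []
    for n in (2, 3, 4, 5):
        grams += list(ngrams(string, n))      # character n-grams of length 2..5
    vec = np.zeros((size,))
    for g in grams:
        vec[hash(g) % size] += 1              # hash each n-gram into a fixed-size bucket
    return np.log(vec + 1.0)                  # log-dampen the bucket counts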
def get_word_ngrams(sequence, n=3):
tokens = tokenize(sequence)
return [' '.join(ngram) for ngram in ngrams(tokens, n)]
def gen_training_features(self, bodies_fpath, stances_fpath):
print 'Generating training features'
self._train_bodies, self._train_stances = self._read(bodies_fpath, stances_fpath, True)
print 'Generating ngrams'
ng_start = time.time()
self._train_unigrams = self._gen_ngrams(1, self._train_bodies, self._train_stances)
ng_end = time.time()
print 'ngrams generation time: ', (ng_end - ng_start), 'seconds'
print 'Generating jaccard similarities'
js_start = time.time()
self.train_avg_sims, self.train_max_sims = self._gen_jaccard_sims(
self._train_bodies,
self._train_stances
)
js_end = time.time()
print 'jaccard similarity generation time: ', (js_end - js_start), 'seconds'
for i in range(len(self._train_stances)):
labeled_feature = ({
'unigrams':self._train_unigrams[i],
'avg_sims':self.train_avg_sims[i],
'max_sims':self.train_max_sims[i]},
self._train_stances[i]['Stance'])
self._labeled_feature_set.append(labeled_feature)
def _get_ngrams(self, text, n):
tokens = nltk.word_tokenize(text)
tokens = [ token.lower() for token in tokens if len(token) > 1 ]
return nltk.ngrams(tokens, n)
def _get_ngrams(self, text, n):
tokens = nltk.word_tokenize(text)
tokens = [ token.lower() for token in tokens if len(token) > 1 ]
ngram_list = list(nltk.ngrams(tokens, n))
return ngram_list
def naive_bayes(analysis):
tags = []
words = []
deps_cc = []
for sen in analysis["sentences"]:
tags += sen['pos']
words += sen['tokens']
deps_cc += sen["deps_cc"]
norm = normalize_title(tags, words)
f1 = []
current = list(nltk.ngrams(norm.split(), 1)) + list(nltk.ngrams(norm.split(), 2)) + list(nltk.ngrams(norm.split(),3))
ngram_list = [' '.join(list(g)) for g in current]
for pos in common_grams:
if pos in ngram_list:
f1.append(1)
else:
f1.append(0)
f1 = numpy.array(f1).reshape(1, len(f1))
#pos ngrams
f2 = []
current_pos = list(nltk.ngrams(tags, 1)) + list(nltk.ngrams(tags, 2)) + list(nltk.ngrams(tags,3))
ngram_list = [' '.join(list(g)) for g in current_pos]
for pos in common_pos_grams:
if pos in ngram_list:
f2.append(1)
else:
f2.append(0)
f2 = numpy.array(f2).reshape(1, len(f2))
# print f2.shape
# syntactic ngrams
f3 = []
current_sngrams = list(syntactic_n_gram(deps_cc, 1)) + list(syntactic_n_gram(deps_cc, 2)) + list(syntactic_n_gram(deps_cc, 3))
ngram_list = [' '.join(list(g)) for g in current_sngrams]
for pos in common_sn_grams:
if pos in ngram_list:
f3.append(1)
else:
f3.append(0)
f3 = numpy.array(f3).reshape(1, len(f3))
return [clf1.predict(f1)[0], clf2.predict(f2)[0], clf3.predict(f3)[0]]
def n_gram_analysis_simple(infile, gram, stop):
ngram = dict()
f = open(infile, "r" )
#f2 = codecs.open(outfile, "w+", "utf-8")
for l in f:
x = nltk.ngrams(l.split(),gram)
for w in x:
# if stop:
# if w not in stops:
# if w in ngram:
# ngram[w]+=1
# else:
# ngram[w]=1
if w in ngram:
ngram[w] += 1
else:
ngram[w] = 1
p = list(ngram.items())
p.sort(key = lambda x: -x[1])
print len(p)
for x in p[:10]:
sen = ' '.join(x[0])
cnt = int(x[1])
if cnt == 0:
cnt = 1
print sen, cnt
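# Hedged usage sketch: prints the 10 most frequent word n-grams found in a file,
# e.g. n_gram_analysis_simple('corpus.txt', 2, stop=False); the stop-word
# handling is currently commented out, so the `stop` flag has no effect.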
def getNGrams(raw_string, gram_nb):
xgrams = ngrams(raw_string.split(), gram_nb)
return xgrams
def get(self,param_word):
status=False
n=2
occurs=[]
grams_arr=[]
words=[]
for key in r_server.scan_iter():
words.append(key)
#sixgrams = ngrams(str_read.split(), n)
for keys in words:
#print str(grams)
x=NGram.compare('{}'.format(param_word.decode('latin-1')),str(keys))
occurs.append(x)
grams_arr.append(str(keys))
for key in r_server.scan_iter():
if key == param_word:
status=True
if status is True:
main_fields_true={"word":fields.String,"status":fields.Boolean}
datas_true={'word':"{}".format(param_word),'status':status}
x_true=marshal(datas_true,main_fields_true)
return x_true
else:
main_fields_false={'occurs':fields.String,"word":fields.String,"freq":fields.String,"status":fields.Boolean}
datas_false={'occurs':"{}".format(max(occurs)*1000),'word':"{}".format(grams_arr[occurs.index(max(occurs))]),'freq':r_server.get(param_word),'status':status}
x_false=marshal(datas_false,main_fields_false)
return x_false
#json.dumps(marshal(datas,main_fields))
#if datas["status"]==True:
# return datas["word"]
#else:
def extract_ngrams(sentences, stoplist, stemmer, language, n=2):
"""Extract the ngrams of words from the input sentences.
Args:
n (int): the number of words for ngrams, defaults to 2
"""
concepts = []
for i, sentence in enumerate(sentences):
# for each ngram of words
tokens = sent2tokens(sentence, language)
for j in range(len(tokens)-(n-1)):
# initialize ngram container
ngram = []
# for each token of the ngram
for k in range(j, j+n):
ngram.append(tokens[k].lower())
# do not consider ngrams containing punctuation marks
marks = [t for t in ngram if not re.search('[a-zA-Z0-9]', t)]
if len(marks) > 0:
continue
# do not consider ngrams composed of only stopwords
stops = [t for t in ngram if t in stoplist]
if len(stops) == len(ngram):
continue
# stem the ngram
ngram = [stemmer.stem(t) for t in ngram]
# add the ngram to the concepts
concepts.append(' '.join(ngram))
return concepts
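# Hedged usage sketch: sent2tokens is a helper assumed elsewhere in this codebase
# (roughly nltk.word_tokenize over the sentence); with a Snowball stemmer:
#   from nltk.stem.snowball import SnowballStemmer
#   extract_ngrams(['New York is a city.'], {'is', 'a'}, SnowballStemmer('english'), 'english')
#   # -> ['new york', 'york is', 'a citi']
#   # ('is a' is skipped as all stop words, 'city .' as containing punctuation)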
def prune_ngrams(ngrams, stoplist, N=2):
pruned_list = []
for ngram in ngrams:
items = ngram.split(' ')
i = 0
for item in items:
if item in stoplist: i += 1
if i < N:
pruned_list.append(ngram)
return pruned_list
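# Hedged usage sketch: with the default N=2, an ngram is pruned only when at
# least two of its tokens are stop words:
#   prune_ngrams(['new york', 'is a', 'a city'], {'is', 'a'})
#   # -> ['new york', 'a city']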
def get_tech(text):
"""Get all technologies from the top 1000 tags on StackOverflow.
"""
sentences = sent_tokenize(text)
techs = set()
for s in sentences:
tokens = word_tokenize(s)
techs |= set(tag for tag in tags if tag in tokens)
bigrams = ['-'.join(ngram) for ngram in ngrams(tokens, 2)]
techs |= set(tag for tag in tags if tag in bigrams)
trigrams = ['-'.join(ngram) for ngram in ngrams(tokens, 3)]
techs |= set(tag for tag in tags if tag in trigrams)
return list(techs)
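# Hedged usage sketch: `tags` is assumed to be a module-level collection of
# StackOverflow tag strings with multi-word tags hyphenated (hence the '-'.join):
#   tags = {'python', 'machine-learning'}
#   get_tech('I use python for machine learning.')
#   # -> ['python', 'machine-learning']   (order not guaranteed)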