def extractFeatures(self, article, n, customStopWords=None):
    # the article is passed in as a (text, title) tuple
    text = article[0]
    title = article[1]
    # split the text into sentences, then each sentence into words
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    # compute word frequencies using the member function defined above
    self._freq = self._compute_frequencies(word_sent, customStopWords)
    if n < 0:
        # a negative n means no feature (word) selection: return all features
        return nlargest(len(self._freq.keys()),
                        self._freq, key=self._freq.get)
    else:
        # the caller asked for a subset, so return only the n most
        # important words (important == frequent, excluding stopwords)
        return nlargest(n, self._freq, key=self._freq.get)
# NewsArticleClass.py, project: Python-Scripts-Repo-on-Data-Science, author: qalhata
def summarize(self, article, n):
    # the article is passed in as a (text, title) tuple
    text = article[0]
    title = article[1]
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    self._freq = self._compute_frequencies(word_sent)
    # rank each sentence by the total frequency of the words it contains
    ranking = defaultdict(int)
    for i, sentence in enumerate(word_sent):
        for word in sentence:
            if word in self._freq:
                ranking[i] += self._freq[word]
    # return the n highest-ranked sentences
    sentences_index = nlargest(n, ranking, key=ranking.get)
    return [sentences[j] for j in sentences_index]
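For context, a minimal self-contained sketch of the same sentence-ranking idea (my illustration, using plain word counts instead of the class's _compute_frequencies; it needs the same NLTK tokenizers as the snippets above):

from collections import defaultdict
from heapq import nlargest
from nltk.tokenize import sent_tokenize, word_tokenize

def tiny_summarize(text, n=2):
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    freq = defaultdict(int)
    for words in word_sent:
        for w in words:
            freq[w] += 1
    # score each sentence by the summed frequency of its words
    ranking = defaultdict(int)
    for i, words in enumerate(word_sent):
        for w in words:
            ranking[i] += freq[w]
    best = nlargest(n, ranking, key=ranking.get)
    return [sentences[i] for i in best]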
##############################################################################
# TEST
def create_ranking2(edge_weight, k, adj, num):
    sink = len(adj)
    heaps = [[] for i in xrange(sink + 1)]
    heaps[0] = [(0, [])]
    for current in xrange(sink):
        for child in adj[current]:
            for length, path in heaps[current]:
                new_path = list(path)
                new_path.append(current)
                # this could be done more efficiently with heapq.heapreplace
                ew = edge_weight[0, num[(current, child)]]
                heapq.heappush(heaps[child], (length + ew, new_path))
            heaps[child] = heapq.nlargest(k, heaps[child])
    # TODO: what about paths of equal length?
    # result: heaps[sink]
    return [(length, tuple(zip(nodes, nodes[1:] + [sink])))
            for length, nodes in heaps[sink]]
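As the comment in the inner loop suggests, the top-k set could also be maintained incrementally instead of re-running nlargest after every push; a minimal sketch of that bounded-heap pattern (my illustration, not from the project):

import heapq

def push_keep_largest_k(heap, item, k):
    # keep the k largest items seen so far; the heap root is the smallest of them
    if len(heap) < k:
        heapq.heappush(heap, item)
    elif item > heap[0]:
        heapq.heappushpop(heap, item)
    return heap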
def top(self, num, key=None):
    """
    Get the top N elements from an RDD.

    .. note:: This method should only be used if the resulting array is expected
        to be small, as all the data is loaded into the driver's memory.

    .. note:: It returns the list sorted in descending order.

    >>> sc.parallelize([10, 4, 2, 12, 3]).top(1)
    [12]
    >>> sc.parallelize([2, 3, 4, 5, 6], 2).top(2)
    [6, 5]
    >>> sc.parallelize([10, 4, 2, 12, 3]).top(3, key=str)
    [4, 3, 2]
    """
    def topIterator(iterator):
        yield heapq.nlargest(num, iterator, key=key)

    def merge(a, b):
        return heapq.nlargest(num, a + b, key=key)

    return self.mapPartitions(topIterator).reduce(merge)
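The mapPartitions/reduce structure above is simply "take the top N of each chunk, then merge"; a tiny Spark-free sketch of the same idea (my illustration):

import heapq
from functools import reduce

partitions = [[10, 4], [2, 12, 3]]                               # stand-ins for RDD partitions
per_part = [heapq.nlargest(2, p) for p in partitions]            # local top-2 per partition
print(reduce(lambda a, b: heapq.nlargest(2, a + b), per_part))   # [12, 10]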
def nth_largest(n, iter_list):
    """``O(nlogn)`` time if ``n`` is median.
    Better if largest or smallest.

    Notes
    -----
    Adopted and/or modified from reference(s):
    FogleBird on stackoverflow.com/questions/1034846/
    """
    length = len(iter_list)
    if n >= length:
        return heapq.nlargest(length, iter_list)[-1]
    return heapq.nlargest(n, iter_list)[-1]
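A quick usage check (my example):

import heapq

print(nth_largest(1, [5, 1, 9, 3]))   # 9, the largest
print(nth_largest(3, [5, 1, 9, 3]))   # 3, the third largest
print(nth_largest(10, [5, 1, 9, 3]))  # 1, clamps to the smallest when n exceeds the length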
# OS utilities
def get_scored_matches(word: str, possibilities: List[str], n: int = 3, cutoff: float = 0.6) -> List[Tuple[float, str]]:
    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not (0.0 <= cutoff <= 1.0):
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    result = []
    s: SequenceMatcher = SequenceMatcher()
    s.set_seq2(word)
    for x in possibilities:
        s.set_seq1(x)
        if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff and s.ratio() >= cutoff:
            result.append((s.ratio(), x))
    # Move the best scorers to the head of the list
    result = heapq.nlargest(n, result)
    # Return the best n matches together with their scores
    return result
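A short usage example (mine); it assumes the imports the snippet itself needs (difflib.SequenceMatcher, typing, heapq):

from difflib import SequenceMatcher
from typing import List, Tuple
import heapq

print(get_scored_matches("appel", ["apple", "ape", "peach", "puppy"]))
# [(0.8, 'apple'), (0.75, 'ape')] -- (ratio, candidate) pairs above the 0.6 cutoff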
def build_dictionary(sentences, size):
    """
    Create a dictionary containing the most frequent words in the sentences
    :param sentences: sequence of sentences that contain words
        Caution: the sequence might be exhausted after calling this function!
    :param size: size of dictionary you want
    :return: dictionary that maps word to index (starting from 1)
    """
    dictionary = defaultdict(int)
    for sentence in sentences:
        for token in sentence:
            dictionary[token] += 1
    frequent_pairs = nlargest(size, dictionary.items(), key=itemgetter(1))
    words, frequencies = zip(*frequent_pairs)
    result = {word: index + 1 for index, word in enumerate(words)}
    return result
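A small usage example (mine), with the imports the snippet assumes:

from collections import defaultdict
from heapq import nlargest
from operator import itemgetter

sents = [["the", "cat", "sat"], ["the", "cat"], ["the"]]
print(build_dictionary(sents, 2))  # {'the': 1, 'cat': 2} -- the two most frequent words, indexed from 1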
def keyphrases(self, N=20, fileids=None, categories=None):
    """
    Returns the top N keyphrases grouped by document id.
    TODO: this currently ignores fileids/categories.
    """
    if not self.tfidfs or not self.lexicon or not self.fileids:
        raise ValueError("Must call the score method first!")

    for idx, doc in enumerate(self.tfidfs):
        fileid = self.fileids[idx]
        # Get the top N terms by TF-IDF score
        scores = [
            (self.lexicon[wid], score)
            for wid, score in heapq.nlargest(N, doc, key=itemgetter(1))
        ]
        yield fileid, scores
def correlate_library(image, library, n_largest):
    """Correlates all simulated diffraction templates in a DiffractionLibrary
    with a particular experimental diffraction pattern (image) stored as a
    numpy array.
    """
    i = 0
    out_arr = np.zeros((n_largest * len(library), 5))
    for key in library.keys():
        if not n_largest:
            # no limit requested: keep every entry for this key
            n_largest = len(library[key])
        correlations = dict()
        for orientation, diffraction_pattern in library[key].items():
            correlation = correlate(image, diffraction_pattern)
            correlations[orientation] = correlation
        res = nlargest(n_largest, correlations.items(), key=itemgetter(1))
        for j in np.arange(n_largest):
            out_arr[j + i * n_largest][0] = i
            out_arr[j + i * n_largest][1] = res[j][0][0]
            out_arr[j + i * n_largest][2] = res[j][0][1]
            out_arr[j + i * n_largest][3] = res[j][0][2]
            out_arr[j + i * n_largest][4] = res[j][1]
        i = i + 1
    return out_arr
def determine_intent(self, utterance, num_results=1):
    """
    Given an utterance, provide a valid intent.

    :param utterance: an ascii or unicode string representing natural language speech
    :param num_results: a maximum number of results to be returned.
    :return: A generator that yields dictionaries.
    """
    intents = []
    for domain in self.domains:
        gen = self.domains[domain].determine_intent(utterance=utterance,
                                                    num_results=1)
        for intent in gen:
            intents.append(intent)

    # keep only the num_results most confident intents
    intents = heapq.nlargest(
        num_results, intents, key=lambda domain: domain['confidence'])
    for intent in intents:
        yield intent
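The confidence ranking is plain nlargest with a key function; a standalone check (my example):

import heapq

intents = [{'name': 'weather', 'confidence': 0.4}, {'name': 'timer', 'confidence': 0.9}]
print(heapq.nlargest(1, intents, key=lambda d: d['confidence']))  # [{'name': 'timer', 'confidence': 0.9}]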
def nbest_centrality(G, metric, n=10, attr="centrality", **kwargs):
    # Compute the centrality scores for each vertex
    scores = metric(G, **kwargs)

    # Set the score as a property on each node
    nx.set_node_attributes(G, attr, scores)

    # Filter scores (do not include in book)
    ntypes = nx.get_node_attributes(G, 'type')
    phrases = [
        item for item in scores.items()
        if ntypes.get(item[0], None) == "keyphrase"
    ]

    # Find the top n scores and print them along with their index
    topn = heapq.nlargest(n, phrases, key=itemgetter(1))
    for idx, item in enumerate(topn):
        print("{}. {}: {:0.4f}".format(idx + 1, *item))

    return G
async def newusers(self, ctx, *, count=5):
    """Tells you the newest members of the server.

    This is useful to check if any suspicious members have joined.
    The minimum is 3 members. If no number is given I'll show the last 5 members.
    """
    human_delta = time.human_timedelta
    count = max(count, 3)
    members = heapq.nlargest(count, ctx.guild.members, key=attrgetter('joined_at'))
    names = map(str, members)
    values = (
        (f'**Joined:** {human_delta(member.joined_at)}\n'
         f'**Created:** {human_delta(member.created_at)}\n{"-" * 40}')
        for member in members
    )
    entries = zip(names, values)

    title = f'The {formats.pluralize(**{"newest members": len(members)})}'
    pages = EmbedFieldPages(ctx, entries, lines_per_page=5, colour=0x00FF00, title=title)
    await pages.interact()
def largest_export_versions(n):
    """Creates a filter that keeps the largest n export versions.

    Args:
        n: number of versions to keep.

    Returns:
        A filter function that keeps the n largest paths.
    """
    def keep(paths):
        heap = []
        for idx, path in enumerate(paths):
            if path.export_version is not None:
                heapq.heappush(heap, (path.export_version, idx))
        keepers = [paths[i] for _, i in heapq.nlargest(n, heap)]
        return sorted(keepers)
    return keep
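A hedged usage sketch (mine): the real TensorFlow Path objects carry more fields, but a minimal stand-in with an export_version attribute is enough to show how the returned filter behaves.

import collections
import heapq

Path = collections.namedtuple('Path', ['path', 'export_version'])  # minimal stand-in, not the TF type

paths = [Path('/tmp/export/1', 1), Path('/tmp/export/3', 3), Path('/tmp/export/2', 2)]
keep_two = largest_export_versions(2)
print(keep_two(paths))  # the two paths with the largest export versions, sorted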
def estimate(self, u, i):
    if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
        raise PredictionImpossible('User and/or item is unknown.')

    x, y = self.switch(u, i)
    neighbors = [(self.sim[x, x2], r) for (x2, r) in self.yr[y]]
    k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])

    # compute weighted average over the k most similar neighbors
    sum_sim = sum_ratings = actual_k = 0
    for (sim, r) in k_neighbors:
        if sim > 0:
            sum_sim += sim
            sum_ratings += sim * r
            actual_k += 1

    if actual_k < self.min_k:
        raise PredictionImpossible('Not enough neighbors.')

    est = sum_ratings / sum_sim
    details = {'actual_k': actual_k}
    return est, details
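The estimate is the similarity-weighted mean of the ratings of the k most similar neighbours; a standalone check of that formula (my example):

import heapq

neighbors = [(0.9, 4.0), (0.5, 2.0), (0.1, 5.0)]            # (similarity, rating) pairs
top2 = heapq.nlargest(2, neighbors, key=lambda t: t[0])     # [(0.9, 4.0), (0.5, 2.0)]
est = sum(s * r for s, r in top2) / sum(s for s, r in top2)
print(est)  # (0.9*4 + 0.5*2) / (0.9 + 0.5) = 4.6 / 1.4, roughly 3.29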
def GetLeastNumbers(self, tinput, k):
    import heapq
    if tinput is None or len(tinput) < k or len(tinput) <= 0 or k <= 0:
        return []
    output = []
    for number in tinput:
        if len(output) < k:
            output.append(number)
        else:
            # alternative: track candidates with heapq.nsmallest instead
            # output = heapq.nsmallest(k, output)
            # if number >= output[-1]:
            #     continue
            # else:
            #     output[-1] = number
            # keep the k smallest seen so far by replacing the current maximum
            output = heapq.nlargest(k, output)
            if number >= output[0]:
                continue
            else:
                output[0] = number
    return output[::-1]  # reversed; if ordering does not matter, plain `return output` would do
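A quick check (my example), calling the unbound function directly and passing None for the unused self:

print(GetLeastNumbers(None, [4, 5, 1, 6, 2, 7, 3, 8], 4))  # [1, 2, 3, 4] -- the four smallest values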
def findIDcnt(countours):
    # collect the bounding-box width of every contour
    widths = []
    for idx, cnt in enumerate(countours):
        x, y, width, height = cv2.boundingRect(cnt)
        widths.insert(idx, width)
    # the three largest widths
    IDList = heapq.nlargest(3, widths)
    # map those widths back to their contours
    IDcnts = []
    for idx, item in enumerate(IDList):
        index = widths.index(item)
        IDcnts.insert(idx, countours[index])
    # print IDcnts
    return IDcnts
def make_submit(self, model, submit_file):
    data = self.eval_sets().values()[0]
    target_lines = list()
    answers = np.asarray([[idx] for idx in self.entity.keys()])
    for i, d in enumerate(data):
        num_candidate = len(self.entity)
        index_entities = xrange(num_candidate)
        terms = d.split('\t')
        subjects = np.asarray([[terms[0]]] * num_candidate)
        relations = np.asarray([[terms[1]]] * num_candidate)
        sims = model.predict([subjects, relations, answers], batch_size=num_candidate).flatten()
        print(i)
        r = rankdata(sims, method='ordinal')
        index_candidates = nlargest(200, index_entities, key=lambda j: r[j])
        one_line = ' '.join([str(index_candidate) for index_candidate in index_candidates])
        target_lines.append(one_line + '\n')
    submit_file.writelines(target_lines)
def make_submit_rt(self, model, submit_file):
    data = self.eval_sets_rt().values()[0]
    target_lines = list()
    answers = np.asarray([[idx] for idx in self.entity.keys()])
    for i, d in enumerate(data):
        num_candidate = len(self.entity)
        index_entities = xrange(num_candidate)
        terms = d.split('\t')
        relations = np.asarray([[terms[0]]] * num_candidate)
        objects = np.asarray([[terms[1]]] * num_candidate)
        sims = model.predict_rt([answers, relations, objects], batch_size=num_candidate).flatten()
        print(i)
        r = rankdata(sims, method='ordinal')
        index_candidates = nlargest(200, index_entities, key=lambda j: r[j])
        one_line = ' '.join([str(index_candidate) for index_candidate in index_candidates])
        target_lines.append(one_line + '\n')
    submit_file.writelines(target_lines)
def extractBests(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0, limit=5):
    """Get a list of the best matches to a collection of choices.

    Convenience function for getting the choices with best scores.

    Args:
        query: A string to match against
        choices: A list or dictionary of choices, suitable for use with
            extract().
        processor: Optional function for transforming choices before matching.
            See extract().
        scorer: Scoring function for extract().
        score_cutoff: Optional argument for score threshold. No matches with
            a score less than this number will be returned. Defaults to 0.
        limit: Optional maximum for the number of elements returned. Defaults
            to 5.

    Returns: A list of (match, score) tuples.
    """
    best_list = extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
    return heapq.nlargest(limit, best_list, key=lambda i: i[1]) if limit is not None else \
        sorted(best_list, key=lambda i: i[1], reverse=True)
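A note on the design as I read it: nlargest is only used when a limit is given; with limit=None the whole scored sequence is sorted, since an unbounded result needs a full sort anyway. The same branch can be exercised standalone:

import heapq

scored = [("apple", 90), ("ape", 60), ("peach", 30)]
limit = 2
best = heapq.nlargest(limit, scored, key=lambda i: i[1]) if limit is not None else \
    sorted(scored, key=lambda i: i[1], reverse=True)
print(best)  # [('apple', 90), ('ape', 60)]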
def top(self, num, key=None):
    """
    Get the top N elements from an RDD.

    Note: It returns the list sorted in descending order.

    >>> sc.parallelize([10, 4, 2, 12, 3]).top(1)
    [12]
    >>> sc.parallelize([2, 3, 4, 5, 6], 2).top(2)
    [6, 5]
    >>> sc.parallelize([10, 4, 2, 12, 3]).top(3, key=str)
    [4, 3, 2]
    """
    def topIterator(iterator):
        yield heapq.nlargest(num, iterator, key=key)

    def merge(a, b):
        return heapq.nlargest(num, a + b, key=key)

    return self.mapPartitions(topIterator).reduce(merge)
def _choose_vacant_home_or_vacant_lot(self):
    """Choose a vacant home to move into or a vacant lot to build on.

    Currently, a person scores all the vacant homes/lots in town and then selects
    one of the top three. TODO: Probabilistically select from all homes/lots using the
    scores to derive likelihoods of selecting each.
    """
    home_and_lot_scores = self._rate_all_vacant_homes_and_vacant_lots()
    if len(home_and_lot_scores) >= 3:
        # Pick from top three
        top_three_choices = heapq.nlargest(3, home_and_lot_scores, key=home_and_lot_scores.get)
        if random.random() < 0.6:
            choice = top_three_choices[0]
        elif random.random() < 0.9:
            choice = top_three_choices[1]
        else:
            choice = top_three_choices[2]
    elif home_and_lot_scores:
        choice = list(home_and_lot_scores)[0]
    else:
        choice = None
    return choice
def _init_acquire_currently_occupied_lot(self):
    """If there are no vacant lots in town, acquire a lot and demolish the home currently on it."""
    lot_scores = self._rate_all_occupied_lots()
    if len(lot_scores) >= 3:
        # Pick from top three
        top_three_choices = heapq.nlargest(3, lot_scores, key=lot_scores.get)
        if random.random() < 0.6:
            choice = top_three_choices[0]
        elif random.random() < 0.9:
            choice = top_three_choices[1]
        else:
            choice = top_three_choices[2]
    elif lot_scores:
        choice = max(lot_scores)
    else:
        raise Exception("A company attempted to secure an *occupied* lot in town but somehow could not.")
    return choice
def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, limit=5):
    """Get a list of the best matches to a collection of choices.

    Convenience function for getting the choices with best scores.

    Args:
        query: A string to match against
        choices: A list or dictionary of choices, suitable for use with
            extract().
        processor: Optional function for transforming choices before matching.
            See extract().
        scorer: Scoring function for extract().
        score_cutoff: Optional argument for score threshold. No matches with
            a score less than this number will be returned. Defaults to 0.
        limit: Optional maximum for the number of elements returned. Defaults
            to 5.

    Returns: A list of (match, score) tuples.
    """
    best_list = extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
    return heapq.nlargest(limit, best_list, key=lambda i: i[1]) if limit is not None else \
        sorted(best_list, key=lambda i: i[1], reverse=True)
def most_common(self, n=None):
    '''List the n most common elements and their counts from the most
    common to the least. If n is None, then list all element counts.

    >>> Counter('abcdeabcdabcaba').most_common(3)
    [('a', 5), ('b', 4), ('c', 3)]

    '''
    # Emulate Bag.sortedByCount from Smalltalk
    if n is None:
        return sorted(self.items(), key=_itemgetter(1), reverse=True)
    return _heapq.nlargest(n, self.items(), key=_itemgetter(1))
def most_common(self, n=None):
    '''List the n most common elements and their counts from the most
    common to the least. If n is None, then list all element counts.

    >>> Counter('abcdeabcdabcaba').most_common(3)
    [('a', 5), ('b', 4), ('c', 3)]

    '''
    # Emulate Bag.sortedByCount from Smalltalk
    if n is None:
        return sorted(self.iteritems(), key=_itemgetter(1), reverse=True)
    return _heapq.nlargest(n, self.iteritems(), key=_itemgetter(1))
def global_search(cls, text, limit, menu='ir.ui.menu'):
    """
    Search on models for text including menu
    Returns a list of tuples (ratio, model, model_name, id, name, icon)
    The size of the list is limited to limit
    """
    pool = Pool()
    ModelAccess = pool.get('ir.model.access')

    if not limit > 0:
        raise ValueError('limit must be > 0: %r' % (limit,))

    models = cls.search(['OR',
            ('global_search_p', '=', True),
            ('model', '=', menu),
            ])
    access = ModelAccess.get_access([m.model for m in models])
    s = StringMatcher()
    if isinstance(text, str):
        text = text.decode('utf-8')
    s.set_seq2(text)

    def generate():
        for model in models:
            if not access[model.model]['read']:
                continue
            Model = pool.get(model.model)
            if not hasattr(Model, 'search_global'):
                continue
            for record, name, icon in Model.search_global(text):
                if isinstance(name, str):
                    name = name.decode('utf-8')
                s.set_seq1(name)
                yield (s.ratio(), model.model, model.rec_name,
                    record.id, name, icon)
    return heapq.nlargest(int(limit), generate())
def closest(self, w, n=10):
    """
    Assumes the vectors have been normalized.
    """
    scores = self.m.dot(self.represent(w))
    return heapq.nlargest(n, zip(scores, self.iw))
def closest_contexts(self, w, n=10):
    scores = self.ec.m.dot(self.ew.represent(w))
    pairs = zip(scores, self.ec.iw)[1:]
    return heapq.nlargest(n, pairs)
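Both methods rely on nlargest comparing (score, word) tuples by score first; a minimal check (my example):

import heapq

scores = [0.2, 0.9, 0.5]
words = ['cat', 'dog', 'fish']
print(heapq.nlargest(2, zip(scores, words)))  # [(0.9, 'dog'), (0.5, 'fish')]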