def __call__(self, tags):
    '''
    Rate the given tags, combine them into multitags and return the
    distinct ones that survive filtering.

    Multitags that compare equal are merged into one entry. The merged
    tag takes the most frequent surface string of its group. When at
    least half of the group's occurrences are proper, the tag is marked
    proper and takes the highest rating seen among those proper
    occurrences. Tags with a one-character string or a non-positive
    rating are dropped. Finally, for each multi-word tag, either its
    shorter word sub-sequences or the tag itself is discarded as
    redundant, depending on their relative frequency.

    @param tags: a list of (preferably stemmed) tags

    @returns: a sorted list of unique (multi)tags; the order is the one
              defined by the tags' own comparison (documented as
              relevance)
    '''
    self.rate_tags(tags)
    multitags = self.create_multitags(tags)

    # keep most frequent version of each tag
    clusters = collections.defaultdict(collections.Counter)
    proper = collections.defaultdict(int)
    ratings = collections.defaultdict(float)

    for t in multitags:
        clusters[t][t.string] += 1
        if t.proper:
            proper[t] += 1
            ratings[t] = max(ratings[t], t.rating)

    term_count = collections.Counter(multitags)

    for t, cnt in term_count.items():
        t.string = clusters[t].most_common(1)[0][0]
        # Integer form of "proper[t] / cnt >= 0.5": the result must not
        # depend on whether '/' is true or floor division.
        if 2 * proper[t] >= cnt:
            t.proper = True
            t.rating = ratings[t]

    # purge duplicates, one-character tags and stopwords
    unique_tags = {t for t in term_count
                   if len(t.string) > 1 and t.rating > 0.0}

    # remove redundant tags
    for t, cnt in term_count.items():
        words = t.stem.split()
        # Every contiguous word sub-sequence strictly shorter than the tag.
        for size in range(1, len(words)):
            for start in range(len(words) - size + 1):
                s = Tag(' '.join(words[start:start + size]))
                sub_count = term_count[s]
                if not sub_count:
                    # The sub-sequence was never counted, so there is no
                    # frequency to compare against (and nothing to
                    # discard); dividing by it used to raise
                    # ZeroDivisionError.
                    continue
                # Integer forms of "cnt / sub_count == 1.0" and
                # "cnt / sub_count >= 0.5".
                if ((cnt == sub_count and t.proper) or
                        (2 * cnt >= sub_count and t.rating > 0.0)):
                    unique_tags.discard(s)
                else:
                    unique_tags.discard(t)

    return sorted(unique_tags)
# Comment list
# Article table of contents