def get_named_entities(documents, mincount=10, threshold=0.7):
    '''
    Find words that are capitalized most of the time and return them as
    named entities (NE).

    Every document is split into word tokens with a regular expression;
    tokens that directly follow ``#`` or ``@`` are not matched. Occurrences
    are counted case-insensitively (by the lower-cased word), together with
    how many of those occurrences start with an uppercase character.

    documents: iterable of text strings.
    mincount: minimum total number of occurrences a word needs in order to
        be considered at all.
    threshold: a word is reported when the fraction of its occurrences that
        start with an uppercase character is strictly greater than this
        value (default 0.7, i.e. more than 70% of the time).

    Returns a list of lower-cased words.
    '''
    word_count = defaultdict(int)
    word_capital = defaultdict(int)
    # (?u) is the Unicode flag for \w and \b; the look-behind stops a token
    # from starting right after '#' or '@'.
    tp = re.compile(r'(?u)(?<![#@])\b\w+\b')
    for doc in documents:
        for word in tp.findall(doc):
            key = word.lower()
            word_count[key] += 1
            if word[0].isupper():
                word_capital[key] += 1
    # .items() works on Python 2 and 3 (iteritems() is Python 2 only).
    # .get() avoids inserting zero entries into the defaultdict, and float()
    # keeps the ratio a true division under Python 2 as well.
    return [
        word
        for word, count in word_count.items()
        if count >= mincount
        and float(word_capital.get(word, 0)) / count > threshold
    ]
# Comment list
# Article table of contents