def get_tech(text):
    """Return the known technology tags that are mentioned in *text*.

    The text is split into sentences and each sentence into tokens. Within
    every sentence, single tokens as well as runs of two and three
    consecutive tokens joined with ``-`` are compared against the
    module-level ``tags`` collection, so a multi-word tag written with
    hyphens can match words that appear separately in the text. Matching
    is exact string equality (case-sensitive), and n-grams never span a
    sentence boundary.

    NOTE(review): ``tags`` is defined outside this function; per the
    original docstring it holds the top 1000 StackOverflow tags - confirm.
    ``sent_tokenize``, ``word_tokenize`` and ``ngrams`` are also resolved
    from module scope (presumably NLTK - verify against the file imports).

    Args:
        text: The text to scan.

    Returns:
        A list of the distinct matching tags. The list is built from a
        set, so its order is unspecified.
    """
    found = set()
    for sentence in sent_tokenize(text):
        tokens = word_tokenize(sentence)
        # Build each candidate collection as a set so that every
        # ``tag in candidates`` test is a hash lookup instead of a linear
        # scan of a list repeated for every tag.
        candidate_sets = (
            set(tokens),
            {'-'.join(gram) for gram in ngrams(tokens, 2)},
            {'-'.join(gram) for gram in ngrams(tokens, 3)},
        )
        # Iterate over ``tags`` (not the candidates) so the values and the
        # insertion order of the result match the tag collection itself.
        for candidates in candidate_sets:
            found.update(tag for tag in tags if tag in candidates)
    return list(found)
评论列表
文章目录