def extract_ngrams2(sentences, stemmer, language, N=2):
    '''
    Extract word n-grams from each sentence and return them as strings.

    Parameter Arguments:
    sentences: list of sentences
        ['New York is a city.', 'It has a huge population.']
    stemmer: passed through unchanged to sent2stokens
    language: passed through unchanged to sent2stokens
    N: Length of the n-grams e.g. 1, 2

    return: a flat list of n-grams, each one a single string made of its
    tokens joined by one space. N-grams are built per sentence, so they
    never span two sentences.
    Illustrative output for N=2 (the exact tokens depend on what
    sent2stokens produces -- casing and stemming are not decided here):
        ['new york', 'york is', 'is a', 'a city', 'city .',
         'it has', 'has a', 'a huge', 'huge population', 'population .']
    '''
    # A hyphen directly followed by an optional comma and one whitespace
    # character (e.g. "magister- "): drop the hyphen, keep what follows it.
    # Raw strings are used so the regex escapes are not treated as (invalid)
    # string escapes; compiled once instead of once per sentence.
    dangling_hyphen = re.compile(r'[-](,?\s)')

    ngrams_list = []
    for sent in sentences:
        sent = dangling_hyphen.sub(r'\1', sent)
        tokens = sent2stokens(sent, stemmer, language)
        # Each n-gram is a sequence of string tokens; flatten it to text.
        ngrams_list.extend(' '.join(ngram) for ngram in ngrams(tokens, N))
    return ngrams_list
data_helpers.py 文件源码
python
阅读 28
收藏 0
点赞 0
评论 0
评论列表
文章目录