def load_data(path='conll2000.zip', min_freq=2):
    """Download and parse the CoNLL-2000 chunking corpus.

    Fetches the corpus zip (via ``get_file``), parses the train/test splits,
    builds a word vocabulary from the training split, and encodes both splits
    with ``_process_data``.

    Args:
        path: Local filename for the cached corpus archive.
        min_freq: Minimum training-set frequency for a word to enter the
            vocabulary; rarer words map to ``'<unk>'``.

    Returns:
        Tuple ``(train, test, (vocab, pos_tags, chunk_tags))`` where ``train``
        and ``test`` are the encoded splits and the final element holds the
        lookup tables used for encoding.
    """
    path = get_file(path, origin='https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/conll2000.zip')
    # Context manager guarantees the archive is closed even if parsing raises
    # (the original leaked the handle on error and also had a debug print here).
    with ZipFile(path, 'r') as archive:
        train = _parse_data(archive.open('conll2000/train.txt'))
        test = _parse_data(archive.open('conll2000/test.txt'))
    # Vocabulary is built from the *training* split only, lowercased.
    # Index 0 is reserved for padding, index 1 for out-of-vocabulary words.
    word_counts = Counter(row[0].lower() for sample in train for row in sample)
    vocab = ['<pad>', '<unk>'] + [w for w, f in word_counts.items() if f >= min_freq]
    # Tag inventories come from both splits so test-only tags are representable.
    pos_tags = sorted({row[1] for sample in train + test for row in sample})    # in alphabetic order
    chunk_tags = sorted({row[2] for sample in train + test for row in sample})  # in alphabetic order
    train = _process_data(train, vocab, pos_tags, chunk_tags)
    test = _process_data(test, vocab, pos_tags, chunk_tags)
    return train, test, (vocab, pos_tags, chunk_tags)
# (stray page text from the source blog removed: 评论列表 "comment list" / 文章目录 "table of contents")