def read_data(source_path, word_alphabet, pos_alphabet, type_alphabet, max_size=None, normalize_digits=True):
    """Read instances from ``source_path`` and group them into length buckets.

    Instances are pulled one at a time from a ``Reader`` built from the
    given path and alphabets. Each instance is appended to the first bucket
    in the module-level ``_buckets`` whose size is at least the instance
    length.

    Args:
        source_path: Passed through to ``Reader``.
        word_alphabet: Passed through to ``Reader``.
        pos_alphabet: Passed through to ``Reader``.
        type_alphabet: Passed through to ``Reader``.
        max_size: Maximum number of instances to consume. A falsy value
            (``None`` or ``0``) means no limit.
        normalize_digits: Forwarded to ``reader.getNext``.

    Returns:
        A list with one sub-list per entry of ``_buckets``. Each stored item
        is ``[word_ids, pos_ids, heads, type_ids]`` taken from the instance.

    Note:
        An instance longer than every bucket size is not stored, but it is
        still included in the logged total.
    """
    logger = utils.get_logger("Reading Data")
    data = [[] for _ in _buckets]
    counter = 0
    reader = Reader(source_path, word_alphabet, pos_alphabet, type_alphabet)
    # Close the reader even if reading or bucketing raises part-way through.
    try:
        inst = reader.getNext(normalize_digits)
        while inst is not None and (not max_size or counter < max_size):
            counter += 1
            if counter % 10000 == 0:
                logger.info("reading data: %d" % counter)

            inst_size = inst.length()
            # First fitting bucket wins; presumably _buckets is sorted in
            # ascending order so this is the smallest fit -- TODO confirm.
            for bucket_id, bucket_size in enumerate(_buckets):
                if inst_size <= bucket_size:
                    data[bucket_id].append([inst.word_ids, inst.pos_ids, inst.heads, inst.type_ids])
                    break

            inst = reader.getNext(normalize_digits)
    finally:
        reader.close()

    logger.info("Total number of data: %d" % counter)
    return data
# 评论列表 (comment list)
# 文章目录 (article contents)