def load_from_export_format(export_file, encoding):
    """Load every sentence of an export-format treebank file as a tree.

    The file is read line by line.  A line starting with ``#BOS`` opens a
    sentence and a line starting with ``#EOS`` closes it; on both, the second
    space-separated field is parsed as the integer sentence id.  The lines in
    between are buffered and handed to ``_give_me_a_tree_from_export_format``.
    Each resulting tree gets ``attributes["sent_id"]`` set to the sentence id
    and is passed to ``HeadFinder.mark_head``; the ``HeadFinder`` is built
    from the ``negra.headrules`` file in the directory of this source file.

    A progress message is written to ``stderr`` whenever a closed sentence
    has an id that is a multiple of 1000.

    Args:
        export_file: Path of the export-format file to read.
        encoding: Text encoding passed to ``codecs.open``.

    Returns:
        A list with one entry per ``#BOS``/``#EOS`` pair, in file order: the
        tree built for that sentence, or ``None`` when the pair enclosed no
        lines.

    Raises:
        AssertionError: If the id on an ``#EOS`` line differs from the id of
            the currently open sentence (including an ``#EOS`` with no
            preceding ``#BOS``).
        ValueError: If a line that is neither ``#BOS`` nor ``#EOS`` appears
            outside of a sentence, or if a sentence id is not an integer.
    """
    trees = []
    # Directory containing this source file; the head rules are looked up
    # there rather than relative to the current working directory.
    script_folder = os.path.realpath(
        os.path.abspath(
            os.path.split(inspect.getfile(inspect.currentframe()))[0]))
    hf = HeadFinder(join(script_folder, "negra.headrules"))

    with codecs.open(export_file, encoding=encoding) as fh:
        sent_id = None  # id of the currently open sentence; None outside one
        buffered_lines = []
        for line in fh:
            if line.startswith("#BOS"):
                # int() ignores surrounding whitespace, so a trailing newline
                # on the id field is harmless.
                sent_id = int(line.split(" ")[1])
            elif line.startswith("#EOS"):
                closing_id = int(line.split(" ")[1])
                # Raised explicitly (not via ``assert``) so the check is not
                # stripped under ``python -O``; the exception type is kept
                # as AssertionError so existing callers see no difference.
                if sent_id != closing_id:
                    raise AssertionError(
                        "#EOS id %r does not match open #BOS id %r in %r"
                        % (closing_id, sent_id, export_file))
                if buffered_lines:
                    tree = _give_me_a_tree_from_export_format(buffered_lines)
                    tree.attributes["sent_id"] = sent_id
                    hf.mark_head(tree)
                    trees.append(tree)
                else:
                    # Keep a placeholder so list positions still correspond
                    # to sentences even when one is empty.
                    trees.append(None)
                if sent_id % 1000 == 0:
                    print("loaded %d trees" % sent_id, file=stderr)
                    stderr.flush()
                sent_id = None
                buffered_lines = []
            elif sent_id is not None:
                buffered_lines.append(line)
            else:
                # ValueError is still an Exception, so ``except Exception``
                # handlers written against the old code keep working.
                raise ValueError(
                    "line outside of a #BOS/#EOS sentence in %r: %r"
                    % (export_file, line))
    return trees
# Comment list
# Table of contents