def build_dataset(data_file):
    """Read a tab-separated annotated corpus and build four ``Dataset`` objects.

    The file is read as UTF-8. Each non-empty line that does not start with
    ``#`` is split on tabs; column 0 is the token id, column 1 the word form
    (lower-cased before lookup), and columns 3 and 4 the UPOS and XPOS tags.
    Rows whose id column contains ``-`` or ``.`` are skipped (presumably
    CoNLL-U multiword-token ranges and empty nodes -- confirm against the
    corpus format). An empty line ends the current sentence.

    Lookups use the module-level tables ``word2id``, ``char2id``,
    ``word2cluster``, ``upos2id`` and ``xpos2id``. Missing words and
    characters map to 1; missing clusters and tags map to 0.

    Args:
        data_file: Path of the corpus file to read.

    Returns:
        A 4-tuple ``(upos_word_dataset, xpos_word_dataset, char_dataset,
        cluster_dataset)`` built as ``Dataset(words, upos)``,
        ``Dataset(words, xpos)``, ``Dataset(chars, upos)`` and
        ``Dataset(clusters, upos)``, where each argument is a list with one
        entry per sentence.

    Raises:
        ValueError: If a token row has fewer than five tab-separated columns
            (the UPOS/XPOS unpacking fails).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Function-scope import so the file's import block needs no change;
    # io.open behaves the same on Python 2.6+ and Python 3.
    import io

    # Strip only ASCII whitespace, matching what the original byte-level
    # strip() removed, so non-ASCII spaces inside fields are left intact.
    ascii_whitespace = u' \t\n\r\x0b\x0c'

    sent_words_list = []
    sent_chars_list = []
    sent_clusters_list = []
    sent_upos_list = []
    sent_xpos_list = []

    words_list = []
    chars_list = []
    clusters_list = []
    upos_list = []
    xpos_list = []

    # The context manager guarantees the handle is closed; decoding happens
    # once at the I/O boundary instead of per line.
    with io.open(data_file, encoding='utf8') as data:
        for raw_line in data:
            line = raw_line.strip(ascii_whitespace)

            if not line:
                # Sentence boundary: store the accumulated sentence and
                # start fresh lists for the next one.
                sent_words_list.append(words_list)
                sent_chars_list.append(chars_list)
                sent_clusters_list.append(clusters_list)
                sent_xpos_list.append(xpos_list)
                sent_upos_list.append(upos_list)
                words_list = []
                chars_list = []
                clusters_list = []
                upos_list = []
                xpos_list = []
                continue

            if line[0] == u'#':
                # Comment / metadata line.
                continue

            tokens = line.split(u'\t')
            token_id = tokens[0]
            if u'-' in token_id or u'.' in token_id:
                # Not a plain token row (see docstring); skip it.
                continue

            word = tokens[1].lower()
            words_list.append(word2id.get(word, 1))
            chars_list.append([char2id.get(char, 1) for char in word])
            clusters_list.append(word2cluster.get(word, 0))

            upos, xpos = tokens[3:5]
            upos_list.append(upos2id.get(upos, 0))
            xpos_list.append(xpos2id.get(xpos, 0))

    # A file that does not end with a blank line would otherwise silently
    # lose its final sentence.
    if words_list:
        sent_words_list.append(words_list)
        sent_chars_list.append(chars_list)
        sent_clusters_list.append(clusters_list)
        sent_xpos_list.append(xpos_list)
        sent_upos_list.append(upos_list)

    upos_word_dataset = Dataset(sent_words_list, sent_upos_list)
    xpos_word_dataset = Dataset(sent_words_list, sent_xpos_list)
    char_dataset = Dataset(sent_chars_list, sent_upos_list)
    cluster_dataset = Dataset(sent_clusters_list, sent_upos_list)
    return upos_word_dataset, xpos_word_dataset, char_dataset, cluster_dataset
评论列表
文章目录