# One-hot encodings for the three SemEval sentiment classes, in the order
# (positive, neutral, negative). Stored as tuples so the shared table can
# never be mutated through a returned label.
_SEMEVAL_ONE_HOT = {
    'positive': (1, 0, 0),
    'neutral': (0, 1, 0),
    'negative': (0, 0, 1),
}


def _read_semeval_lines(paths):
    """Return the lines of every file in *paths*, concatenated in order."""
    lines = []
    for path in paths:
        # The context manager closes each handle as soon as it is read,
        # instead of leaving it open until garbage collection.
        with open(path) as handle:
            lines.extend(handle)
    return lines


def _encode_semeval_labels(labels):
    """Map label strings to fresh one-hot lists, warning on unknown labels."""
    encoded = []
    for label in labels:
        one_hot = _SEMEVAL_ONE_HOT.get(label)
        if one_hot is None:
            # Unknown labels are reported and kept as the raw string, so the
            # output stays index-aligned with the tweets.
            print('wrong label in semeval: ' + label)
            encoded.append(label)
        else:
            # Copy into a new list so every sample owns its own label.
            encoded.append(list(one_hot))
    return encoded


def load_data_and_labels_semeval():
    """Load the SemEval tweets and their sentiment labels.

    Reads the tab-separated files under ``./input``. The 2013/2014 files
    carry the label in column 2 and the tweet text in column 3; the 2016
    files carry the label in column 1 and the tweet text in column 2.
    Rows of the 2016 files whose text is the placeholder ``Not Available``
    are dropped.

    Each tweet is passed through ``clean_str`` and then tokenized with
    ``TweetTokenizer(reduce_len=True)`` (both defined outside this block).

    Returns:
        A two-element list ``[x_text, y]``. ``x_text`` holds one tokenizer
        result per tweet, old datasets first and then the new ones. ``y``
        holds the matching labels as one-hot lists: ``[1, 0, 0]`` for
        positive, ``[0, 1, 0]`` for neutral and ``[0, 0, 1]`` for negative.
        A label outside those three is reported on stdout and left in ``y``
        as the raw string.

    Raises:
        IOError/OSError: if one of the input files cannot be opened.
        IndexError: if a row has fewer tab-separated columns than expected.
    """
    # Split every row once and reuse the columns for both text and label.
    old_rows = [line.split('\t') for line in _read_semeval_lines([
        "./input/2013-dev",
        "./input/2013-devtest",
        "./input/2013-train",
        "./input/2014-devtest",
    ])]
    new_rows = [line.split('\t') for line in _read_semeval_lines([
        "./input/2016-train",
        "./input/2016-dev",
        "./input/2016-devtest",
    ])]

    # Filter out invalid tweets from the new dataset. The line ending is
    # stripped before comparing so that a placeholder row is still caught
    # when it ends in '\r\n' or is the last line of a file with no newline.
    new_rows = [row for row in new_rows
                if row[2].rstrip('\r\n') != 'Not Available']

    tk = TweetTokenizer(reduce_len=True)  # handles punctuations
    raw_tweets = [row[3] for row in old_rows] + [row[2] for row in new_rows]
    x_text = [tk.tokenize(clean_str(tweet)) for tweet in raw_tweets]

    raw_labels = [row[2] for row in old_rows] + [row[1] for row in new_rows]
    y = _encode_semeval_labels(raw_labels)

    return [x_text, y]
评论列表
文章目录