def main(dataset='proton-beam-xml'):
    """Load the publications of `dataset` and build their TF-IDF matrix.

    Side effects (module globals):
        mat      -- result of TfidfVectorizer().fit_transform(texts).
        rel      -- relevance labels: the return value of get_relevant() for
                    'proton-beam-xml', otherwise a list of ints aligned with
                    `texts`.
        turk_dic -- only set for datasets other than 'proton-beam-xml':
                    the entries of get_turk_data() restricted to the record
                    ids that are kept.

    Args:
        dataset: 'proton-beam-xml' to read the XML source through
            get_pub_dic_xml(); any other value is passed on to
            get_pub_dic_csv() and get_turk_data().

    Returns:
        A tuple (pub_dic, texts): the id -> text mapping that was used and
        the list of texts in the order they were vectorized.
    """
    global mat, rel, turk_dic

    csv.field_size_limit(430000)

    if dataset == 'proton-beam-xml':
        # The original read an unassigned local `pub_dic` here, which raised
        # UnboundLocalError; bind the loaded mapping to `pub_dic` directly.
        pub_dic = get_pub_dic_xml()
        # NOTE(review): the original comment states the items are already
        # sorted by key; this relies on get_pub_dic_xml() returning an
        # ordered mapping -- confirm, since `rel` must line up with `texts`.
        texts = [text for _, text in pub_dic.items()]
        rel = get_relevant()
    else:
        pub_dic_tmp = get_pub_dic_csv(dataset)
        turk_dic_tmp, rel_dic_tmp = get_turk_data(dataset)

        texts = []
        labels = []
        pub_dic = {}
        turk_dic = {}
        # Keep only the record ids present in all three sources, walking the
        # ids in sorted order so texts, labels and the matrix rows line up.
        for rec_id in sorted(pub_dic_tmp):
            if rec_id in turk_dic_tmp and rec_id in rel_dic_tmp:
                texts.append(pub_dic_tmp[rec_id])
                pub_dic[rec_id] = pub_dic_tmp[rec_id]
                turk_dic[rec_id] = turk_dic_tmp[rec_id]
                labels.append(int(rel_dic_tmp[rec_id]))
        rel = labels

    vectorizer = TfidfVectorizer()
    mat = vectorizer.fit_transform(texts)
    return (pub_dic, texts)
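# NOTE: the two lines that followed here ("Comment list" / "Table of
# contents") appear to be page-navigation text picked up when this code was
# copied from a web page; they are not part of the program.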