def _join_stemmed_fields(line, marker_match):
    """Return the fields that follow the ``O`` marker in *line*, space-joined.

    *marker_match* is the match of the marker pattern against *line*; its
    group 1 is everything that precedes the ``O`` marker.
    """
    # Slice at the match offset instead of splitting the line on the prefix
    # text: splitting raises ValueError when the prefix is empty and cuts the
    # line short when the prefix text happens to occur again later on.
    after_marker = line[marker_match.end(1) + 1:]
    # NOTE(review): only the text up to the next 'O' character is kept, so
    # stemmed text containing an uppercase 'O' is truncated -- confirm that
    # this is intended.
    fields = after_marker.split('O', 1)[0].split('\t')
    # Fields that are exactly one space (the padding between the marker and
    # the first tab) are dropped; everything else is kept as is.
    joined = ' '.join(field for field in fields if field != ' ')
    # Drop the trailing newline and anything after it.
    return joined.split('\n')[0]


def main():
    """Parse the export file and run NMF and LDA feature extraction on it.

    For every line of the export file two strings are extracted: the text
    captured between the ``<path>.csv<TAB><digits><TAB>`` prefix and a run of
    six tab-separated numbers, and the text that follows the ``O`` marker.
    Occurrences of each string are counted, the counts are passed to
    ``sort_dct_by_value``, and both lists of strings are passed to
    ``NMF_feature_extraction`` and ``LDA_feature_extraction``.

    Reading stops at the first line that does not match either pattern; that
    line is printed so it can be corrected in the export file.
    """
    r_pre = "[your file path]/all_purpose"
    f_path = "[your file path]/all_purpose_export.txt"
    # The path prefix is literal text, so it is escaped before being spliced
    # into the pattern; otherwise characters such as '[', '.', or '\' in the
    # path would be interpreted as regex syntax.
    p1 = re.compile(re.escape(r_pre) + r"\.csv\t\d+\t(.*?)(\t\d+){6}")
    p2 = re.compile(r"(.*?)O\s*\t(.*?)")
    extracted_combo_dct = {}
    stemmed_extracted_combo_dct = {}
    extracted_combo_lst = []
    stemmed_extracted_combo_lst = []
    n_top_words = 3
    n_topics = 20
    n_features = 50

    # The context manager closes the file even if parsing raises.
    with open(f_path) as f:
        for l in f:
            r1 = p1.search(l)
            r2 = p2.search(l)
            if r1 is None or r2 is None:
                # Debugging aid: show the malformed line (for example one
                # that is missing its " O" marker) and stop reading.
                print(l)
                break
            m1 = r1.group(1).replace('\t', ' ')
            m2 = _join_stemmed_fields(l, r2)
            extracted_combo_dct[m1] = extracted_combo_dct.get(m1, 0) + 1
            stemmed_extracted_combo_dct[m2] = (
                stemmed_extracted_combo_dct.get(m2, 0) + 1
            )
            extracted_combo_lst.append(m1)
            stemmed_extracted_combo_lst.append(m2)

    sort_dct_by_value(extracted_combo_dct)
    sort_dct_by_value(stemmed_extracted_combo_dct)
    n_samples = len(extracted_combo_lst)
    n_stemmed_samples = len(stemmed_extracted_combo_lst)

    # using NMF feature extraction
    NMF_feature_extraction(
        extracted_combo_lst, n_samples, n_features, n_topics, n_top_words)
    NMF_feature_extraction(
        stemmed_extracted_combo_lst, n_stemmed_samples, n_features, n_topics,
        n_top_words)

    # using LDA feature extraction
    LDA_feature_extraction(
        extracted_combo_lst, n_samples, n_features, n_topics, n_top_words)
    LDA_feature_extraction(
        stemmed_extracted_combo_lst, n_stemmed_samples, n_features, n_topics,
        n_top_words)
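# Comment list (web page navigation text, not part of the script)
# Table of contents (web page navigation text, not part of the script)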