# reverb_purpose_extraction.py -- Python source code snippet

def main():
    """Extract phrase combos from the export file and run topic extraction.

    Reads ``f_path`` line by line and builds two strings per line:

    * the text captured between ``<r_pre>.csv<TAB><digits><TAB>`` and the
      six tab-separated integer fields that follow it, with the tabs
      replaced by spaces;
    * the tab-separated fields found after the first ``O`` that is followed
      by optional whitespace and a tab, up to the next ``O`` character or
      the end of the line, joined with spaces.

    Occurrence counts of both strings are passed to ``sort_dct_by_value``;
    the full lists (duplicates included) are passed to
    ``NMF_feature_extraction`` and ``LDA_feature_extraction``.

    Blank lines are skipped.  Reading stops at the first non-blank line that
    does not match both patterns; that line is printed so the input file can
    be corrected, and processing continues with whatever was collected.

    NOTE(review): the column layout looks like ReVerb extractor output
    (raw triple, offsets, ..., chunk tags, normalized triple) -- confirm.
    """
    r_pre = "[your file path]/all_purpose"
    f_path = "[your file path]/all_purpose_export.txt"

    # r_pre is a literal path prefix, so escape it before embedding it in a
    # regex; raw strings keep "\.", "\d" and "\s" from being treated as
    # (invalid) string escapes.  Compiled once, outside the loop.
    p1 = re.compile(re.escape(r_pre) + r"\.csv\t\d+\t(.*?)(\t\d+){6}")
    p2 = re.compile(r"(.*?)O\s*\t(.*?)")

    extracted_combo_dct = {}
    stemmed_extracted_combo_dct = {}
    extracted_combo_lst = []
    stemmed_extracted_combo_lst = []

    n_top_words = 3
    n_topics = 20
    n_features = 50

    # "with" guarantees the file is closed even if parsing raises.
    with open(f_path) as f:
        for line in f:
            if not line.strip():
                # A blank line cannot match either pattern; skip it.
                continue

            r1 = p1.search(line)
            r2 = p2.search(line)
            if r1 is None or r2 is None:
                # Show the offending line and stop, so the input can be fixed
                # by hand (the original used this to add a missing " O").
                print(line)
                break

            m1 = r1.group(1).replace('\t', ' ')

            # r2.end(1) is the index of the marker "O"; take what follows it
            # up to the next "O".  Slicing by position (rather than splitting
            # the line on the matched prefix) also works when the prefix is
            # empty or happens to recur later in the line.
            tail = line[r2.end(1) + 1:].split('O')[0]
            m2 = ' '.join([e for e in tail.split('\t') if e != ' ']).split('\n')[0]

            extracted_combo_dct[m1] = extracted_combo_dct.get(m1, 0) + 1
            stemmed_extracted_combo_dct[m2] = stemmed_extracted_combo_dct.get(m2, 0) + 1

            extracted_combo_lst.append(m1)
            stemmed_extracted_combo_lst.append(m2)

    sort_dct_by_value(extracted_combo_dct)
    sort_dct_by_value(stemmed_extracted_combo_dct)

    n_samples = len(extracted_combo_lst)
    n_stemmed_samples = len(stemmed_extracted_combo_lst)

    # using NMF feature extraction
    NMF_feature_extraction(extracted_combo_lst, n_samples, n_features, n_topics, n_top_words)
    NMF_feature_extraction(stemmed_extracted_combo_lst, n_stemmed_samples, n_features, n_topics, n_top_words)

    # using LDA feature extraction
    LDA_feature_extraction(extracted_combo_lst, n_samples, n_features, n_topics, n_top_words)
    LDA_feature_extraction(stemmed_extracted_combo_lst, n_stemmed_samples, n_features, n_topics, n_top_words)