graph_of_words.py source code

python

Project: TextAsGraphClassification    Author: NightmareNyx
import os
from collections import Counter

import networkx as nx
from nltk.corpus import stopwords

# Helpers such as extract_terms_from_sentence, extract_terms_from_file,
# terms_to_graph, graph_to_networkx and create_vocabulary_file are assumed
# to be defined elsewhere in the project.


def docs_to_networkx(dataset, cats, window_size=2, vocabulary_creation=True):
    ds = './datasets/%s/' % dataset
    Gs = []
    labels = []
    type_ = 2
    vocab_creation = vocabulary_creation
    words = []  # for vocabulary

    for doc in os.listdir(ds):
        if 'train.txt' in doc:
            type_ = 1

    if type_ == 1:
        if os.path.exists(ds + 'vocab.txt'):
            vocab_creation = False
        with open(ds + 'train.txt', 'r', encoding='iso-8859-1') as doc:
            dc = 1
            for line in doc:
                label = line[0]
                labels.append(label)
                terms = extract_terms_from_sentence(line[1:],
                                                    stopwords=stopwords.words('english'),
                                                    lemmatize=True,
                                                    stem=True,
                                                    only_N_J=True)
                if vocab_creation:
                    words.extend(terms)
                graph = terms_to_graph(terms, window_size)
                G = graph_to_networkx(graph, name=label + '_' + str(dc))
                # G = nx.convert_node_labels_to_integers(G, first_label=1, label_attribute='label')
                nx.set_node_attributes(G, name='label', values=dict(zip(G.nodes(), G.nodes())))
                Gs.append(G)
                dc += 1
    else:
        if os.path.exists(ds + 'vocab.txt'):
            vocab_creation = False
        for cat in cats.keys():
            for doc in os.listdir(ds + cat):
                terms = extract_terms_from_file(ds + cat + '/' + doc,
                                                stopwords=stopwords.words('english'),
                                                lemmatize=True,
                                                stem=True,
                                                only_N_J=True)
                if vocab_creation:
                    words.extend(terms)
                graph = terms_to_graph(terms, window_size)
                G = graph_to_networkx(graph, name=cat + doc.split('.')[0])
                # G = nx.convert_node_labels_to_integers(G, first_label=1, label_attribute='label')
                nx.set_node_attributes(G, name='label', values=dict(zip(G.nodes(), G.nodes())))
                Gs.append(G)
                labels.append(cats[cat])

    if vocab_creation:
        vocab = dict(Counter(words))
        create_vocabulary_file(ds + 'vocab.txt', vocab)  # assumed output path, matching the vocab.txt check above

    return Gs, labels


# needs fix or discard
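
The listing relies on helpers defined elsewhere in the project (extract_terms_from_sentence, extract_terms_from_file, terms_to_graph, graph_to_networkx, create_vocabulary_file). For context, here is a minimal sketch of what terms_to_graph and graph_to_networkx could look like, assuming the usual sliding-window co-occurrence graph-of-words; the project's real implementation may differ, for example by building the graph with a different library and only converting it to networkx in graph_to_networkx.

import itertools

import networkx as nx


def terms_to_graph(terms, window_size=2):
    # Sketch: nodes are the unique terms; an edge links two terms whenever
    # they co-occur inside a sliding window of the given size, with a
    # 'weight' attribute counting the co-occurrences.
    g = nx.Graph()
    g.add_nodes_from(terms)
    for i in range(len(terms)):
        window = terms[i:i + window_size]
        for u, v in itertools.combinations(window, 2):
            if u == v:
                continue
            if g.has_edge(u, v):
                g[u][v]['weight'] += 1
            else:
                g.add_edge(u, v, weight=1)
    return g


def graph_to_networkx(graph, name=''):
    # Sketch: the graph above is already a networkx object, so this step only
    # records the document name; in the original project it may instead
    # convert from another graph representation.
    graph.graph['name'] = name
    return graph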
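A hypothetical call, for illustration only: the dataset name and the category-to-label map below are made up, and cats is only consulted when the dataset directory has no train.txt (one sub-folder per category instead).

# Hypothetical usage; 'mydataset' and the category map are illustrative.
cats = {'pos': 1, 'neg': 0}
Gs, labels = docs_to_networkx('mydataset', cats, window_size=2)
print('%d graphs, %d labels' % (len(Gs), len(labels)))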