def docs_to_networkx(dataset, cats, window_size=2, vocabulary_creation=True):
    """Convert every document of a dataset into a graph-of-words networkx graph.

    Two on-disk layouts are supported under ``./datasets/<dataset>/``:
      * a single ``train.txt`` file where each line is ``<label><text>``
        (the label is read as the first character of the line);
      * one sub-directory per category named in ``cats``, one file per document.

    Parameters
    ----------
    dataset : str
        Name of the dataset directory under ``./datasets/``.
    cats : dict
        Maps category directory names to class labels (directory layout only).
    window_size : int, optional
        Sliding-window size forwarded to ``terms_to_graph``.
    vocabulary_creation : bool, optional
        When True and no vocabulary file exists yet, collect all extracted
        terms and write a vocabulary file into the dataset directory.

    Returns
    -------
    tuple[list, list]
        The list of networkx graphs and the parallel list of labels.
    """
    ds = './datasets/%s/' % dataset
    # BUGFIX: the vocabulary file lives inside the dataset directory.  The
    # original code tested the literal path "ds/vocab.txt" (never true) and
    # later wrote through an undefined name `fname` (NameError).
    vocab_path = os.path.join(ds, 'vocab.txt')
    Gs = []
    labels = []
    vocab_creation = vocabulary_creation and not os.path.exists(vocab_path)
    words = []  # accumulated terms for vocabulary creation
    # Hoist the stopword list out of the per-document loop; it is invariant.
    stop = stopwords.words('english')

    # Layout detection: the presence of a train.txt file selects the
    # single-file layout, otherwise the per-category directory layout is used.
    if any('train.txt' in doc for doc in os.listdir(ds)):
        with open(ds + '/train.txt', 'r', encoding='iso-8859-1') as doc:
            dc = 1
            for line in doc:
                # NOTE(review): assumes the class label is exactly the first
                # character of the line — confirm against the dataset format.
                label = line[0]
                labels.append(label)
                terms = extract_terms_from_sentence(line[1:],
                                                    stopwords=stop,
                                                    lemmatize=True,
                                                    stem=True,
                                                    only_N_J=True)
                if vocab_creation:
                    words.extend(terms)
                graph = terms_to_graph(terms, window_size)
                G = graph_to_networkx(graph, name=label + '_' + str(dc))
                # BUGFIX: use the networkx 2.x keyword signature, consistent
                # with the directory branch below; the old positional order
                # (G, name, values) breaks under networkx 2.x.
                nx.set_node_attributes(G, name='label',
                                       values=dict(zip(G.nodes(), G.nodes())))
                Gs.append(G)
                dc += 1
    else:
        for cat in cats.keys():
            for doc in os.listdir(ds + cat):
                terms = extract_terms_from_file(ds + cat + '/' + doc,
                                                stopwords=stop,
                                                lemmatize=True,
                                                stem=True,
                                                only_N_J=True)
                if vocab_creation:
                    words.extend(terms)
                graph = terms_to_graph(terms, window_size)
                G = graph_to_networkx(graph, name=cat + doc.split('.')[0])
                # Node labels are the node identifiers themselves.
                nx.set_node_attributes(G, name='label',
                                       values=dict(zip(G.nodes(), G.nodes())))
                Gs.append(G)
                labels.append(cats[cat])

    if vocab_creation:
        vocab = dict(Counter(words))
        # BUGFIX: was `create_vocabulary_file(fname, vocab)` with `fname`
        # undefined; write the vocabulary next to the dataset it describes.
        create_vocabulary_file(vocab_path, vocab)
    return Gs, labels
# needs fix or discard
# Source: graph_of_words.py (file source code)
# Language: python
# Page metadata (from scraped listing): reads 20, favorites 0, likes 0, comments 0
# Comment list
# Table of contents