def test_homonym(H, sent, features, C=1.0):
X_0 = features(matching(sent, H[0]))
X_1 = features(matching(sent, H[1]))
y_0 = numpy.zeros(len(X_0))
y_1 = numpy.ones(len(X_1))
X = normalize(numpy.vstack([X_0, X_1]), norm='l2')
y = numpy.hstack([y_0, y_1])
classifier = LogisticRegression(C=C)
fold = StratifiedKFold(y, n_folds=10)
score = []
count = []
for tr, te in fold:
X_tr, X_te = X[tr], X[te]
y_tr, y_te = y[tr], y[te]
classifier.fit(X_tr, y_tr)
score.append(sum(classifier.predict(X_te) == y_te))
count.append(len(y_te))
score = numpy.array(score, dtype='float')
count = numpy.array(count, dtype='float')
result = {'word1_count': len(y_0),
'word2_count': len(y_1),
'majority': 1.0 * max(len(y_0),len(y_1))/len(y),
'kfold_acc': score/count }
return result
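# A minimal, self-contained sketch of the pattern above: l2-normalize the
# feature matrix, then score a LogisticRegression with stratified k-fold
# cross-validation. It uses random stand-in data and the current scikit-learn
# API (StratifiedKFold(n_splits=...).split(X, y) rather than the older
# StratifiedKFold(y, n_folds=...) call used in test_homonym); it is not the
# original feature pipeline.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import normalize

rng = np.random.RandomState(0)
X = normalize(rng.rand(100, 20), norm='l2')      # each row scaled to unit l2 norm
y = np.hstack([np.zeros(60), np.ones(40)])       # two senses of the homonym

clf = LogisticRegression(C=1.0)
fold_acc = []
for tr, te in StratifiedKFold(n_splits=10).split(X, y):
    clf.fit(X[tr], y[tr])
    fold_acc.append(clf.score(X[te], y[te]))
print('mean k-fold accuracy: %.3f' % np.mean(fold_acc))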
def flavor_profile(df,ingr,comp,ingr_comp):
sorted_ingredients = df.columns
underscore_ingredients=[]
for item in sorted_ingredients:
underscore_ingredients.append(item.replace(' ','_'))
print len(underscore_ingredients), len(sorted_ingredients)
ingr_total = ingr_comp.join(ingr,how='right',on='# ingredient id')
ingr_total = ingr_total.join(comp,how='right',on='compound id')
ingr_pivot = pd.crosstab(ingr_total['ingredient name'],ingr_total['compound id'])
ingr_flavor = ingr_pivot[ingr_pivot.index.isin(underscore_ingredients)]
df_flavor = df.values.dot(ingr_flavor.values)
print df.shape, df_flavor.shape
return df_flavor
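# flavor_profile above boils down to: cross-tabulate ingredient/compound pairs
# into a 0/1 matrix, align its rows with the recipe matrix columns, and take a
# dot product. A toy version with two recipes and three invented ingredients
# (the data frames here are made up purely for illustration):
import numpy as np
import pandas as pd

pairs = pd.DataFrame({'ingredient name': ['apple', 'apple', 'beef', 'basil'],
                      'compound id': [1, 2, 2, 3]})
ingr_pivot = pd.crosstab(pairs['ingredient name'], pairs['compound id'])
recipes = pd.DataFrame([[1, 0, 1],     # recipe 1: apple + beef
                        [0, 1, 1]],    # recipe 2: basil + beef
                       columns=['apple', 'basil', 'beef'])
flavor = recipes.values.dot(ingr_pivot.loc[recipes.columns].values)
print(flavor)    # per-recipe counts of compounds 1, 2 and 3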
#normalize flavor matrix with tfidf method
def make_tfidf(arr):
    '''Input: a numpy array of flavor counts, one row per recipe and one column per compound.
    Returns the array re-weighted with the TF-IDF-style scheme below.
    '''
arr2 = arr.copy()
N=arr2.shape[0]
l2_rows = np.sqrt(np.sum(arr2**2, axis=1)).reshape(N, 1)
l2_rows[l2_rows==0]=1
arr2_norm = arr2/l2_rows
arr2_freq = np.sum(arr2_norm>0, axis=0)
arr2_idf = np.log(float(N+1) / (1.0 + arr2_freq)) + 1.0
from sklearn.preprocessing import normalize
tfidf = np.multiply(arr2_norm, arr2_idf)
tfidf = normalize(tfidf, norm='l2', axis=1)
print tfidf.shape
return tfidf
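# Toy illustration of the weighting implemented in make_tfidf above: scale each
# row to unit l2 norm, multiply by a smoothed idf term, then l2-normalize again
# with sklearn. The 4x3 count matrix is invented for the example.
import numpy as np
from sklearn.preprocessing import normalize

counts = np.array([[3, 0, 1],
                   [0, 2, 0],
                   [1, 1, 1],
                   [0, 0, 4]], dtype=float)
N = counts.shape[0]
row_l2 = np.linalg.norm(counts, axis=1, keepdims=True)
row_l2[row_l2 == 0] = 1
tf = counts / row_l2
idf = np.log((N + 1.0) / (1.0 + np.sum(tf > 0, axis=0))) + 1.0
tfidf = normalize(tf * idf, norm='l2', axis=1)
print(np.linalg.norm(tfidf, axis=1))    # every row ends up with unit l2 norm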
def __init__(self, path, words=[], dim=300, normalize=True, **kwargs):
seen = []
vs = {}
for line in open(path):
split = line.split()
w = split[0]
if words == [] or w in words:
if len(split) != dim+1:
continue
seen.append(w)
vs[w] = np.array(map(float, split[1:]), dtype='float32')
self.iw = seen
self.wi = {w:i for i,w in enumerate(self.iw)}
self.m = np.vstack(vs[w] for w in self.iw)
if normalize:
self.normalize()
def get_subembed(self, word_list, normalize=False, restrict_context=True):
"""
Gets subembedding.
"""
w_set = set(self.iw)
valid_w = [word for word in word_list if word in w_set]
new_w_indices = np.array([self.wi[word] for word in valid_w])
if restrict_context:
c_set = set(self.ic)
valid_c = [word for word in word_list if word in c_set]
new_c_indices = np.array([self.ci[word] for word in valid_c])
new_m = self.m[new_w_indices, :]
new_m = new_m[:, new_c_indices]
else:
valid_c = self.ic
new_m = self.m[new_w_indices, :]
return Explicit(new_m, valid_w, valid_c, normalize=normalize)
def get_local_words(preds, vocab, NEs=[], k=50):
"""
given the word probabilities over many coordinates,
first normalize the probability of each word in different
locations to get a probability distribution, then compute
the entropy of the word's distribution over all coordinates
and return the words that are low entropy and are not
named entities.
"""
    # l1-normalize each word's probabilities across locations before computing entropy
normalized_preds = normalize(preds, norm='l1', axis=0)
entropies = stats.entropy(normalized_preds)
sorted_indices = np.argsort(entropies)
sorted_local_words = np.array(vocab)[sorted_indices].tolist()
filtered_local_words = []
NEset = set(NEs)
for word in sorted_local_words:
if word in NEset: continue
filtered_local_words.append(word)
return filtered_local_words[0:k]
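# Toy illustration of the entropy ranking used in get_local_words: rows are
# locations, columns are words. After l1-normalizing each column, a word whose
# probability mass is concentrated in one location gets low entropy and ranks
# first. The 3x3 matrix and the vocabulary are made up.
import numpy as np
from scipy import stats
from sklearn.preprocessing import normalize

preds = np.array([[0.90, 0.30, 0.34],
                  [0.05, 0.40, 0.33],
                  [0.05, 0.30, 0.33]])
vocab = ['yinz', 'hello', 'the']
col_dist = normalize(preds, norm='l1', axis=0)     # each column sums to 1
entropies = stats.entropy(col_dist)                # one entropy per column
print([vocab[i] for i in np.argsort(entropies)])   # most "local" word first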
Source file: DSC-Net-L2-EYaleB.py (project: Deep-subspace-clustering-networks, author: panji1990)
def post_proC(C, K, d, alpha):
# C: coefficient matrix, K: number of clusters, d: dimension of each subspace
C = 0.5*(C + C.T)
r = d*K + 1
U, S, _ = svds(C,r,v0 = np.ones(C.shape[0]))
U = U[:,::-1]
S = np.sqrt(S[::-1])
S = np.diag(S)
U = U.dot(S)
U = normalize(U, norm='l2', axis = 1)
Z = U.dot(U.T)
Z = Z * (Z>0)
L = np.abs(Z ** alpha)
L = L/L.max()
L = 0.5 * (L + L.T)
spectral = cluster.SpectralClustering(n_clusters=K, eigen_solver='arpack', affinity='precomputed',assign_labels='discretize')
spectral.fit(L)
grp = spectral.fit_predict(L) + 1
return grp, L
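# A small usage sketch for post_proC as defined above, assuming its imports
# (svds from scipy.sparse.linalg, normalize from sklearn.preprocessing and
# sklearn's cluster module) are in scope. The block-diagonal random matrix is
# only a stand-in for the self-expression coefficients produced by the network.
import numpy as np

rng = np.random.RandomState(0)
C_demo = np.zeros((60, 60))
for b in range(3):                                  # three 20-point "subspaces"
    C_demo[b * 20:(b + 1) * 20, b * 20:(b + 1) * 20] = rng.rand(20, 20)
labels, affinity = post_proC(C_demo, K=3, d=5, alpha=8)
print(labels.shape, affinity.shape)                 # (60,) (60, 60)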
Source file: DSC-Net-L2-ORL.py (project: Deep-subspace-clustering-networks, author: panji1990)
def post_proC(C, K, d, alpha):
# C: coefficient matrix, K: number of clusters, d: dimension of each subspace
C = 0.5*(C + C.T)
r = min(d*K + 1, C.shape[0]-1)
U, S, _ = svds(C,r,v0 = np.ones(C.shape[0]))
U = U[:,::-1]
S = np.sqrt(S[::-1])
S = np.diag(S)
U = U.dot(S)
U = normalize(U, norm='l2', axis = 1)
Z = U.dot(U.T)
Z = Z * (Z>0)
L = np.abs(Z ** alpha)
L = L/L.max()
L = 0.5 * (L + L.T)
spectral = cluster.SpectralClustering(n_clusters=K, eigen_solver='arpack', affinity='precomputed',assign_labels='discretize')
spectral.fit(L)
grp = spectral.fit_predict(L) + 1
return grp, L
Source file: DSC-Net-L2-COIL20.py (project: Deep-subspace-clustering-networks, author: panji1990)
def post_proC(C, K, d, alpha):
# C: coefficient matrix, K: number of clusters, d: dimension of each subspace
n = C.shape[0]
C = 0.5*(C + C.T)
C = C - np.diag(np.diag(C)) + np.eye(n,n) # for sparse C, this step will make the algorithm more numerically stable
r = d*K + 1
U, S, _ = svds(C,r,v0 = np.ones(n))
U = U[:,::-1]
S = np.sqrt(S[::-1])
S = np.diag(S)
U = U.dot(S)
U = normalize(U, norm='l2', axis = 1)
Z = U.dot(U.T)
Z = Z * (Z>0)
L = np.abs(Z ** alpha)
L = L/L.max()
L = 0.5 * (L + L.T)
spectral = cluster.SpectralClustering(n_clusters=K, eigen_solver='arpack', affinity='precomputed', assign_labels='discretize')
spectral.fit(L)
grp = spectral.fit_predict(L) + 1
return grp, L
Source file: DSC-Net-L2-COIL100.py (project: Deep-subspace-clustering-networks, author: panji1990)
def post_proC(C, K, d, alpha):
# C: coefficient matrix, K: number of clusters, d: dimension of each subspace
C = 0.5*(C + C.T)
r = d*K + 1
U, S, _ = svds(C,r,v0 = np.ones(C.shape[0]))
U = U[:,::-1]
S = np.sqrt(S[::-1])
S = np.diag(S)
U = U.dot(S)
U = normalize(U, norm='l2', axis = 1)
Z = U.dot(U.T)
Z = Z * (Z>0)
L = np.abs(Z ** alpha)
L = L/L.max()
L = 0.5 * (L + L.T)
spectral = cluster.SpectralClustering(n_clusters=K, eigen_solver='arpack', affinity='precomputed',assign_labels='discretize')
spectral.fit(L)
grp = spectral.fit_predict(L) + 1
return grp, L
def rede_neural(X, y):
print("Iniciando treinamento da Rede Neural")
X2 = normalize(X)
clf = MLPClassifier(hidden_layer_sizes=(100,50), activation='tanh', algorithm='adam', alpha=1e-5,
learning_rate='constant',tol=1e-8,learning_rate_init=0.0002,
early_stopping=True,validation_fraction=0.2)
kf = KFold(len(y),n_folds=3)
i = 0
for train,test in kf:
start = time.time()
i = i + 1
print("Treinamento",i)
# dividindo dataset em treino e test
#X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=1)
X_train, X_test, y_train, y_test = X2[train], X2[test], y[train], y[test]
# fit
clf.fit(X_train, y_train)
print("score:",clf.score(X_test, y_test),"(",(time.time()-start)/60.0,"minutos )")
return clf
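# rede_neural above targets a pre-0.18 scikit-learn API: MLPClassifier took an
# `algorithm=` argument (now `solver=`) and KFold was built as
# KFold(len(y), n_folds=3). A rough equivalent against the current API, run on
# made-up data, might look like the sketch below; it is not the project's
# original training code.
import numpy as np
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import normalize

rng = np.random.RandomState(0)
X = normalize(rng.rand(300, 10))
y = rng.randint(0, 2, size=300)

clf = MLPClassifier(hidden_layer_sizes=(100, 50), activation='tanh',
                    solver='adam', alpha=1e-5, learning_rate_init=0.0002,
                    tol=1e-8, early_stopping=True, validation_fraction=0.2)
for i, (train, test) in enumerate(KFold(n_splits=3).split(X), start=1):
    clf.fit(X[train], y[train])
    print('fold', i, 'score:', clf.score(X[test], y[test]))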
def vectorize(features, vocab):
""" Transform a features list into a numeric vector
with a given vocab
:type dpvocab: dict
:param dpvocab: vocab for distributional representation
:type projmat: scipy.lil_matrix
:param projmat: projection matrix for disrep
"""
vec = lil_matrix((1, len(vocab)))
for feat in features:
try:
fidx = vocab[feat]
vec[0, fidx] += 1.0
except KeyError:
pass
# Normalization
vec = normalize(vec)
return vec
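# Usage sketch for vectorize above (assuming the module's lil_matrix and
# normalize imports are in scope): the vocabulary maps feature names to column
# indices, unseen features are silently skipped, and the resulting sparse row
# is l2-normalized. The vocabulary and feature list here are invented.
vocab_demo = {'w_the': 0, 'w_cat': 1, 'pos_NN': 2, 'len_3': 3}
row = vectorize(['w_the', 'w_the', 'pos_NN', 'w_unseen'], vocab_demo)
print(row.toarray())    # counts (2, 0, 1, 0) rescaled to unit l2 norm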
def strip_accents_unicode(s):
"""Transform accentuated unicode symbols into their simple counterpart
Warning: the python-level loop and join operations make this
implementation 20 times slower than the strip_accents_ascii basic
normalization.
See also
--------
strip_accents_ascii
Remove accentuated char for any unicode symbol that has a direct
ASCII equivalent.
"""
normalized = unicodedata.normalize('NFKD', s)
if normalized == s:
return s
else:
return ''.join([c for c in normalized if not unicodedata.combining(c)])
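# Example behaviour of strip_accents_unicode: the string is NFKD-decomposed and
# the combining marks are dropped, so accented characters fall back to their
# unaccented base where one exists.
print(strip_accents_unicode(u'élève naïve'))    # prints 'eleve naive'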
def _char_wb_ngrams(self, text_document):
"""Whitespace sensitive char-n-gram tokenization.
Tokenize text_document into a sequence of character n-grams
excluding any whitespace (operating only inside word boundaries)"""
# normalize white spaces
text_document = self._white_spaces.sub(" ", text_document)
min_n, max_n = self.ngram_range
ngrams = []
for w in text_document.split():
w = ' ' + w + ' '
w_len = len(w)
for n in xrange(min_n, max_n + 1):
offset = 0
ngrams.append(w[offset:offset + n])
while offset + n < w_len:
offset += 1
ngrams.append(w[offset:offset + n])
if offset == 0: # count a short word (w_len < n) only once
break
return ngrams
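# The same word-boundary character n-gram scheme is what scikit-learn exposes
# as the 'char_wb' analyzer, so a standalone CountVectorizer shows what
# _char_wb_ngrams produces for a short document: each word is padded with
# spaces and then sliced into n-grams.
from sklearn.feature_extraction.text import CountVectorizer

analyzer = CountVectorizer(analyzer='char_wb', ngram_range=(3, 3)).build_analyzer()
print(analyzer('jumpy fox'))
# [' ju', 'jum', 'ump', 'mpy', 'py ', ' fo', 'fox', 'ox ']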
def fit(self, X_raw, y=None):
cents = self.vect.fit_transform(X_raw)
# print("Largest singular value: {:.2f}".format(
# np.linalg.norm(cents, ord=2)))
# cents = all_but_the_top(cents, 1)
# print("Largest singular value: {:.2f}".format(
# np.linalg.norm(cents, ord=2)))
# print("Renormalizing")
# normalize(cents, copy=False)
# print("Largest singular value: {:.2f}".format(
# np.linalg.norm(cents, ord=2)))
self.centroids = cents
print(' FIT centroids shape', self.centroids.shape)
self._y = y
if self.matching:
self.matching.fit(X_raw)
else:
self.nn.fit(cents)
def test_lsi():
cache_dir = check_cache()
n_components = 2
fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup()
fe.ingest(data_dir, file_pattern='.*\d.txt')
lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
lsi_res, exp_var = lsi.fit_transform(n_components=n_components, alpha=1.0)
assert lsi_res.components_.shape[0] == 5
assert lsi_res.components_.shape[1] == fe.n_features_
assert lsi._load_pars() is not None
lsi._load_model()
X_lsi = lsi._load_features()
assert_allclose(normalize(X_lsi), X_lsi)
lsi.list_models()
lsi.delete()
def test_feature_extraction_tokenization(analyzer, ngram_range, use_hashing):
cache_dir = check_cache()
use_hashing = (use_hashing == 'hashed')
fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup(analyzer=analyzer, ngram_range=ngram_range,
use_hashing=use_hashing)
fe.ingest(data_dir, file_pattern='.*\d.txt')
res2 = fe._load_features(uuid)
assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), "not an array {}".format(res2)
assert np.isfinite(res2.data).all()
assert_allclose(normalize(res2).data, res2.data) # data is l2 normalized
fe.delete()
def test_feature_extraction_weighting(weighting,
use_hashing):
cache_dir = check_cache()
use_hashing = (use_hashing == 'hashed')
fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup(weighting=weighting, use_hashing=use_hashing)
fe.ingest(data_dir, file_pattern='.*\d.txt')
res2 = fe._load_features(uuid)
assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
"not an array {}".format(res2)
assert np.isfinite(res2.data).all()
assert_allclose(normalize(res2).data, res2.data) # data is l2 normalized
fe.delete()
def load_pretrained():
#glove_vec = ["glove_wiki_50","glove_wiki_150","glove_wiki_300"]
glove_vec = ["glove_wiki_300"]
#glove_vec = ["glove_wiki_50"]
filename = 'glove_pretrained.h5'
#import tensorflow as tf
#sess = tf.InteractiveSession()
features, words = load_h5py('glove_wiki_300',filename=root + glove_vec_fold + filename)
filename = 'glove.h5'
features = normalize(np.array(features), axis=1, norm='l2')
with h5py.File(root + glove_vec_fold + filename, "w") as hf:
hf.create_dataset(glove_vec[0], data=features)
string_dt = h5py.special_dtype(vlen=str)
hf.create_dataset(glove_vec[0] + "_words", data=words, dtype=string_dt)
for vec in glove_vec:
data, words = load_h5py(vec, filename=root + glove_vec_fold + "glove.h5")
print(data.shape, words.shape)
time.sleep(5)
def testWord2Vec(testWords,weights,num_display=3):
##Generate inverse word mapping for easy lookup
invWordDict = {v: k for k, v in wordDict.iteritems()}
## Normalize the trained weights for cosine similarity
trainedWeights = normalize(weights,norm = 'l2', axis = 1)
for word in testWords:
try:
embedding = trainedWeights[wordDict[word],:]
prox = np.argsort(np.dot(embedding,trainedWeights.transpose())/np.linalg.norm(embedding))[-num_display:].tolist()
prox.reverse()
print 'Closest word vector (by cosine similarity) for %s : '%word, [invWordDict[item] for item in prox]
except KeyError:
print '"%s" not found in the Trained Word Embeddings. Skipping...'%word
pass
def testWord2Vec(word_list,weights,num_display=3):
##Generate inverse word mapping for easy lookup
invWordDict = {v: k for k, v in wordDict.iteritems()}
## Normalize the trained weights for cosine similarity
trainedWeights = normalize(weights,norm = 'l2', axis = 1)
for word in word_list:
try:
embedding = trainedWeights[wordDict[word],:]
prox = np.argsort(np.dot(embedding,trainedWeights.transpose())/np.linalg.norm(embedding))[-num_display:].tolist()
prox.reverse()
print 'Closest word vector (by cosine similarity) for %s : '%word, [invWordDict[item] for item in prox]
except KeyError:
print '"%s" not found in the Trained Word Embeddings. Skipping...'%word
pass
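# The cosine-similarity lookup in testWord2Vec reduces to: l2-normalize the
# embedding rows, then a dot product against the query row gives cosine
# similarities. A tiny standalone version with a made-up four-word vocabulary
# and random vectors:
import numpy as np
from sklearn.preprocessing import normalize

rng = np.random.RandomState(0)
word_dict = {'king': 0, 'queen': 1, 'car': 2, 'road': 3}
inv_word_dict = {i: w for w, i in word_dict.items()}
W = normalize(rng.rand(4, 50), norm='l2', axis=1)    # rows now have unit norm

query = W[word_dict['king']]
sims = W.dot(query)                                   # cosine similarities
nearest = np.argsort(sims)[::-1][1:3]                 # skip the query word itself
print([inv_word_dict[i] for i in nearest])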
def trainingPCA(features, n_components=256, whiten=True, pca_model_name=None):
print 'loaded features! {}'.format(features.shape)
print np.sqrt(sum(features[0,:]**2))
#print 'Features l2 normalization'
#features = normalize(features)
#print np.sqrt(sum(features[0,:]**2))
    print 'Feature PCA-whitening'
pca_model = PCA(n_components=n_components, whiten=whiten)
features = pca_model.fit_transform(features)
print np.sqrt(sum(features[0,:]**2))
print 'Features l2 normalization'
features = normalize(features)
print np.sqrt(sum(features[0,:]**2))
if pca_model_name is not None:
print 'saving model...'
check_path_file(pca_model_name, create_if_missing=True)
save_obj(pca_model, pca_model_name)
print 'done! {}'.format(pca_model_name)
return pca_model
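# trainingPCA above whitens with PCA and then l2-normalizes every feature row.
# A compact check on random data (the 500x128 matrix is made up) confirms the
# row norms are exactly 1 afterwards.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

rng = np.random.RandomState(0)
feats = rng.rand(500, 128)
whitened = PCA(n_components=64, whiten=True).fit_transform(feats)
whitened = normalize(whitened)                        # default: l2 per row
print(np.allclose(np.linalg.norm(whitened, axis=1), 1.0))    # True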
def gen_network_matrix(num_nodes, net_df, node1, node2, weight, node2index):
"""Generates network adjacency matrix and normalizes it"""
# Transform the first two columns of the DataFrame -- the nodes -- to their indexes
net_df[node1] = net_df[node1].apply(lambda x: node2index[x])
net_df[node2] = net_df[node2].apply(lambda x: node2index[x])
# Create the sparse matrix
network_matrix = sparse.csr_matrix((net_df[weight].values, (net_df[node1].values, net_df[node2].values)),
shape=(num_nodes, num_nodes), dtype=float)
    # Make the adjacency matrix symmetric
network_matrix = (network_matrix + network_matrix.T)
network_matrix.setdiag(0)
# Normalize the rows of network_matrix because we are multiplying vector by matrix (from left)
network_matrix = normalize(network_matrix, norm='l1', axis=1)
return(net_df, network_matrix)
###############################################################################
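# Minimal illustration of the row normalization at the end of
# gen_network_matrix: build a small symmetric sparse adjacency matrix directly
# (the edges are invented) and check that every non-empty row sums to 1 after
# normalize(..., norm='l1', axis=1).
import numpy as np
from scipy import sparse
from sklearn.preprocessing import normalize

rows = np.array([0, 1, 1, 2])
cols = np.array([1, 0, 2, 1])
vals = np.array([2.0, 2.0, 5.0, 5.0])
adj = sparse.csr_matrix((vals, (rows, cols)), shape=(4, 4))
adj = normalize(adj, norm='l1', axis=1)
print(adj.toarray().sum(axis=1))    # [1. 1. 1. 0.]; the isolated node stays zero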
def get_Temporal_Network(edges,firstday,lastday,directed,number_of_nodes,normalized):
    # Dictionary indexed by time offsets 0 .. lastday-firstday; maps each offset to its edge set
time_to_edges = {t: set() for t in xrange(0, lastday-firstday+1)}
for u,v,t in edges:
if u != v: # ignore self loops
time_to_edges[t - firstday].add((u,v))
if not directed:
time_to_edges[t - firstday].add((v,u))
# Initialize the temporal network
Temporal_Network = {}
for time, edges in time_to_edges.items():
col = [u for u,v in edges]
row = [v for u,v in edges]
dat = [True for i in range(len(edges))]
Adj_Matrix = sp.csr_matrix((dat,(row,col)),
shape=(number_of_nodes, number_of_nodes), dtype=bool)
        # Assumption: an edge u -> v with p(t+1) = A p(t), i.e. A[v, u] = 1
if normalized:
Adj_Matrix = normalize(Adj_Matrix.transpose(), norm='l1', axis=1, copy=False).transpose()
Temporal_Network[time] = Adj_Matrix
else:
Temporal_Network[time] = Adj_Matrix
return Temporal_Network
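# The normalize(...).transpose() step in get_Temporal_Network is a
# column-stochastic normalization: each column of the adjacency matrix is
# scaled to sum to 1, so multiplying a probability vector by the matrix
# propagates it one time step. A tiny made-up three-node example:
import numpy as np
from scipy import sparse as sp
from sklearn.preprocessing import normalize

A = sp.csr_matrix(np.array([[0, 1, 1],
                            [1, 0, 0],
                            [1, 1, 0]], dtype=float))
A_col = normalize(A.transpose(), norm='l1', axis=1).transpose()
print(A_col.toarray().sum(axis=0))    # every column now sums to 1
p = np.array([1.0, 0.0, 0.0])         # all probability mass on node 0
print(A_col.dot(p))                   # mass split equally over nodes 1 and 2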
def main(test, base, align, project, r):
outdir = os.path.join(os.getcwd(), project)
tmp_dir = os.path.join(outdir, 'tmp.{}'.format(project))
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
print('temporary dir: {}'.format(tmp_dir))
basedWordVectors, testedWordVectors, aligned_test, subsetTest = \
align_vec(base, test, align, tmp_dir)
test_cols = len(testedWordVectors)
base_cols = len(basedWordVectors)
print('normalizing matrix')
baseX = preprocessing.normalize(dict_to_matrix(basedWordVectors))
testX = preprocessing.normalize(dict_to_matrix(testedWordVectors))
aligned_testX = preprocessing.normalize(dict_to_matrix(aligned_test))
subtestX = preprocessing.normalize(dict_to_matrix(subsetTest))
cca = CCA(n_components=200)
print('computing CCA')
cca.fit(subtestX, aligned_testX)
ccaed_test = trans(testX, cca.x_weights_)
ccaed_base = trans(baseX, cca.y_weights_)
output(outdir, test, ccaed_test, testedWordVectors)
output(outdir, base, ccaed_base, basedWordVectors)
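# The core alignment step in main() is a CCA fit between l2-normalized
# embedding matrices, followed by projecting each side with its learned
# weights. A hedged sketch with random stand-ins for the two vocabulary
# matrices; align_vec, dict_to_matrix, trans and output are project helpers not
# shown here, and the plain dot products below only approximate what trans()
# presumably does with cca.x_weights_ / cca.y_weights_.
import numpy as np
from sklearn import preprocessing
from sklearn.cross_decomposition import CCA

rng = np.random.RandomState(0)
test_m = preprocessing.normalize(rng.rand(200, 100))
base_m = preprocessing.normalize(rng.rand(200, 100))
cca = CCA(n_components=20)
cca.fit(test_m, base_m)
proj_test = test_m.dot(cca.x_weights_)    # (200, 20)
proj_base = base_m.dot(cca.y_weights_)    # (200, 20)
print(proj_test.shape, proj_base.shape)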