def plot_MDS(*data):
    '''
    Embed the samples into two dimensions with MDS and show a scatter plot
    with one colour per distinct label.

    :param data: exactly two positional values, unpacked as ``X, y``
        (train_data, train_value)
    :return: None
    '''
    samples, targets = data
    embedded = manifold.MDS(n_components=2).fit_transform(samples)

    # One RGB tuple per class. zip() below stops at the shorter sequence, so
    # only the first ten distinct labels are drawn.
    palette = (
        (1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
        (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )

    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)
    for cls, rgb in zip(np.unique(targets), palette):
        mask = targets == cls
        axes.scatter(
            embedded[mask, 0],
            embedded[mask, 1],
            label="target= {0}".format(cls),
            color=rgb,
        )

    axes.set_xlabel("X[0]")
    axes.set_ylabel("X[1]")
    axes.legend(loc="best")
    axes.set_title("MDS")
    plt.show()
python类MDS的实例源码
def apply_lens(df, lens='pca', dist='euclidean', n_dim=2, **kwargs):
    """
    Compute the image of a data set under a lens (projection) function.

    input: N x F dataframe of observations
    output: N x n_dim image of input data under lens function

    ``lens`` selects the projection: 'pca', 'mds' or 'neighbor'.
    ``dist`` selects the metric used to build the pairwise distance matrix
    for the 'mds' and 'neighbor' lenses ('euclidean' or 'correlation').
    Any extra keyword arguments are forwarded to the estimator constructor.

    Raises ValueError when an unsupported combination of arguments is given.
    The original code raised plain strings, which is itself a TypeError in
    Python 3 and hid the intended message.
    """
    if n_dim != 2:
        raise ValueError('error: image of data set must be two-dimensional')
    if dist not in ['euclidean', 'correlation']:
        raise ValueError('error: only euclidean and correlation distance metrics are supported')
    if lens == 'pca' and dist != 'euclidean':
        raise ValueError('error: PCA requires the use of euclidean distance metric')

    if lens == 'pca':
        df_lens = pd.DataFrame(decomposition.PCA(n_components=n_dim, **kwargs).fit_transform(df), df.index)
    elif lens == 'mds':
        D = metrics.pairwise.pairwise_distances(df, metric=dist)
        # NOTE(review): D is a distance matrix, but the estimator is only told
        # so if the caller passes dissimilarity='precomputed' via kwargs --
        # confirm that this is the intended contract.
        df_lens = pd.DataFrame(manifold.MDS(n_components=n_dim, **kwargs).fit_transform(D), df.index)
    elif lens == 'neighbor':
        D = metrics.pairwise.pairwise_distances(df, metric=dist)
        # NOTE(review): same question as above for SpectralEmbedding's
        # affinity handling -- confirm against callers.
        df_lens = pd.DataFrame(manifold.SpectralEmbedding(n_components=n_dim, **kwargs).fit_transform(D), df.index)
    else:
        raise ValueError('error: only PCA, MDS, neighborhood lenses are supported')

    return df_lens
def infer_clusters(contactMat, clusters, offsets, alpha, classical=False):
    """Infers 3D coordinates for multiple clusters with same contact matrix.

    The contact matrix is converted to a distance matrix with
    ``at.contactToDist(contactMat, alpha)`` and embedded in three dimensions,
    either with ``st.cmds`` (``classical=True``) or scikit-learn metric MDS.
    Each cluster's points then receive the coordinate rows starting at that
    cluster's offset.

    Asserts that the clusters' point counts add up to ``len(contactMat)`` and
    that no row of the contact matrix sums to zero.
    """
    n_expected = sum(len(c.getPointNums()) for c in clusters)
    assert n_expected == len(contactMat)

    at.makeSymmetric(contactMat)
    row_totals = np.array([sum(r) for r in contactMat])
    zero_rows = np.where(row_totals == 0)[0]
    assert len(zero_rows) == 0

    distances = at.contactToDist(contactMat, alpha)
    at.makeSymmetric(distances)

    if not classical:
        embedder = manifold.MDS(
            n_components=3,
            metric=True,
            random_state=np.random.RandomState(),
            verbose=0,
            dissimilarity="precomputed",
            n_jobs=-1,
        )
        coords = embedder.fit_transform(distances)
    else:
        # classical MDS
        coords = st.cmds(distances)

    for offset, cluster in zip(offsets, clusters):
        n_points = len(cluster.getPoints())
        for i in range(n_points):
            # getPoints() is re-queried for every assignment, as before.
            cluster.getPoints()[i].pos = coords[i + offset]
def infer_cluster(contactMat, cluster, alpha, classical=False):
    """Infers 3D coordinates for one cluster.

    The contact matrix is converted to a distance matrix with
    ``at.contactToDist(contactMat, alpha)`` and embedded in three dimensions,
    either with ``st.cmds`` (``classical=True``) or scikit-learn metric MDS.
    Row ``i`` of the result is stored on ``cluster.getPoints()[i].pos``.

    Asserts that the cluster has ``len(contactMat)`` point numbers and that
    no row of the contact matrix sums to zero.
    """
    assert len(cluster.getPointNums()) == len(contactMat)

    at.makeSymmetric(contactMat)
    row_totals = np.array([sum(r) for r in contactMat])
    zero_rows = np.where(row_totals == 0)[0]
    assert len(zero_rows) == 0

    distances = at.contactToDist(contactMat, alpha)
    at.makeSymmetric(distances)

    if not classical:
        embedder = manifold.MDS(
            n_components=3,
            metric=True,
            random_state=np.random.RandomState(),
            verbose=0,
            dissimilarity="precomputed",
            n_jobs=-1,
        )
        coords = embedder.fit_transform(distances)
    else:
        # classical MDS
        coords = st.cmds(distances)

    n_points = len(cluster.getPoints())
    for i in range(n_points):
        # getPoints() is re-queried for every assignment, as before.
        cluster.getPoints()[i].pos = coords[i]
def fullMDS(path, classical, alpha):
    """MDS without partitioning.

    Builds a single cluster and its contact matrix from the BED file at
    ``path``, runs ``infer_cluster`` on them and returns the cluster.
    """
    whole = dt.clusterFromBed(path, None, None)
    contacts = dt.matFromBed(path, whole)
    infer_cluster(contacts, whole, alpha, classical=classical)
    return whole
def visualize_tweets(W, topic_number, color):
    '''
    INPUT
         - W matrix of observations
         - topic_number - this is the number of the topic to be checked
         - color - this is the color to be used in creating the scatterplot
    OUTPUT
         - a scatter plot (drawn on the current matplotlib figure) of the
           rows of W whose largest entry is in column ``topic_number``,
           projected to two dimensions with PCA
    Returns none

    The figure is not shown here; the caller decides when to display it.
    '''
    # Row-wise argmax assigns each observation to its strongest topic.
    topic_list = np.apply_along_axis(np.argmax, 1, W)
    Wsubset = W[topic_list == topic_number]

    # The original built this PCA object twice in a row; once is enough.
    pca = PCA(n_components=2)
    hflat = pca.fit_transform(Wsubset)

    plt.scatter(hflat[:, 0], hflat[:, 1], color=color, alpha=.1)
    plt.title('these are the {} tweets in topic # {}'.format(Wsubset.shape[0],
                                                             topic_number+1))
def embedding(vi_mat, LL, n_neighbors=10):
    """Embed a precomputed dissimilarity matrix in 2-D and plot it against LL.

    ``vi_mat`` is passed to MDS as a precomputed dissimilarity matrix. Two
    new 3-D figures are created: the MDS coordinates on the x/y axes with
    ``LL[:, 0]`` (first figure) and ``LL[:, 1]`` (second figure) as both
    height and colour.

    ``n_neighbors`` is accepted for interface compatibility but is not used.

    Returns the (n_samples, 2) MDS coordinates.
    """
    n_components = 2
    Y = manifold.MDS(n_components=n_components,
                     dissimilarity='precomputed').fit_transform(vi_mat)

    # The original also allocated an unused 1000-element `color` array and
    # carried commented-out 2-D plotting code; both were dead and are gone.
    for column in (0, 1):
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(Y[:, 0], Y[:, 1], LL[:, column], c=LL[:, column], marker='o')

    return Y
#################################################################
#load known partitions
def do_embedding(self, event=None):
    """Fit the embedding method selected in ``self.method`` to the converted data.

    The fitted estimator is kept in ``self.embedder`` and its ``embedding_``
    in ``self.embed`` / ``self.embed_plot``; the histogram and embedding plot
    are then refreshed. The classes frame is added the first time only.
    ``event`` is unused.
    """
    if self.parent.converted is None:
        #self.conversion.convert_frames()
        # FIXME For debugging: falls back to a previously saved array.
        self.parent.converted = np.load(self.parent.output_folder+'/converted.npy')
    converted = self.parent.converted

    # Index in the method selector -> lazily constructed estimator.
    factories = {
        0: lambda: manifold.SpectralEmbedding(n_components=4, n_jobs=-1),
        1: lambda: manifold.Isomap(n_components=4, n_jobs=-1),
        2: lambda: manifold.LocallyLinearEmbedding(
            n_components=4, n_jobs=-1, n_neighbors=20, method='modified'),
        3: lambda: manifold.LocallyLinearEmbedding(
            n_components=4, n_jobs=-1, n_neighbors=20, method='hessian',
            eigen_solver='dense'),
        4: lambda: manifold.MDS(n_components=4, n_jobs=-1),
        5: lambda: manifold.TSNE(n_components=3, init='pca'),
    }

    method_ind = self.method.currentIndex()
    print('Doing %s' % self.method.currentText())
    build = factories.get(method_ind)
    if build is not None:
        # Any other index leaves self.embedder as it was.
        self.embedder = build()

    self.embedder.fit(converted)
    self.embed = self.embedder.embedding_
    self.embed_plot = self.embed

    self.gen_hist()
    self.plot_embedding()
    if not self.embedded:
        self.add_classes_frame()
        self.embedded = True
def index(request):
    """Render the conceptualiser page for the current user.

    Redirects to ``URL_PREFIX + '/'`` when the session holds no 'model' key.
    Otherwise each of the user's lexicons gets a ``size`` attribute (number
    of Word rows in it) and is passed to the template together with the
    available dimensionality-reduction method names.
    """
    if 'model' not in request.session:
        return HttpResponseRedirect(URL_PREFIX + '/')

    template = loader.get_template('conceptualiser.html')

    lexicons = []
    for lexicon in Lexicon.objects.all().filter(author=request.user):
        lexicon.size = Word.objects.all().filter(lexicon=lexicon.id).count()
        lexicons.append(lexicon)

    context = {
        'STATIC_URL': STATIC_URL,
        'lexicons': lexicons,
        'methods': ["PCA", "TSNE", "MDS"],
    }
    return HttpResponse(template.render(context, request))
def encode(self, data, metric='euclidean'):
    """ Encode ``data`` with the codebook using a nearest-neighbor rule.

    The prototypes in ``self.protos`` are first ordered along a
    one-dimensional MDS embedding; every sample is then mapped to the index
    of (and represented by) its nearest ordered prototype.

    Parameters
    ----------
    data : real array-like, shape(n_samples, n_features)
        Data matrix, each row represents a sample.

    metric : string
        One of the following valid options as defined for function http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html.

        Valid options include:

         - euclidean
         - cityblock
         - l1
         - cosine

    Returns
    -------
    encoded_data : real array-like
        ``data``, as represented by the prototypes in codebook.
    ts_symbols : array-like, one nearest-prototype index per sample
        A discrete symbolic time series
    """
    # Perform a proposed data mining procedure as described in [Laskaris2004].
    embedder = MDS(1, random_state=self.rng)
    order = np.argsort(embedder.fit_transform(self.protos).ravel())
    ordered_protos = self.protos[order]

    matcher = NearestNeighbors(n_neighbors=1, algorithm='auto', metric=metric)
    matcher = matcher.fit(ordered_protos)
    _, self.__symbols = matcher.kneighbors(data)
    self.__encoding = ordered_protos[self.__symbols]

    return (self.__encoding, self.__symbols)
demo_mds.py 文件源码
项目:Building-Machine-Learning-Systems-With-Python-Second-Edition
作者: PacktPublishing
项目源码
文件源码
阅读 17
收藏 0
点赞 0
评论 0
def plot_demo_1():
    """Plot 3-D and 2-D MDS embeddings of a three-point toy data set.

    The result is saved as ``mds_demo_1.png`` inside ``CHART_DIR``. Colours
    and markers come from the module-level ``colors`` and ``markers``.
    """
    # Three samples with five constant features each: all 1s, 2s and 10s.
    X = np.c_[np.ones(5), 2 * np.ones(5), 10 * np.ones(5)].T
    y = np.array([0, 1, 2])

    fig = pylab.figure(figsize=(10, 4))

    ax = fig.add_subplot(121, projection='3d')
    # Axes.set_axis_bgcolor no longer exists in current Matplotlib; use
    # set_facecolor and fall back only on releases that predate it.
    if hasattr(ax, 'set_facecolor'):
        ax.set_facecolor('white')
    else:
        ax.set_axis_bgcolor('white')

    mds = manifold.MDS(n_components=3)
    Xtrans = mds.fit_transform(X)

    for cl, color, marker in zip(np.unique(y), colors, markers):
        members = Xtrans[y == cl]
        ax.scatter(
            members[:, 0], members[:, 1], members[:, 2],
            c=color, marker=marker, edgecolor='black')
    pylab.title("MDS on example data set in 3 dimensions")
    ax.view_init(10, -15)

    mds = manifold.MDS(n_components=2)
    Xtrans = mds.fit_transform(X)

    ax = fig.add_subplot(122)
    for cl, color, marker in zip(np.unique(y), colors, markers):
        members = Xtrans[y == cl]
        ax.scatter(
            members[:, 0], members[:, 1],
            c=color, marker=marker, edgecolor='black')
    pylab.title("MDS on example data set in 2 dimensions")

    filename = "mds_demo_1.png"
    pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
def main():
    """Command-line entry point: reconstruct 3D coordinates from a Hi-C BED file.

    Without ``-l`` the whole chromosome is embedded at once (``fullMDS``);
    with ``-l`` the partitioned variant (``partitionedMDS``) is used after
    its numeric parameters have been validated. The result is written to the
    path given by ``-o``, if any.
    """
    parser = argparse.ArgumentParser(description="Reconstruct 3D coordinates from normalized intrachromosomal Hi-C BED files.")
    parser.add_argument("path", help="path to intrachromosomal Hi-C BED file")
    parser.add_argument("--classical", action="store_true", help="use classical MDS (default: metric MDS)")
    parser.add_argument("-l", help="path to low-resolution intrachromosomal Hi-C BED file")
    parser.add_argument("-p", type=float, default=0.1, help="domain size parameter: larger value means fewer clusters created (for partitioned MDS only)")
    parser.add_argument("-m", type=float, default=0.05, help="minimum domain size parameter: prevents clusters from being too small (for partitioned MDS only)")
    parser.add_argument("-o", help="path to output file")
    # Without type=, values given on the command line arrive as strings while
    # the defaults are ints, so the numeric range check below would receive
    # mixed types.
    parser.add_argument("-r", type=int, default=32000000, help="maximum RAM to use (in kb)")
    parser.add_argument("-n", type=int, default=3, help="number of threads")
    parser.add_argument("-a", type=float, default=4, help="alpha factor for converting contact frequencies to physical distances")
    args = parser.parse_args()

    if args.l is None:  # not partitioned
        cluster = fullMDS(args.path, args.classical, args.a)
    else:  # partitioned
        params = (args.p, args.m, args.r, args.n, args.a)
        names = ("Domain size parameter", "Minimum domain size", "Maximum memory", "Number of threads", "Alpha")
        intervals = ((0, 1), (0, 1), (0, None), (0, None), (1, None))
        if not tools.args_are_valid(params, names, intervals):
            # NOTE(review): exits with status 0 even though validation
            # failed; kept for compatibility with existing callers.
            sys.exit(0)
        cluster = partitionedMDS(args.path, args.l, params)

    if args.o is not None:
        cluster.write(args.o)
def mds(dataset, labels, attNames, **kwargs):
    """Embed ``dataset`` in two dimensions with MDS and hand the result to ``plot``.

    ``labels``, ``attNames`` and any extra keyword arguments are forwarded
    to ``plot`` unchanged.
    """
    embedder = manifold.MDS(n_components=2, max_iter=300)
    embedded = embedder.fit_transform(dataset)
    plot(embedded, labels, attNames, **kwargs)
# Assignment 2
def mds_variance_explained(corrmat, mds_coords):
"""Determine how much variance is explained by projection onto MDS coords."""
orig_dist = (1 - corrmat)[np.triu_indices_from(corrmat, 1)]
mds_dist = distance.pdist(mds_coords)
r, _ = stats.pearsonr(orig_dist, mds_dist)
return r ** 2
def get_twodim_reps(reps, seed, distance=euclidean_distances):
    """Return a 2-D MDS embedding of ``reps``.

    ``reps`` is cast to float64, turned into a pairwise matrix with
    ``distance`` and embedded by MDS (treating that matrix as precomputed
    dissimilarities) seeded with ``seed``.
    """
    as_float = reps.astype(np.float64)
    pairwise = distance(as_float)
    embedder = MDS(n_components=2, dissimilarity="precomputed", random_state=seed)
    fitted = embedder.fit(pairwise)
    return fitted.embedding_
def visualize_topics(H):
    '''
    INPUT
        - H matrix of topics
    OUTPUT
        - a scatter plot of the relative location of the different topics
        from each other in a flattened space using multidimensional scaling
        - color_list - the list of colors to be used in the next
        visualizations of the tweets
    Returns a tuple (color_list, mds): the colour assigned to each topic
    and the fitted MDS object
    '''
    mds = MDS(n_jobs=-1)
    flattened = mds.fit_transform(H)

    # Eight colours, reused in order when there are more topics than colours.
    palette = cycle(["r", "b", "g", "c", "m", "y", "k", "w"])
    color_list = []
    for number, (point, shade) in enumerate(zip(flattened, palette), start=1):
        plt.scatter(point[0], point[1],
                    label='topic number {}'.format(number), color=shade)
        color_list.append(shade)

    plt.legend(loc='best')
    plt.show()
    return color_list, mds
def test_MDS(*data):
    '''
    Fit MDS with 4, 3, 2 and 1 components and print the stress of each fit.

    :param data: exactly two positional values, unpacked as ``X, y``
        (train_data, train_value); only ``X`` is used
    :return: None
    '''
    samples, _targets = data
    for n_dims in (4, 3, 2, 1):
        model = manifold.MDS(n_components=n_dims)
        model.fit(samples)
        print('stress(n_components={0}) : {1}'.format(n_dims, str(model.stress_)))
def smacof_mds(C, dim, max_iter=3000, eps=1e-9):
    """
    Returns an interpolated point cloud following the dissimilarity matrix C
    using SMACOF multidimensional scaling (MDS) in a target space of the
    specified dimension

    Parameters
    ----------
    C : ndarray, shape (ns, ns)
        dissimilarity matrix
    dim : int
          dimension of the targeted space
    max_iter :  int
        Maximum number of iterations of the SMACOF algorithm for a single run
    eps : float
        relative tolerance w.r.t stress to declare converge

    Returns
    -------
    npos : ndarray, shape (R, dim)
           Embedded coordinates of the interpolated point cloud (defined with
           one isometry)
    """
    rng = np.random.RandomState(seed=3)

    # First pass: a single SMACOF run whose result seeds the second pass.
    # `eps` is now honoured; it used to be hard-coded to its default, 1e-9.
    mds = manifold.MDS(
        dim,
        max_iter=max_iter,
        eps=eps,
        dissimilarity='precomputed',
        n_init=1)
    pos = mds.fit(C).embedding_

    # Second pass starts from `pos`. The component count is `dim` so that it
    # agrees with the initial configuration instead of a hard-coded 2.
    nmds = manifold.MDS(
        dim,
        max_iter=max_iter,
        eps=eps,
        dissimilarity="precomputed",
        random_state=rng,
        n_init=1)
    npos = nmds.fit_transform(C, init=pos)

    return npos
##############################################################################
# Data preparation
# ----------------
#
# The four distributions are constructed from 4 simple images
def smacof_mds(C, dim, max_iter=3000, eps=1e-9):
    """
    Returns an interpolated point cloud following the dissimilarity matrix C
    using SMACOF multidimensional scaling (MDS) in a target space of the
    specified dimension

    Parameters
    ----------
    C : ndarray, shape (ns, ns)
        dissimilarity matrix
    dim : int
          dimension of the targeted space
    max_iter :  int
        Maximum number of iterations of the SMACOF algorithm for a single run
    eps : float
        relative tolerance w.r.t stress to declare converge

    Returns
    -------
    npos : ndarray, shape (R, dim)
           Embedded coordinates of the interpolated point cloud (defined with
           one isometry)
    """
    rng = np.random.RandomState(seed=3)

    # First pass: a single SMACOF run whose result seeds the second pass.
    # `eps` is now honoured; it used to be hard-coded to its default, 1e-9.
    mds = manifold.MDS(
        dim,
        max_iter=max_iter,
        eps=eps,
        dissimilarity='precomputed',
        n_init=1)
    pos = mds.fit(C).embedding_

    # Second pass starts from `pos`. The component count is `dim` so that it
    # agrees with the initial configuration instead of a hard-coded 2.
    nmds = manifold.MDS(
        dim,
        max_iter=max_iter,
        eps=eps,
        dissimilarity="precomputed",
        random_state=rng,
        n_init=1)
    npos = nmds.fit_transform(C, init=pos)

    return npos
##############################################################################
# Data preparation
# ----------------
#
# The four distributions are constructed from 4 simple images
def load_terms(request):
    """Return the 2-D projected terms of the requested lexicons as JSON.

    Reads the lexicon ids ('lids', JSON-encoded) and the projection method
    ('method') from ``request.POST`` and the model key from the session.
    Words of those lexicons that exist in the model vocabulary are projected
    to two dimensions with TSNE, MDS (on cosine distances) or PCA (any other
    method value). Terms linked to one of the user's concepts are grouped
    under that concept in the response; the rest are listed under 'terms'.

    Redirects to ``URL_PREFIX + '/'`` when the model lookup raises
    LookupError.
    """
    lexicon_ids = json.loads(request.POST['lids'])
    try:
        model = model_manager.get_model(request.session['model']).model
    except LookupError as e:
        return HttpResponseRedirect(URL_PREFIX + '/')
    # Normalised vectors are built on demand.
    if model.wv.syn0norm is None:
        model.init_sims()
    # Keep only words the model knows; the vocabulary is looked up with
    # UTF-8 encoded keys.
    words = [word for word in Word.objects.filter(lexicon__id__in = lexicon_ids) if word.wrd.encode('utf-8') in model.wv.vocab]
    feature_vectors = [model.wv.syn0norm[model.wv.vocab[word.wrd.encode('utf-8')].index] for word in words]
    output = {'terms':[],'concepts':[]}
    if len(feature_vectors):
        X = np.array(feature_vectors)
        if request.POST['method'] == 'TSNE':
            transformer = TSNE(n_components=2, random_state=0,metric='cosine',learning_rate=50)
        elif request.POST['method'] == 'MDS':
            transformer = MDS(n_components=2, max_iter=600,dissimilarity="precomputed", n_jobs=1)
            # MDS is configured for precomputed input, so hand it the cosine
            # distance matrix instead of the raw vectors.
            X = pairwise_distances(X,metric='cosine',n_jobs=1)
        else:
            transformer = PCA(n_components=2)
        transformed_feature_vectors = transformer.fit_transform(X).tolist()
        terms = []
        concepts = {}
        for i in range(len(words)):
            # A single word is placed at the origin instead of using its
            # projected coordinates.
            term = {'id':words[i].id,'term':words[i].wrd,'count':model.wv.vocab[words[i].wrd.encode('utf-8')].count,'x':transformed_feature_vectors[i][0] if len(feature_vectors) > 1 else 0,'y':transformed_feature_vectors[i][1] if len(feature_vectors) > 1 else 0}
            term_concepts = TermConcept.objects.filter(term__term = words[i].wrd).filter(concept__author = request.user)
            if term_concepts:
                # Only the first matching concept is used.
                concept_id = term_concepts[0].concept.id
                descriptive_term = term_concepts[0].concept.descriptive_term.term
                # NOTE(review): descriptive_term_id is assigned but never
                # read; the dict below looks the id up through Word instead.
                descriptive_term_id = term_concepts[0].concept.descriptive_term.id
                if concept_id not in concepts:
                    concepts[concept_id] = {'id':concept_id,'terms':[],'descriptive_term':descriptive_term,'descriptive_term_id':Word.objects.filter(wrd=descriptive_term)[0].id}
                concepts[concept_id]['terms'].append(term)
            else:
                terms.append(term)
        output['terms'].extend(terms)
        output['concepts'].extend([concepts[concept_id] for concept_id in concepts])
        logging.getLogger(INFO_LOGGER).info(json.dumps({'process':'CREATE CONCEPTS','event':'terms_loaded','args':{'user_name':request.user.username,'lexicon_ids':lexicon_ids,'dim_red_method':request.POST['method']}}))
    else:
        logging.getLogger(INFO_LOGGER).warning(json.dumps({'process':'CREATE CONCEPTS','event':'term_loading_failed','args':{'user_name':request.user.username,'lexicon_ids':lexicon_ids,'dim_red_method':request.POST['method']},'reason':'No terms to load.'}))
    return HttpResponse(json.dumps(output), content_type='application/json')
demo_mds.py 文件源码
项目:Building-Machine-Learning-Systems-With-Python-Second-Edition
作者: PacktPublishing
项目源码
文件源码
阅读 16
收藏 0
点赞 0
评论 0
def plot_iris_mds():
    """Plot 3-D and 2-D embeddings of the Iris data set with MDS and PCA.

    Two figures are written to ``CHART_DIR``: ``mds_demo_iris.png`` and
    ``pca_demo_iris.png``. Colours and markers come from the module-level
    ``colors`` and ``markers``.
    """
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    def white_background(axes):
        # Axes.set_axis_bgcolor no longer exists in current Matplotlib; use
        # set_facecolor and fall back only on releases that predate it.
        if hasattr(axes, 'set_facecolor'):
            axes.set_facecolor('white')
        else:
            axes.set_axis_bgcolor('white')

    def scatter_by_class(axes, points):
        # One scatter call per class; `points` has 2 or 3 columns and each
        # column is passed as one positional coordinate array.
        for cl, color, marker in zip(np.unique(y), colors, markers):
            members = points[y == cl]
            coords = [members[:, k] for k in range(points.shape[1])]
            axes.scatter(*coords, c=color, marker=marker, edgecolor='black')

    # MDS
    fig = pylab.figure(figsize=(10, 4))

    ax = fig.add_subplot(121, projection='3d')
    white_background(ax)
    mds = manifold.MDS(n_components=3)
    Xtrans = mds.fit_transform(X)
    scatter_by_class(ax, Xtrans)
    pylab.title("MDS on Iris data set in 3 dimensions")
    ax.view_init(10, -15)

    mds = manifold.MDS(n_components=2)
    Xtrans = mds.fit_transform(X)
    ax = fig.add_subplot(122)
    scatter_by_class(ax, Xtrans)
    pylab.title("MDS on Iris data set in 2 dimensions")

    filename = "mds_demo_iris.png"
    pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")

    # PCA
    fig = pylab.figure(figsize=(10, 4))

    ax = fig.add_subplot(121, projection='3d')
    white_background(ax)
    pca = decomposition.PCA(n_components=3)
    Xtrans = pca.fit(X).transform(X)
    scatter_by_class(ax, Xtrans)
    pylab.title("PCA on Iris data set in 3 dimensions")
    ax.view_init(50, -35)

    pca = decomposition.PCA(n_components=2)
    Xtrans = pca.fit_transform(X)
    ax = fig.add_subplot(122)
    scatter_by_class(ax, Xtrans)
    pylab.title("PCA on Iris data set in 2 dimensions")

    filename = "pca_demo_iris.png"
    pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
def validate(classifier, train, test, args, report_base_name):
    """Train ``classifier`` and report its performance on train and test data.

    After fitting on ``train``, the following is done for every method in
    ``args.loglikelihood_methods``:

    * optionally (``args.calculate_distances``) a distance report,
    * a loglikelihood report on the training set,
    * for every decision maker from ``get_decision_makers(args)``: fit it if
      it has a callable ``fit``, report training-set classification, then
      report on the test set. Test loglikelihoods are computed and reported
      once per method and reused across decision makers.

    Reports are passed to ``_handle_report`` under names derived from
    ``report_base_name``. Returns None.
    """
    print('\nTraining classifier on %d samples ...' % len(train.X))
    start = timeit.default_timer()
    classifier.fit(train.X, train.y)
    stop = timeit.default_timer()
    print('Classifier trained, took %f seconds' % (stop - start))

    for method in args.loglikelihood_methods:
        report_name = report_base_name + '_' + method

        if args.calculate_distances:
            print('\nCalculating distances ...')
            dist_start = timeit.default_timer()
            distances = classifier.distances(loglikelihood_method=method, n_samples=500)
            print('Distances calculated, took %f seconds' % (timeit.default_timer() - dist_start))
            report = _generate_distance_reports(distances, target_names=train.target_names)
            _handle_report(report, report_name + '_distances', args)

            # Calculate proto symbol space
            #mds = MDS(n_components=5, dissimilarity='precomputed')
            #coordinates = mds.fit_transform(distances)
            #_plot_proto_symbol_space(coordinates, train.target_names, report_name + '_scatter', args)

        # Get loglikelihoods for train set
        print('\nValidating classifier on training set with %d samples ...' % len(train.X))
        loglikelihoods_train = _calculate_loglikelihoods(classifier, train.X, method)
        report = _generate_loglikelihood_reports(loglikelihoods_train, train.y, target_names=train.target_names)
        _handle_report(report, report_name + '_train_loglikelihoods', args)

        # Fit decision makers
        loglikelihoods_test = None
        for idx, decision_maker in enumerate(get_decision_makers(args)):
            if decision_maker is not None:
                name = args.decision_makers[idx]
                if callable(getattr(decision_maker, 'fit', None)):
                    print('\nTraining decision maker %s on %d loglikelihoods ...' % (name, len(loglikelihoods_train)))
                    # Time this fit on its own. The message used to reuse
                    # the start/stop values of an earlier measurement and so
                    # reported an unrelated duration.
                    fit_start = timeit.default_timer()
                    decision_maker.fit(loglikelihoods_train, train.y)
                    fit_elapsed = timeit.default_timer() - fit_start
                    print('Decision maker trained, took %f seconds' % fit_elapsed)
                else:
                    print('\nUsing decision maker %s ...' % name)
                y_pred = _calculate_predictions(decision_maker, loglikelihoods_train)
                report = _generate_classification_reports(train.y, y_pred, target_names=train.target_names)
                _handle_report(report, report_name + '_train_classification_' + name, args)

            # Validate on test set
            print('\nValidating classifier on test set with %d samples ...' % len(test.X))
            if loglikelihoods_test is None:
                # Computed and reported once per method, then reused.
                loglikelihoods_test = _calculate_loglikelihoods(classifier, test.X, method)
                report = _generate_loglikelihood_reports(loglikelihoods_test, test.y, target_names=test.target_names)
                _handle_report(report, report_name + '_test_loglikelihoods', args)
            if decision_maker is not None:
                y_pred = _calculate_predictions(decision_maker, loglikelihoods_test)
                report = _generate_classification_reports(test.y, y_pred, target_names=test.target_names)
                _handle_report(report, report_name + '_test_classification_' + name, args)
document_clustering.py 文件源码
项目:text-analytics-with-python
作者: dipanjanS
项目源码
文件源码
阅读 30
收藏 0
点赞 0
评论 0
def plot_clusters(num_clusters, feature_matrix,
                  cluster_data, movie_data,
                  plot_size=(16,8)):
    """Show a 2-D MDS scatter plot of the movies, coloured by cluster.

    ``feature_matrix`` is turned into a cosine-distance matrix and embedded
    in two dimensions. ``cluster_data`` maps each cluster number to details
    whose first five 'key_features' form the legend label; ``movie_data``
    supplies the 'Cluster' and 'Title' columns. ``num_clusters`` is accepted
    for interface compatibility but is not used. Returns None.
    """
    # generate random color for clusters
    def generate_random_color():
        color = '#%06x' % random.randint(0, 0xFFFFFF)
        return color

    # define markers for clusters
    markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']
    # build cosine distance matrix
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    # dimensionality reduction using MDS
    mds = MDS(n_components=2, dissimilarity="precomputed",
              random_state=1)
    # get coordinates of clusters in new low-dimensional space
    plot_positions = mds.fit_transform(cosine_distance)
    x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]

    # build cluster plotting data
    cluster_color_map = {}
    cluster_name_map = {}
    for cluster_num, cluster_details in cluster_data.items():
        # assign cluster features to unique label
        cluster_color_map[cluster_num] = generate_random_color()
        cluster_name_map[cluster_num] = ', '.join(cluster_details['key_features'][:5]).strip()

    # map each unique cluster label with its coordinates and movies
    cluster_plot_frame = pd.DataFrame({'x': x_pos,
                                       'y': y_pos,
                                       'label': movie_data['Cluster'].values.tolist(),
                                       'title': movie_data['Title'].values.tolist()
                                       })
    grouped_plot_frame = cluster_plot_frame.groupby('label')

    # set plot figure size and axes
    fig, ax = plt.subplots(figsize=plot_size)
    ax.margins(0.05)

    # plot each cluster using co-ordinates and movie titles
    for cluster_num, cluster_frame in grouped_plot_frame:
        # Clusters beyond the marker list get a randomly chosen marker.
        marker = markers[cluster_num] if cluster_num < len(markers) \
            else np.random.choice(markers, size=1)[0]
        ax.plot(cluster_frame['x'], cluster_frame['y'],
                marker=marker, linestyle='', ms=12,
                label=cluster_name_map[cluster_num],
                color=cluster_color_map[cluster_num], mec='none')

    # Hide ticks and tick labels. Booleans are used because the 'off'
    # strings are not reliably treated as False across Matplotlib releases.
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom=False, top=False,
                   labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, top=False,
                   labelleft=False)

    fontP = FontProperties()
    fontP.set_size('small')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.01), fancybox=True,
              shadow=True, ncol=5, numpoints=1, prop=fontP)

    # add labels as the film titles; plain column iteration replaces the
    # DataFrame.ix indexer, which no longer exists in pandas
    for x, y, title in zip(cluster_plot_frame['x'],
                           cluster_plot_frame['y'],
                           cluster_plot_frame['title']):
        ax.text(x, y, title, size=8)

    # show the plot
    plt.show()
def visualize_encodings(encodings, file_name=None,
                        grid=None, skip_every=999, fast=False, fig=None, interactive=False):
    """Plot several 2-D and 3-D manifold projections of ``encodings`` on one figure.

    The encodings first go through ``manual_pca``. If at most three columns
    remain, they are plotted directly via ``print_data_only`` and that
    result is returned. Otherwise the first 720 rows are projected with LLE,
    MDS and t-SNE variants, one subplot each; a projection that fails is
    reported on stdout and skipped. The figure is saved via ``save_fig``
    unless ``interactive`` is set.

    ``grid`` defaults to (3, 4). ``skip_every`` controls which subplot slots
    are used. ``fast`` is accepted for interface compatibility but is not
    used.
    """
    encodings = manual_pca(encodings)
    if encodings.shape[1] <= 3:
        return print_data_only(encodings, file_name, fig=fig, interactive=interactive)

    encodings = encodings[0:720]
    # Precomputed pairwise distance matrices for the MDS projections.
    hessian_euc = dist.squareform(dist.pdist(encodings, 'euclidean'))
    hessian_cos = dist.squareform(dist.pdist(encodings, 'cosine'))
    grid = (3, 4) if grid is None else grid
    project_ops = []

    n = 2
    project_ops.append(("LLE ltsa       N:%d" % n, mn.LocallyLinearEmbedding(n_neighbors=10, n_components=n, method='ltsa')))
    project_ops.append(("LLE modified   N:%d" % n, mn.LocallyLinearEmbedding(n_neighbors=10, n_components=n, method='modified')))
    project_ops.append(('MDS euclidean  N:%d' % n, mn.MDS(n, max_iter=300, n_init=1, dissimilarity='precomputed')))
    project_ops.append(("TSNE 30/2000   N:%d" % n, TSNE(perplexity=30, n_components=n, init='pca', n_iter=2000)))
    n = 3
    project_ops.append(("LLE ltsa       N:%d" % n, mn.LocallyLinearEmbedding(n_neighbors=10, n_components=n, method='ltsa')))
    project_ops.append(("LLE modified   N:%d" % n, mn.LocallyLinearEmbedding(n_neighbors=10, n_components=n, method='modified')))
    project_ops.append(('MDS euclidean  N:%d' % n, mn.MDS(n, max_iter=300, n_init=1, dissimilarity='precomputed')))
    project_ops.append(('MDS cosine     N:%d' % n, mn.MDS(n, max_iter=300, n_init=1, dissimilarity='precomputed')))

    # Subplot slot numbers: every `skip_every`-th slot is left free.
    plot_places = []
    for i in range(12):
        u, v = int(i / (skip_every - 1)), i % (skip_every - 1)
        j = v + u * skip_every + 1
        plot_places.append(j)

    fig = get_figure(fig)
    fig.set_size_inches(fig.get_size_inches()[0] * grid[0] / 1.,
                        fig.get_size_inches()[1] * grid[1] / 2.0)

    for i, (name, projector) in enumerate(project_ops):
        is3d = 'N:3' in name
        try:
            if is3d:
                subplot = plt.subplot(grid[0], grid[1], plot_places[i], projection='3d')
            else:
                subplot = plt.subplot(grid[0], grid[1], plot_places[i])
            data_source = encodings if not _needs_hessian(projector) else \
                (hessian_cos if 'cosine' in name else hessian_euc)
            projections = projector.fit_transform(data_source)
            scatter(subplot, projections, is3d, _build_radial_colors(len(data_source)))
            subplot.set_title(name)
        except Exception as exc:
            # Best effort: one failing projection must not abort the rest.
            # Exception (not a bare except) lets Ctrl-C and SystemExit pass.
            print(name, "Unexpected error: ", type(exc), exc)

    visualize_data_same(encodings, grid=grid, places=plot_places[-4:])
    if not interactive:
        save_fig(file_name, fig)
    ut.print_time('visualization finished')
def get_arrangement_permutation(
        dist,
        mode,
        model=None,
        clusters=None,
        init_perm=None):
    """Return an ordering of the items described by the matrix ``dist``.

    ``mode`` selects the arranging algorithm:

    * "none" - identity order, returned immediately (nothing is logged);
    * "hamilton" / "hamilton_annealing" - path found by ``HamiltonPath``;
    * "tsne" / "mds" - argsort of a one-dimensional embedding of ``dist``
      treated as precomputed distances;
    * "dendro" - order produced by ``DendroArranger``.

    Any other mode raises ValueError. When ``model`` is truthy, NDS, MNR
    and the elapsed time are logged on it and ``model.NDS`` is set.
    ``clusters`` and ``init_perm`` are accepted but not used.
    """
    start_time = time.time()

    if mode == "none":
        return list(range(dist.shape[0]))

    if mode in ("hamilton", "hamilton_annealing"):
        from .hamilton_path import HamiltonPath
        solver = HamiltonPath(dist, caller=model)
        if mode == "hamilton":
            solver.solve()
        else:
            solver.solve_annealing()
        perm = solver.path
    elif mode == "tsne":
        from sklearn.manifold import TSNE
        embedder = TSNE(n_components=1, random_state=0, metric="precomputed")
        line = embedder.fit_transform(dist).reshape(-1)
        perm = np.argsort(line)
    elif mode == "mds":
        from sklearn.manifold import MDS
        embedder = MDS(
            n_components=1,
            max_iter=3000,
            eps=1e-9,
            random_state=0,
            dissimilarity="precomputed",
            n_jobs=4)
        line = embedder.fit_transform(dist).reshape(-1)
        perm = np.argsort(line)
    elif mode == "dendro":
        from algo.arranging.dendro_arranger import DendroArranger
        perm = DendroArranger(dist).arrange()
    else:
        raise ValueError("Unknown mode: %s" % mode)

    if model:
        from .quality import NDS, MNR
        model.NDS = NDS(dist, perm)
        model.log("NDS=%f" % model.NDS)
        model.log("MNR=%f" % MNR(dist, perm))
        model.log("Time=%f" % (time.time() - start_time))

    return perm