def kmeans_numpy(d, headers, K, whiten=True):
# assign to A the result of getting the data from your Data object
A = d.get_data(headers)
# assign to W the result of calling vq.whiten on A
W = vq.whiten(A)
# assign to codebook, bookerror the result of calling vq.kmeans with W and K
codebook, bookerror = vq.kmeans(W, K)
# assign to codes, error the result of calling vq.vq with W and the codebook
codes, error = vq.vq(W, codebook)
# return codebook, codes, and error
return codebook, codes, error
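# A minimal usage sketch for kmeans_numpy on synthetic data. The Data class
# is mocked here (only get_data(headers) is assumed), and vq is scipy's
# clustering module, e.g. `import scipy.cluster.vq as vq`:
import numpy as np
import scipy.cluster.vq as vq

class _StubData(object):
    def get_data(self, headers):
        np.random.seed(0)
        # two well-separated 2-D blobs of 20 points each
        return np.vstack([np.random.randn(20, 2), np.random.randn(20, 2) + 5])

codebook, codes, error = kmeans_numpy(_StubData(), ['x', 'y'], K=2)
print codes   # cluster index (0 or 1) for each of the 40 rows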
# Takes in a list of column headers and the Data object and returns a matrix with each column normalized
# so its minimum value is mapped to zero and its maximum value is mapped to 1
def normalize_columns_separately(data_obj, column_headers):
final_columns = []
# print column_headers
columns = data_obj.get_data(column_headers).transpose().tolist()
for column in columns:
temp_column = []
max_num = max(column)
min_num = min(column)
        for number in column:
            # use float division so integer columns do not collapse to zero
            number = (number - min_num) / float(max_num - min_num)
            temp_column.append(number)
final_columns.append(temp_column)
# print "Normalized matrix"
# print np.matrix(final_columns).transpose()
    # print "\n\n"
return np.matrix(final_columns).transpose()
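# An equivalent vectorized sketch of normalize_columns_separately, with numpy
# broadcasting replacing the explicit loops; the function name is
# hypothetical and the same get_data interface is assumed:
import numpy as np

def normalize_columns_separately_vectorized(data_obj, column_headers):
    A = np.asarray(data_obj.get_data(column_headers), dtype=float)
    mins = A.min(axis=0)
    maxs = A.max(axis=0)
    # map each column's min to 0 and its max to 1
    return np.matrix((A - mins) / (maxs - mins))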
def body(self, master):
tk.Checkbutton(master, text="Labels included", variable=self.labels_given).grid(row=0)
        self.menu = tk.OptionMenu(master, self.algorithm, "Naive Bayes", "K-Nearest Neighbors")
self.menu_label = tk.Label(master, text="Algorithm")
self.menu_label.grid(row=1, column=0)
self.menu.grid(row=1, column=1)
self.training_data_button = tk.Button(master, text="Choose Training Data", command=self.handle_training_button)
self.training_data_button.grid(row=2)
self.testing_data_button = tk.Button(master, text="Choose Testing Data", command=self.handle_testing_button)
self.testing_data_button.grid(row=3)
self.training_label_button = tk.Button(master, text="Choose Training labels",
command=self.handle_training_label_button)
self.training_label_button.config(state="disabled")
self.training_label_button.grid(row=4)
self.testing_label_button = tk.Button(master, text="Choose Testing labels",
command=self.handle_testing_label_button)
self.testing_label_button.config(state="disabled")
self.testing_label_button.grid(row=5)
def load_and_run(args, trainerClass):
start_time = time.time()
seed = int(args.get('--seed', 0))
trainer = load_trainer(args, trainerClass, Data, seed)
train_batch_name = args.get('--train-batch', None) or "train"
validation_batch_name = args.get('--validation-batch', None)
test_batch_name = args.get('--test-batch', None)
print_params = args.get('--print-params', False) or False
print_loss_breakdown = args.get('--print-loss-breakdown', False) or False
num_restarts = int(args.get('--num-restarts', 1))
for i in xrange(num_restarts):
(params, discretized_params) = trainer.train(train_batch_name,
validation_batch_name=validation_batch_name,
test_batch_name=test_batch_name,
print_params=print_params,
print_final_loss_breakdown=print_loss_breakdown)
if '--store-data' in args and args['--store-data'] is not None:
store_results_to_hdf5(args['--store-data'], trainer, train_batch_name, restart_idx=i)
    print ("Training stopped after %.2fs." % (time.time() - start_time))
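# A minimal invocation sketch for load_and_run. The option keys mirror the
# docopt-style names read above; the trainer class and values are
# placeholders, not part of the original code:
#   args = {'--seed': '42', '--train-batch': 'train',
#           '--validation-batch': None, '--num-restarts': '2',
#           '--store-data': None}
#   load_and_run(args, SomeTrainerClass)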
def checkIsFloat(self,x1,x2):
    # an operand counts as float if it is a Data of type 'double', the x87
    # stack register name 'st', or a Python float literal
    isFloat=False
    if(isinstance(x1,Data)):
        isFloat=(x1.type.type=='double')
    elif(isinstance(x1,str)):
        isFloat=(x1=='st')
    elif(isinstance(x1,float)):
        isFloat=True
    if isFloat:
        return isFloat
    # same test for the second operand
    if(isinstance(x2,Data)):
        isFloat=(x2.type.type=='double')
    elif(isinstance(x2,str)):
        isFloat=(x2=='st')
    elif(isinstance(x2,float)):
        isFloat=True
    return isFloat
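# The two operand checks above are identical. A minimal refactor sketch,
# assuming the same Data interface; _operand_is_float is hypothetical and not
# part of the original code:
def _operand_is_float(x):
    # Data of type 'double', the x87 register name 'st', or a float literal
    if isinstance(x, Data):
        return x.type.type == 'double'
    if isinstance(x, str):
        return x == 'st'
    return isinstance(x, float)

def checkIsFloat_refactored(self, x1, x2):
    return _operand_is_float(x1) or _operand_is_float(x2)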
def call(self,func,parameters=None):
    '''
    @func: the function to call, either a Data object or a name string
    @parameters: a dict like {parameter1 name: type, parameter2 ...}
    '''
# for vName in self.currentMap:
# if(self.currentMap[vName]['reg']=='eax'):
# self.currentMap[vName]['reg']=0
# self.registers['eax']=0
# self.gen.asm.append('\tmov '+self.currentMap[vName]['addr']+', eax\n')
self.callOffset=0
if(isinstance(func,Data)):
if(func.type.type=='function' and func.type.pointer_count()>0):
self.gen.asm.append('\tcall '+self.currentMap[func.name]["addr"]+'\n')
else:
self.gen.asm.append('\tcall '+func.name+'\n')
else:
self.gen.asm.append('\tcall '+func+'\n')
return 'eax'
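# Illustrative output of the three branches above (addresses are examples,
# not taken from the original code):
#   call [ebp-4]     ; func is a Data holding a function pointer in a local
#   call my_func     ; func is a Data naming the function directly
#   call my_func     ; func was passed as a plain string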
def gen_primary_expression(self,node,context):
"""
:type node:TreeNode
:type context:Context
:rtype: Data
"""
if isinstance(node[1],TreeNode):
if node[1][0]=="IDENTIFIER":
name=node[1][1]
offset=False
type=deepcopy(context.get_type_by_id(name))
return Data(name,offset,type)
else:
if node[1][0]=="INTEGER":
return int(node[1][1])
elif node[1][0]=="DOUBLE":
return float(node[1][1])
elif node[1][0]=="STRING":
return str(node[1][1])
def main(args):
with tf.device("cpu"):
data = Data(batch_size=args.batch_size, validation_size=6000)
session = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=args.num_threads))
graphs = SharedResource([build_graph(reuse=i > 0) for i in range(args.num_threads)])
session.run(tf.initialize_all_variables())
train_total_time_sum = 0
for epoch in range(args.num_epochs):
train_start_time = time.time()
train_accuracy = accuracy(session, graphs, data.iterate_train(), num_threads=args.num_threads, train=True)
train_total_time = time.time() - train_start_time
train_total_time_sum += train_total_time
validate_accuracy = accuracy(session, graphs, data.iterate_validate(), num_threads=args.num_threads, train=False)
print ("Training epoch number %d:" % (epoch,))
print (" Time to train = %.3f s" % (train_total_time))
print (" Training set accuracy = %.1f %%" % (100.0 * train_accuracy,))
print (" Validation set accuracy = %.1f %%" % (100.0 * validate_accuracy,))
print ("")
print ("Training done.")
test_accuracy = accuracy(session, graphs, data.iterate_test(), num_threads=args.num_threads, train=False)
        print ("  Average time per training epoch = %.3f s" % (train_total_time_sum / args.num_epochs,))
print (" Test set accuracy = %.1f %%" % (100.0 * test_accuracy,))
def data_range(data_obj, column_headers):
range_list = []
columns = data_obj.get_data(column_headers).transpose().tolist()
    for column in columns:
        # note: each entry is a [max, min] pair, in that order
        min_max_list = [max(column), min(column)]
        range_list.append(min_max_list)
return range_list
# Takes in a list of column headers and the Data object and returns a list of the mean values for each column
def mean(data_obj, column_headers):
mean_list = []
columns = data_obj.get_data(column_headers).transpose().tolist()
for column in columns:
mean_list.append(np.mean(column))
return mean_list
# Takes in a list of column headers and the Data object and returns a list of the standard deviation
# for each specified column
def stdev(data_obj, column_headers):
stdev_list = []
columns = data_obj.get_data(column_headers).transpose().tolist()
for column in columns:
stdev_list.append(np.std(column))
return stdev_list
# Takes in a list of column headers and the Data object and returns a list of the median
# for each specified column
def median(data_obj, column_headers):
median_list = []
    columns = data_obj.get_data(column_headers).transpose().tolist()
for column in columns:
median_list.append(np.median(column))
return median_list
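# A minimal usage sketch for the summary-statistics helpers above. The real
# Data class is not shown in this section, so a stub provides only the one
# method they rely on: get_data(headers) returning an N x M numpy matrix.
import numpy as np

class _FakeData(object):
    def get_data(self, headers):
        # three samples, two columns
        return np.matrix([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])

_d = _FakeData()
print data_range(_d, ['a', 'b'])   # [[3.0, 1.0], [30.0, 10.0]]
print mean(_d, ['a', 'b'])         # [2.0, 20.0]
print stdev(_d, ['a', 'b'])        # population std devs: ~[0.816, 8.165]
print median(_d, ['a', 'b'])       # [2.0, 20.0]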
def kmeans(d, headers, K, metric, whiten=True, categories=None):
'''Takes in a Data object, a set of headers, and the number of clusters to create
Computes and returns the codebook, codes and representation errors.
If given an Nx1 matrix of categories, it uses the category labels
to calculate the initial cluster means.
'''
    # assign to A the result of getting the data given the headers
try:
A = d.get_data(headers)
except AttributeError:
A = d
if whiten:
W = vq.whiten(A)
else:
W = A
    # prep the k-means clustering algorithm by getting initial cluster means
    codebook = kmeans_init(W, K, categories)
# assign to codebook, codes, errors, the result of calling kmeans_algorithm with W and codebook
codebook, codes, errors = kmeans_algorithm(W, codebook, metric)
# return the codebook, codes, and representation error
return codebook, codes, errors
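# kmeans_init is called above but not defined in this section. A plausible
# sketch of the usual scheme, assuming integer category labels in 0..K-1;
# this is an illustration, not the original implementation:
import numpy as np

def kmeans_init_sketch(W, K, categories=None):
    if categories is None:
        # choose K distinct data points at random as the initial means
        idx = np.random.choice(W.shape[0], K, replace=False)
        return W[idx, :]
    # otherwise use the mean of each labeled category as its initial mean
    labels = np.asarray(categories).flatten()
    means = [np.asarray(W)[labels == k].mean(axis=0) for k in range(K)]
    return np.matrix(means)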
# test function
def test_lin_reg(filename, ind, dep):
b, sse, r2, t, p = linear_regression(data.Data(filename), ind, dep)
savefilename = filename.split('.')[0] + "-analysis"
save_analysis(savefilename, filename, b, sse, r2, t, p, ind, dep)
# Lets users open the csv files which they want to analyze.
def handleOpen(self, event=None):
    fn = tkFileDialog.askopenfilename(parent=self.root, title="Choose a Data file", initialdir='.')
    # bail out if the user cancelled the file dialog
    if not fn:
        return
    if not fn.endswith(".csv") and not fn.endswith(".xls"):
        tkMessageBox.showwarning("Open File", "Cannot open this file\n(%s)" % fn)
        return
self.data = dt.Data(filename=fn)
self.handlePlotData()
self.filename = fn
def setTransformationParameters(self):
print 'handling Data Transformation stuff'
dialog = TPDialog(self.root, "Choose Transformation Parameters", self.scaling_speed, self.pan_speed,
self.rotation_speed)
if dialog.result is not None:
self.scaling_speed = max(1, min(dialog.result[0], 10))
self.pan_speed = max(1, min(dialog.result[1], 10))
self.rotation_speed = max(1, min(dialog.result[2], 10))
def read_training_data(training_file, training_labels_file=None):
if training_labels_file is None:
d = data.Data(training_file)
training_cats = d.get_data([d.get_headers()[-1]])
training_data = d.get_data(d.get_headers()[:-1])
else:
d = data.Data(training_file)
l = data.Data(training_labels_file)
training_cats = l.get_data(l.get_headers())
training_data = d.get_data(d.get_headers())
return training_data, training_cats, d
def read_testing_data(testing_file, testing_labels_file=None):
if testing_labels_file is None:
d = data.Data(testing_file)
testing_cats = d.get_data([d.get_headers()[-1]])
testing_data = d.get_data(d.get_headers()[:-1])
else:
d = data.Data(testing_file)
l = data.Data(testing_labels_file)
testing_cats = l.get_data(l.get_headers())
testing_data = d.get_data(d.get_headers())
return testing_data, testing_cats, d
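# read_training_data and read_testing_data above are identical apart from
# naming. A consolidation sketch, assuming the same data.Data interface; the
# name read_labeled_data is hypothetical:
def read_labeled_data(data_file, labels_file=None):
    d = data.Data(data_file)
    if labels_file is None:
        # with no separate labels file, the last column holds the categories
        cats = d.get_data([d.get_headers()[-1]])
        points = d.get_data(d.get_headers()[:-1])
    else:
        l = data.Data(labels_file)
        cats = l.get_data(l.get_headers())
        points = d.get_data(d.get_headers())
    return points, cats, d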
def test(result_filename, model_filename, data_filename, test_batches=None):
(hypers, params) = load_result(result_filename)
# This is the nasty bit:
# It generates a function containing the model with hardcoded params,
# Var() represented as a simple wrapper object that holds data,
# and a single argument that is used to get input/set output.
# We use this by compiling the function and evaling it in our context,
# and then calling into it once per input/output pair.
(fun_name, runnable_model) = instantiate_model(model_filename,
hypers,
params)
eval(compile(runnable_model, '<generated>', 'exec'))
# Get the data:
data = Data(data_filename)
if test_batches is None:
test_batches = data.get_batch_names()
elif isinstance(test_batches, str):
test_batches = test_batches.split(',')
correct_instances = 0
total_instances = 0
for batch_name in test_batches:
_, batch_data = data.get_batch(batch_name)
ex_idx = 0
for data_instance in batch_data:
print("Testing on batch %s (example %i)" % (batch_name, ex_idx))
ex_idx += 1
total_instances += 1
runtime_data = TerpreTRuntime(data_instance)
eval("%s(runtime_data)" % fun_name)
test_correct = runtime_data.check()
if test_correct:
correct_instances += 1
acc = correct_instances / float(total_instances) * 100.0
print("Test accuracy: %i/%i (%6.2f%%) correct." % (correct_instances,
total_instances,
acc))
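# An equivalent, more explicit form of the compile/eval trick above; a sketch
# only, assuming instantiate_model returns plain Python source:
#   namespace = {}
#   exec(compile(runnable_model, '<generated>', 'exec'), namespace)
#   fun = namespace[fun_name]
#   fun(runtime_data)   # replaces eval("%s(runtime_data)" % fun_name)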
def save_data(self, preprocess, max_len, qid, q1, q2, label):
if preprocess:
q1 = preprocess_sentence(q1, max_len)
q2 = preprocess_sentence(q2, max_len)
# This is a non-duplicate sentence -> dissimilar
if label == '0':
self._non_sim_data.append(Data(qid, q1, q2, label, [0, 1]))
# This is a duplicate sentence -> similar
else:
self._sim_data.append(Data(qid, q1, q2, label, [1, 0]))
def refresh_results_display(self):
network_data = Data().network_info
vert_sb = ttk.Scrollbar(self.mainframe, orient=tk.VERTICAL)
horz_sb = ttk.Scrollbar(self.mainframe, orient=tk.HORIZONTAL)
self.results_display = self.multi_column_listbox(ColumnSelect.column_names)
self.fill_multi_column_listbox(self.results_display, network_data)
self.results_display.grid(row=0, column=0, in_=self.mainframe,
sticky='NSEW')
self.results_display.configure(yscrollcommand=vert_sb.set,
xscrollcommand=horz_sb.set)
vert_sb.grid(row=0, column=1, sticky="NS")
vert_sb.config(command=self.results_display.yview)
horz_sb.grid(row=1, column=0, sticky="EW")
horz_sb.config(command=self.results_display.xview)
self.results_display['displaycolumns'] = ColumnSelect.columns_shown
def And(self,x1,x2):
if(isinstance(x1,Data)):
y1addr=self.getAbsoluteAdd(x1)
y1=x1.name
if(isinstance(x2,Data)):
y2addr=self.getAbsoluteAdd(x2)
y2=x2.name
    # x2 is not an immediate
if(x2=='eax'):
tmp=y1
y1=y2
y2=tmp
tmp=y1addr
y1addr=y2addr
y2addr=tmp
if(y1 in self.currentMap and y2 in self.currentMap):
self.gen.asm.append("\tmov eax, "+y1addr+'\n')
self.gen.asm.append("\tand eax, "+y2addr+'\n')
elif(isinstance(y2,str)):
if(y2 in self.currentMap):
self.gen.asm.append('\tand eax, '+y2addr+'\n')
else:
self.gen.asm.append('\tand eax, '+y2+'\n')
else:
if(y2 in self.currentMap):
self.gen.asm.append("\tmov eax, "+y1addr+'\n')
self.gen.asm.append("\tand eax, "+str(y2)+'\n')
return 'eax'
def Or(self,x1,x2):
if(isinstance(x1,Data)):
y1addr=self.getAbsoluteAdd(x1)
y1=x1.name
if(isinstance(x2,Data)):
y2addr=self.getAbsoluteAdd(x2)
y2=x2.name
    # x2 is not an immediate
if(x2=='eax'):
tmp=y1
y1=y2
y2=tmp
tmp=y1addr
y1addr=y2addr
y2addr=tmp
if(y1 in self.currentMap and y2 in self.currentMap):
self.gen.asm.append("\tmov eax, "+y1addr+'\n')
self.gen.asm.append("\tor eax, "+y2addr+'\n')
elif(isinstance(y2,str)):
if(y2 in self.currentMap):
self.gen.asm.append('\tor eax, '+y2addr+'\n')
else:
self.gen.asm.append('\tor eax, '+y2+'\n')
else:
if(y2 in self.currentMap):
self.gen.asm.append("\tmov eax, "+y1addr+'\n')
self.gen.asm.append("\tor eax, "+str(y2)+'\n')
return 'eax'
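# And and Or above differ only in the emitted mnemonic. A condensed sketch of
# the shared pattern, assuming the same getAbsoluteAdd interface; the helper
# name _emit_commutative_op is hypothetical and it skips the operand-swap
# special case for 'eax':
def _emit_commutative_op(self, mnemonic, x1, x2):
    # resolve each operand to a string the assembler can address
    def addr(x):
        return self.getAbsoluteAdd(x) if isinstance(x, Data) else str(x)
    self.gen.asm.append('\tmov eax, ' + addr(x1) + '\n')
    self.gen.asm.append('\t' + mnemonic + ' eax, ' + addr(x2) + '\n')
    return 'eax'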
def lea(self,x):
    # x is either a Data operand or a register name string
    if(isinstance(x,Data)):
xaddr=self.getAbsoluteAdd(x)
if(x in self.registers):
self.gen.asm.append('\tlea '+'eax, '+'['+x+']'+'\n')
else:
self.gen.asm.append('\tlea '+'eax, '+xaddr+'\n')
return 'eax'
def cmp(self,x1,x2):
if(isinstance(x1,Data) and isinstance(x2,Data)):
x1addr=self.getAbsoluteAdd(x1)
x2addr=self.getAbsoluteAdd(x2)
self.gen.asm.append("\tmov eax, "+x1addr+'\n')
self.gen.asm.append('\tcmp '+'eax'+', '+x2addr+'\n')
return
    dataflag1=False
    if(isinstance(x1,Data)):
        x1=self.getAbsoluteAdd(x1)
        dataflag1=True
    dataflag2=False
    if(isinstance(x2,Data)):
        x2=self.getAbsoluteAdd(x2)
        dataflag2=True
# if(isinstance(x1,Data)):
# x1addr=self.getAbsoluteAdd(x1)
# x1=x1.name
# if(isinstance(x2,Data)):
# x2addr=self.getAbsoluteAdd(x2)
# x2=x2.name
# if(x1 in self.currentMap and x2 in self.currentMap):
# self.gen.asm.append("\tmov eax, "+x1addr+'\n')
# self.gen.asm.append('\tcmp '+'eax'+', '+x2+'\n')
# return
if(dataflag1):
self.gen.asm.append('\tcmp DWORD PTR '+str(x1)+', '+str(x2)+'\n')
return
if(dataflag2):
self.gen.asm.append('\tcmp '+str(x1)+', DWORD PTR '+str(x2)+'\n')
return
self.gen.asm.append('\tcmp '+str(x1)+', '+str(x2)+'\n')
return
def sal(self,x,offset):
if(isinstance(x,Data)):
x=self.getAbsoluteAdd(x)
    self.gen.asm.append('\tsal '+x+', '+str(offset)+'\n')
return
def sar(self,x,offset):
if(isinstance(x,Data)):
x=self.getAbsoluteAdd(x)
    self.gen.asm.append('\tsar '+x+', '+str(offset)+'\n')
return
def gen_unary_expression(self,node,context):
"""
:type node:TreeNode
:type context:Context
:rtype: str
"""
operand=self.expression_handler[node[2][0]](node[2],context)
if isinstance(node[1],TreeNode):
operator=self.gen_unary_operator(node[1],context)
if operator=="&":
if isinstance(operand,Data):
ret=self.tools.lea(operand)
operand.type.is_const.append(False)
return ret
elif operator=="*":
if isinstance(operand,Data):
self.tools.mov(self.tools.getEax(),operand)
operand.name=self.tools.getNull()
operand.offset=True
operand.type.is_const.pop()
return operand
else:
if node[1]=="++":
ret=self.tools.add(operand,1)
self.tools.mov(operand,ret)
return operand
elif node[1]=="--":
self.tools.sub(operand,1)
return operand
def buildControls(self):
### Control ###
# make a control frame on the right
self.rightcntlframe = tk.Frame(self.root)
self.rightcntlframe.pack(side=tk.RIGHT, padx=2, pady=2, fill=tk.Y)
# make a separator frame
sep = tk.Frame(self.root, height=self.initDy, width=2, bd=1, relief=tk.SUNKEN)
sep.pack(side=tk.RIGHT, padx=2, pady=2, fill=tk.Y)
# use a label to set the size of the right panel
label = tk.Label(self.rightcntlframe, text="Control Panel", width=20)
label.pack(side=tk.TOP, pady=10)
# # make a menubutton
# self.colorOption = tk.StringVar(self.root)
# self.colorOption.set("black")
# colorMenu = tk.OptionMenu(rightcntlframe, self.colorOption,
# "black", "blue", "red", "green") # can add a command to the menu
# colorMenu.pack(side=tk.TOP)
#
# # make a button in the frame
# # and tell it to call the handleButton method when it is pressed.
# button = tk.Button(rightcntlframe, text="Update Color",
# command=self.handleButton1)
# button.pack(side=tk.TOP) # default side is top
#
# # make a button in the frame to generate random data points on the canvas
# button2 = tk.Button(rightcntlframe, text="Generate Random", command=self.generateRandomData)
# button2.pack(side=tk.TOP)
#
# # make a button in the frame to let the user choose what sort of probability distribution to use
# button3 = tk.Button(rightcntlframe, text="Choose Distributions", command=self.chooseDist)
# button3.pack(side=tk.TOP)
#
# # make a widget to allow users to specify the number of random points to be generated.
# label3 = tk.Label(rightcntlframe, text="Number of Data Points", width=20)
# label3.pack(side=tk.BOTTOM, pady=10)
#
# entry = tk.Entry(rightcntlframe, textvariable=self.num_pts)
# entry.pack(side=tk.BOTTOM)
return
# Lets users open the csv files which they want to analyze.
def main(argv):
if len(argv) < 4:
print 'Usage: python %s <classification method> <train data file> <test data file> <optional train categories> <optional test categories>' % (argv[0])
exit(-1)
elif len(argv) > 4:
print "Reading data..."
        training_data, training_labels, dObj_train = read_training_data(argv[2], argv[4])
testing_data, testing_labels, dObj_test = read_testing_data(argv[3],argv[5])
else:
        training_data, training_labels, dObj_train = read_training_data(argv[2])
testing_data, testing_labels, dObj_test = read_testing_data(argv[3])
print "Building the Classifier..."
classifier = build_classifier(training_data,training_labels,argv[1])
print "Classifying test and training data..."
ctraincats, ctrainlabels = classifier.classify(training_data)
ctestcats, ctestlabels = classifier.classify(testing_data)
    # recast labels to the contiguous range [0, C-1]
unique1, mapping1 = np.unique(training_labels.T.tolist()[0],return_inverse=True)
unique2, mapping2 = np.unique(testing_labels.T.tolist()[0], return_inverse=True)
mapping1 = np.matrix(mapping1).T
mapping2 = np.matrix(mapping2).T
    print "Constructing the Confusion matrices..."
cmtx_train = classifier.confusion_matrix(mapping1,ctraincats)
cmtx_test = classifier.confusion_matrix(mapping2,ctestcats)
print cmtx_train
print cmtx_test
print "\nTraining Data"
print classifier.confusion_matrix_str(cmtx_train)
print "\nTesting Data"
print classifier.confusion_matrix_str(cmtx_test)
print "Writing to file"
dObj_test.add_column("class","numeric",ctestcats.T.tolist()[0])
dObj_test.write_to_file(argv[3].split('.')[0] + "-" + argv[1] + "-classified",dObj_test.get_headers())
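# Example invocation (file names and the method string are placeholders; the
# accepted method names depend on build_classifier, which is not shown here):
#   python classify.py NaiveBayes train.csv test.csv train_cats.csv test_cats.csv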