def get_twitter_sentiment_multilabel_classification_dataset():

    file_name = os.path.join('tests', 'twitter_sentiment.csv')

    try:
        # Let pandas open the file itself (the 'rU' open mode is deprecated in Python 3)
        df_twitter = pd.read_csv(file_name, encoding='latin-1', engine='python')
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        df_twitter = pd.read_csv(dataset_url, encoding='latin-1')
        # Do not write the index that pandas automatically creates
        df_twitter.to_csv(file_name, index=False, encoding='latin-1')

    # Grab only 10% of the dataset - runs much faster this way
    df_twitter = df_twitter.sample(frac=0.1)

    df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)

    df_twitter_train, df_twitter_test = train_test_split(df_twitter, test_size=0.33, random_state=42)
    return df_twitter_train, df_twitter_test
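The dataset loaders on this page assume module-level imports of os, pandas and scikit-learn's train_test_split. A minimal, self-contained sketch of the same DataFrame-in, DataFrames-out splitting pattern, using a toy frame whose column names are made up for illustration:

import pandas as pd
from sklearn.model_selection import train_test_split

# Toy frame, just to show the split call used in the loaders above
df = pd.DataFrame({'text': ['a', 'b', 'c', 'd', 'e', 'f'],
                   'label': [0, 1, 0, 1, 0, 1]})
df_train, df_test = train_test_split(df, test_size=0.33, random_state=42)
print(df_train.shape, df_test.shape)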
def get_titanic_binary_classification_dataset(basic=True):
    dir_name = os.path.abspath(os.path.dirname(__file__))
    file_name = os.path.join(dir_name, 'titanic.csv')
    print('file_name')
    print(file_name)
    print('dir_name')
    print(dir_name)

    try:
        df_titanic = pd.read_csv(file_name)
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
        df_titanic = pd.read_csv(dataset_url)
        # Do not write the index that pandas automatically creates
        df_titanic.to_csv(file_name, index=False)

    df_titanic = df_titanic.drop(['boat', 'body'], axis=1)

    if basic:
        df_titanic = df_titanic.drop(['name', 'ticket', 'cabin', 'home.dest'], axis=1)

    df_titanic_train, df_titanic_test = train_test_split(df_titanic, test_size=0.33, random_state=42)
    return df_titanic_train, df_titanic_test
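A quick usage sketch, assuming this function lives in a module that already imports os, pandas and train_test_split; the 'survived' column name is taken from the titanic3.csv schema referenced above:

df_train, df_test = get_titanic_binary_classification_dataset(basic=True)
print(df_train.shape, df_test.shape)
print(df_train['survived'].value_counts())  # binary label column from titanic3.csv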
def get_twitter_sentiment_multilabel_classification_dataset():

    file_name = os.path.join('tests', 'twitter_sentiment.h5')

    try:
        df_twitter = pd.read_hdf(file_name)
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        df_twitter = pd.read_csv(dataset_url, encoding='latin-1')
        # Do not write the index that pandas automatically creates
        df_twitter.to_hdf(file_name, key='df', format='fixed')

    # Grab only 10% of the dataset - runs much faster this way
    df_twitter = df_twitter.sample(frac=0.1)

    df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)

    df_twitter_train, df_twitter_test = train_test_split(df_twitter, test_size=0.33, random_state=42)
    return df_twitter_train, df_twitter_test
Adaboost.py (project: Machine-Learning-Tools-on-Iris-Dataset, author: debjitpaul)
def get_data(iris):
    # Only petal length and petal width are considered
    X = iris.data[:, [2, 3]]
    y = iris.target

    # Place the iris data into a pandas dataframe
    iris_df = pd.DataFrame(iris.data[:, [2, 3]], columns=iris.feature_names[2:])

    # View the data
    print(iris_df.head())

    # Print the classes of the dataset
    print('\n' + 'The classes in this data are ' + str(np.unique(y)))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)
    print('Training set has {} samples and test set has {} samples'.format(
        X_train.shape[0], X_test.shape[0]))
    print()
    return X_train, X_test, y_train, y_test, iris_df, X, y
##scale the training data before training
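A usage sketch for get_data(), assuming the usual imports for these iris snippets (numpy as np, pandas as pd, and sklearn's train_test_split):

from sklearn.datasets import load_iris

iris = load_iris()
X_train, X_test, y_train, y_test, iris_df, X, y = get_data(iris)
print(X_train.shape, X_test.shape)  # roughly a 70/30 split of the 150 samples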
Random_forest.py (project: Machine-Learning-Tools-on-Iris-Dataset, author: debjitpaul)
def get_data(iris):
    # Only petal length and petal width are considered
    X = iris.data[:, [2, 3]]
    y = iris.target

    # Place the iris data into a pandas dataframe
    iris_df = pd.DataFrame(iris.data[:, [2, 3]], columns=iris.feature_names[2:])

    # View the data
    print(iris_df.head())

    # Print the classes of the dataset
    print('\n' + 'The classes in this data are ' + str(np.unique(y)))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)
    print('Training set has {} samples and test set has {} samples'.format(
        X_train.shape[0], X_test.shape[0]))
    print()
    return X_train, X_test, y_train, y_test, iris_df, X, y
#scale training data before training
logistic.py (project: Machine-Learning-Tools-on-Iris-Dataset, author: debjitpaul)
def get_data(iris):
    # Only petal length and petal width are considered
    X = iris.data[:, [2, 3]]
    y = iris.target

    # Place the iris data into a pandas dataframe
    iris_df = pd.DataFrame(iris.data[:, [2, 3]], columns=iris.feature_names[2:])

    # View the data
    print(iris_df.head())

    # Print the classes of the dataset
    print('\n' + 'The classes in this data are ' + str(np.unique(y)))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)
    print('Training set has {} samples and test set has {} samples'.format(
        X_train.shape[0], X_test.shape[0]))
    print()
    return X_train, X_test, y_train, y_test, iris_df, X, y
##scale the training data before training
svm_oop.py (project: Machine-Learning-Tools-on-Iris-Dataset, author: debjitpaul)
def get_data(iris):
    # Only petal length and petal width are considered
    X = iris.data[:, [2, 3]]
    y = iris.target

    # Place the iris data into a pandas dataframe
    iris_df = pd.DataFrame(iris.data[:, [2, 3]], columns=iris.feature_names[2:])

    # View the data
    print(iris_df.head())

    # Print the classes of the dataset
    print('\n' + 'The classes in this data are ' + str(np.unique(y)))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)
    print('Training set has {} samples and test set has {} samples'.format(
        X_train.shape[0], X_test.shape[0]))
    print()
    return X_train, X_test, y_train, y_test, iris_df, X, y
##scale data before training it
AIserver.py (project: Using-machine-learning-to-detect-malicious-URLs, author: faizann24)
def TL():
    allurls = './data/data.csv'  # path to our all-urls file
    allurlscsv = pd.read_csv(allurls, sep=',', error_bad_lines=False)  # reading file
    allurlsdata = pd.DataFrame(allurlscsv)  # converting to a dataframe

    allurlsdata = np.array(allurlsdata)  # converting it into an array
    random.shuffle(allurlsdata)  # shuffling

    y = [d[1] for d in allurlsdata]  # all labels
    corpus = [d[0] for d in allurlsdata]  # all urls corresponding to a label (either good or bad)
    vectorizer = TfidfVectorizer(tokenizer=getTokens)  # get a vector for each url, using our customized tokenizer
    X = vectorizer.fit_transform(corpus)  # get the X vector

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # split into training and testing sets, 80/20 ratio

    lgs = LogisticRegression()  # using logistic regression
    lgs.fit(X_train, y_train)
    print(lgs.score(X_test, y_test))  # print the score; it comes out to be about 98%
    return vectorizer, lgs
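A hedged usage sketch: TL() returns the fitted vectorizer and logistic-regression model, which can then score previously unseen URLs (the URL below is made up, and the data file plus the custom getTokens tokenizer are assumed to be in place):

vectorizer, lgs = TL()
X_new = vectorizer.transform(['example.com/index.php?id=1'])
print(lgs.predict(X_new))  # 'good' or 'bad', following the labels in data.csv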
def de_lda(X, y):
    """ lda """
    dim = X.shape[1]
    # Note: if LDA is sklearn's LinearDiscriminantAnalysis, n_components is
    # capped at min(n_features, n_classes - 1), so a value this large only
    # works when the class count allows it.
    de = min(2000, dim)
    clf = LDA(n_components=de)
    _, x_mini, _, y_mini = train_test_split(X, y, test_size=0.33)
    clf.fit(x_mini, y_mini)

    def _func(X1, X2):
        return clf.transform(X1), clf.transform(X2)

    return _func
# def de_ps(X, y):
#     """ pearsonr method """
#     dim = X.shape[1]
#     de = min(2000, dim)
#     clf = SelectKBest(Pearsonr, k=de)
#     clf.fit(X, y)
#     def _func(X1, X2):
#         return clf.transform(X1), clf.transform(X2)
#     return _func
def prepare_train_data(self):
    texts, labels = load_corpus()
    volcabulary, train_words = get_volcabulary_and_list_words(texts)
    self.set_volcabulary(volcabulary)
    del volcabulary, texts

    words_index = self.get_word_index(train_words, self.volcabulary, self.max_words, self.max_length)
    # del reviews_words, volcabulary

    index = np.arange(words_index.shape[0])
    train_index, valid_index = train_test_split(
        index, train_size=0.8, random_state=520)

    train_data = words_index[train_index]
    valid_data = words_index[valid_index]
    labels = np.asarray(labels)
    train_labels = labels[train_index]
    valid_labels = labels[valid_index]
    print(train_data.shape)
    print(valid_data.shape)

    pickle.dump((words_index, labels), open("output/zh_comments.pkl", 'wb'))
    return train_data, train_labels, valid_data, valid_labels
def get_train_test_sets(X, y):
    """ Split X and y into a train and a test set.

    Args:
        X: the TF-IDF matrix where each line represents a document and each
           column represents a word, typically obtained by running
           transform_text() from the TP2.
        y: a binary vector where the i-th value indicates whether the i-th
           document is a spam or a ham.

    Returns:
        X_train: train subset of X
        X_test: test subset of X
        y_train: train subset of y
        y_test: test subset of y
    """
    return train_test_split(X, y)
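A small usage sketch, substituting scikit-learn's TfidfVectorizer for the transform_text() helper mentioned in the docstring (the documents and labels are toy values):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['win money now', 'meeting at noon', 'cheap pills online', 'project status update']
y = [1, 0, 1, 0]  # 1 = spam, 0 = ham
X = TfidfVectorizer().fit_transform(docs)
X_train, X_test, y_train, y_test = get_train_test_sets(X, y)
print(X_train.shape, X_test.shape)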
# Ex4.2, 4.3, 4.4
def GDBT_regression(X=train_df_munged, Y=label_df['SalePrice']):
    est = GradientBoostingRegressor(n_estimators=50, max_depth=3, learning_rate=0.1)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    est.fit(X_train, Y_train)
    y_test_pred = est.predict(X_test)

    # Plot residuals on the held-out test split
    plt.scatter(y_test_pred, y_test_pred - Y_test, c='blue', marker='s', label='error on test data')
    plt.title("Regression with GBDT")
    plt.xlabel("Predicted values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()

    # Plot predictions against real values
    plt.scatter(Y_test, y_test_pred, c="blue", marker="s", label="Test data")
    plt.title("Regression with GBDT")
    plt.xlabel("Real values")
    plt.ylabel("Predicted values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()

    print('rmse value:', rmse(Y_test, y_test_pred))
    return est
Stock_Prediction_Model_Random_Forrest.py (project: StockRecommendSystem, author: doncat99)
def best_window(self, X_train, y_train, w_min, w_max, t_min, t_max, f_min, f_max):
    w_opt = 0
    t_opt = 0
    f_opt = 0
    accur_opt = 0.
    x_w = []
    y_accu = []

    # range of window: w_min --> w_max
    for w in range(w_min, w_max + 1):
        #X, y = preprocess_data(w)
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        t, f, accur = self.best_forrest(X_train, y_train, 10, t_min, t_max, f_min, f_max)
        print('Window = ' + str(w) + ' days --> Best forest: number of trees: ' + str(t) + ', maximum of features: ' + str(f) + ', with accuracy: ' + str(accur))
        if accur > accur_opt:
            w_opt, t_opt, f_opt, accur_opt = w, t, f, accur
        x_w.append(w)
        y_accu.append(accur)

    print('Best window: w = ' + str(w_opt) + '. Best forest: number of trees: ' + str(t_opt) + ', maximum of features: ' + str(f_opt) + ', with accuracy: ' + str(accur_opt))
    return w_opt, t_opt, f_opt
Stock_Prediction_Model_Random_Forrest.py (project: StockRecommendSystem, author: doncat99)
def prepare_train_test_data(self, data_feature, LabelColumnName):
    firstloop = 1
    for ticker, data in data_feature.items():
        X, y = preprocessing_data(self.paras, data[0], LabelColumnName, one_hot_label_proc=False)
        X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X, y, test_size=0.3)
        # print('Train shape X:', X_train_temp.shape, ',y:', y_train_temp.shape)
        # print('Test shape X:', X_test_temp.shape, ',y:', y_test_temp.shape)

        if firstloop == 1:
            firstloop = 0
            X_train = X_train_temp
            X_test = X_test_temp
            y_train = y_train_temp
            y_test = y_test_temp
        else:
            X_train = np.append(X_train, X_train_temp, 0)
            X_test = np.append(X_test, X_test_temp, 0)
            y_train = np.append(y_train, y_train_temp, 0)
            y_test = np.append(y_test, y_test_temp, 0)

    #print('Train shape X:', X_train.shape, ',y:', y_train.shape)
    #print('Test shape X:', X_test.shape, ',y:', y_test.shape)
    return X_train, y_train, X_test, y_test
Stock_Prediction_Model_DBN.py (project: StockRecommendSystem, author: doncat99)
def prepare_train_test_data(self, data_feature, LabelColumnName):
    firstloop = 1
    for ticker, data in data_feature.items():
        X, y = preprocessing_data(self.paras, data[0], LabelColumnName, one_hot_label_proc=False)
        X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X, y, test_size=0.2)

        if firstloop == 1:
            firstloop = 0
            X_train = X_train_temp
            X_test = X_test_temp
            y_train = y_train_temp
            y_test = y_test_temp
        else:
            X_train = np.append(X_train, X_train_temp, 0)
            X_test = np.append(X_test, X_test_temp, 0)
            y_train = np.append(y_train, y_train_temp, 0)
            y_test = np.append(y_test, y_test_temp, 0)

    return X_train, y_train, X_test, y_test
Stock_Prediction_Recommand_System.py (project: StockRecommendSystem, author: doncat99)
def prepare_train_test_data(self, data_feature, LabelColumnName):
    firstloop = 1
    for ticker, data in data_feature.items():
        X, y = preprocessing_data(self.paras, data[0], LabelColumnName, one_hot_label_proc=False, array_format=False)
        X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X, y, test_size=0.2)
        # print('Train shape X:', X_train_temp.shape, ',y:', y_train_temp.shape)
        # print('Test shape X:', X_test_temp.shape, ',y:', y_test_temp.shape)

        if firstloop == 1:
            firstloop = 0
            X_train = X_train_temp
            X_test = X_test_temp
            y_train = y_train_temp
            y_test = y_test_temp
        else:
            # DataFrame.append returns a new frame, so the result must be reassigned
            X_train = X_train.append(X_train_temp, ignore_index=True)
            X_test = X_test.append(X_test_temp, ignore_index=True)
            y_train = np.append(y_train, y_train_temp, 0)
            y_test = np.append(y_test, y_test_temp, 0)

    # print('Train shape X:', X_train.shape, ',y:', y_train.shape)
    # print('Test shape X:', X_test.shape, ',y:', y_test.shape)
    return X_train, y_train, X_test, y_test
def return_train_dataset(self):
    """Returns train data set

    Returns:
        X (numpy.ndarray): Features
        y (numpy.ndarray): Labels
    """
    X, y = self.return_main_dataset()

    if self.test_dataset['method'] == 'split_from_main':
        X, X_test, y, y_test = train_test_split(
            X,
            y,
            test_size=self.test_dataset['split_ratio'],
            random_state=self.test_dataset['split_seed'],
            stratify=y
        )

    return X, y
def gen_train_data(dataset_paths):
    X_fs = []
    Y_fs = []
    for path in dataset_paths:
        images, gts, densities = load_images_and_gts(path)
        X_fs += images
        Y_fs += densities

    from sklearn.model_selection import train_test_split
    X_fs_train, X_fs_test, Y_fs_train, Y_fs_test = train_test_split(X_fs, Y_fs, test_size=0.2)
    X_train, Y_train = X_fs_train, Y_fs_train
    X_test, Y_test = X_fs_test, Y_fs_test
    print(len(X_train))

    X_train, Y_train = multiscale_pyramidal(X_train, Y_train)
    #X_train, Y_train = adapt_images_and_densities(X_train, Y_train, slice_w, slice_h)
    print(len(X_train))
    X_train, Y_train = generate_slices(X_train, Y_train, slice_w=patch_w, slice_h=patch_h, offset=8)
    print(len(X_train))
    #X_train, Y_train = crop_slices(X_train, Y_train)
    X_train, Y_train = flip_slices(X_train, Y_train)
    print(len(X_train))
    X_train, Y_train = samples_distribution(X_train, Y_train)
    print(len(X_train))
    X_train, Y_train = shuffle_slices(X_train, Y_train)
    return X_train, Y_train
def main(unused_argv):
    # Get the data.
    data_train = np.loadtxt(os.path.join(FLAGS.input_directory, 'train'), delimiter=',')
    data_test = np.loadtxt(os.path.join(FLAGS.input_directory, 'test'), delimiter=',')

    X_train, X_val, y_train, y_val = train_test_split(data_train[:, 1:], data_train[:, 0].astype(np.int32),
                                                      test_size=FLAGS.validation_ratio,
                                                      random_state=100)
    X_test = data_test[:, 1:]
    y_test = data_test[:, 0].astype(np.int32)

    # Convert to Examples and write the result to TFRecords.
    convert_to((X_train, y_train), PREFIX + '_train')
    convert_to((X_val, y_val), PREFIX + '_validation')
    convert_to((X_test, y_test), PREFIX + '_test')
def classification():
    # Generate a random binary classification problem.
    X, y = make_classification(n_samples=350, n_features=15, n_informative=10,
                               random_state=1111, n_classes=2,
                               class_sep=1., n_redundant=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,
                                                        random_state=1111)

    model = GradientBoostingClassifier(n_estimators=50, max_depth=4,
                                       max_features=8, learning_rate=0.1)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(predictions)
    print(predictions.min())
    print(predictions.max())
    print('classification, roc auc score: %s'
          % roc_auc_score(y_test, predictions))
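The min/max prints above suggest that this particular GradientBoostingClassifier returns continuous scores rather than hard labels (an assumption about this implementation, not a documented fact); if so, thresholding at 0.5 inside the same function would also give an accuracy figure:

from sklearn.metrics import accuracy_score

labels = (predictions >= 0.5).astype(int)  # assumes predictions are scores in [0, 1]
print('classification, accuracy: %s' % accuracy_score(y_test, labels))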