def trained_models():
dataset = datasets.load_breast_cancer()
X = dataset.data
y = dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=12345)
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
lr = LogisticRegression()
lr.fit(X_train, y_train)
svc_w_linear_kernel = SVC(kernel='linear')
svc_w_linear_kernel.fit(X_train, y_train)
svc_wo_linear_kernel = SVC()
svc_wo_linear_kernel.fit(X_train, y_train)
dummy = DummyClassifier()
dummy.fit(X_train, y_train)
return {'RF':rf, 'LR':lr, 'SVC_w_linear_kernel':svc_w_linear_kernel,
'Dummy':dummy, 'SVC_wo_linear_kernel':svc_wo_linear_kernel}
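A minimal usage sketch for trained_models(), assuming the usual scikit-learn imports (datasets, train_test_split, and the classifier classes) are present in the same module; the accuracy comparison is illustrative and not part of the original snippet.

from sklearn import datasets
from sklearn.model_selection import train_test_split

models = trained_models()
dataset = datasets.load_breast_cancer()
# Rebuild the identical split (same test_size and random_state as in trained_models)
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.3, random_state=12345)
for name, model in models.items():
    print(name, round(model.score(X_test, y_test), 3))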
Example source code for Python's train_test_split()
def train_model_with_cv(model, params, X, y):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
# Use the training data for parameter selection via grid search
gs_clf = GridSearchCV(model, params, n_jobs=1, cv=5)
gs_clf = gs_clf.fit(X_train, y_train)
model = gs_clf.best_estimator_
# Use best model and test data for final evaluation
y_pred = model.predict(X_test)
_f1 = f1_score(y_test, y_pred, average='micro')
_confusion = confusion_matrix(y_test, y_pred)
_precision = precision_score(y_test, y_pred, average='micro')  # micro-average to stay consistent with the f1 score above
_recall = recall_score(y_test, y_pred, average='micro')
_statistics = {'f1_score': _f1,
'confusion_matrix': _confusion,
'precision': _precision,
'recall': _recall
}
return model, _statistics
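A hedged example of calling train_model_with_cv(); the LinearSVC estimator and the parameter grid are illustrative choices rather than anything from the original project, and the imports the function relies on (GridSearchCV, the metric functions, train_test_split) are assumed to be in scope.

from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC

iris = load_iris()
params = {'C': [0.1, 1.0, 10.0]}  # hypothetical search grid
best_model, stats = train_model_with_cv(LinearSVC(), params, iris.data, iris.target)
print(stats['f1_score'], stats['precision'], stats['recall'])
print(stats['confusion_matrix'])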
Source file: two_sigma_financial_modelling.py (project: PortfolioTimeSeriesAnalysis, author: MizioAnd)
def outlier_identification(self, model, x_train, y_train):
# Split the training data into an additional held-out test set
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
print('\nOutlier shapes')
print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
model.fit(x_train_split, y_train_split)
y_predicted = model.predict(x_test_split)
residuals = np.absolute(y_predicted - y_test_split)
rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
outliers_mask = residuals >= rmse_pred_vs_actual
outliers_mask = np.concatenate([np.zeros((np.shape(y_train_split)[0],), dtype=bool), outliers_mask])
not_an_outlier = outliers_mask == 0
# Recombine the two halves into a full training set; the split was random, so ordering does not matter
x_out = np.insert(x_train_split, np.shape(x_train_split)[0], x_test_split, axis=0)
y_out = np.insert(y_train_split, np.shape(y_train_split)[0], y_test_split, axis=0)
return x_out[not_an_outlier, ], y_out[not_an_outlier, ]
Source file: two_sigma_financial_modelling.py (project: PortfolioTimeSeriesAnalysis, author: MizioAnd)
def predicted_vs_actual_y_xgb(self, xgb, best_nrounds, xgb_params, x_train_split, x_test_split, y_train_split,
y_test_split, title_name):
# Split the training data into an additional held-out test set
# x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
dtest_split = xgb.DMatrix(x_test_split)
print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
y_predicted = gbdt.predict(dtest_split)
plt.figure(figsize=(10, 5))
plt.scatter(y_test_split, y_predicted, s=20)
rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
plt.xlabel('Actual y')
plt.ylabel('Predicted y')
plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
plt.tight_layout()
def test_calibrate_final_model_classification():
np.random.seed(0)
df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()
# Take a third of our test data (a tenth of our overall data) for calibration
df_titanic_test, df_titanic_calibration = train_test_split(df_titanic_test, test_size=0.33, random_state=42)
column_descriptions = {
'survived': 'output'
, 'embarked': 'categorical'
, 'pclass': 'categorical'
}
ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)
ml_predictor.train(df_titanic_train, calibrate_final_model=True, X_test=df_titanic_calibration, y_test=df_titanic_calibration.survived)
test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)
print('test_score')
print(test_score)
assert -0.215 < test_score < -0.17
def get_titanic_binary_classification_dataset(basic=True):
try:
df_titanic = pd.read_csv(os.path.join('tests', 'titanic.csv'))
except Exception as e:
print('Error')
print(e)
dataset_url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
df_titanic = pd.read_csv(dataset_url)
# Do not write the index that pandas automatically creates
df_titanic.to_csv(os.path.join('tests', 'titanic.csv'), index=False)
df_titanic = df_titanic.drop(['boat', 'body'], axis=1)
if basic:
df_titanic = df_titanic.drop(['name', 'ticket', 'cabin', 'home.dest'], axis=1)
df_titanic_train, df_titanic_test = train_test_split(df_titanic, test_size=0.33, random_state=42)
return df_titanic_train, df_titanic_test
def get_twitter_sentiment_multilabel_classification_dataset():
file_name = os.path.join('tests', 'twitter_sentiment.csv')
try:
df_twitter = pd.read_csv(file_name, encoding='utf-8', engine='python')  # the 'rU' open mode is deprecated in Python 3; let pandas open the file
except Exception as e:
print('Error')
print(e)
dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
df_twitter = pd.read_csv(dataset_url)
# Do not write the index that pandas automatically creates
df_twitter.to_csv(file_name, index=False)
# Grab only 10% of the dataset - it runs much faster this way
df_twitter = df_twitter.sample(frac=0.1)
df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)
df_twitter_train, df_twitter_test = train_test_split(df_twitter, test_size=0.33, random_state=42)
return df_twitter_train, df_twitter_test
def __init__(self, name, X, y, task, test_size=None, cv=None, random_state=42):
self.name = name
self.X = X
self.y = y
self.task = task
self.random_state = random_state
if test_size is not None:
self.test_size = test_size
self.validation_method = "train_test_split"
self.X_train, self.X_test, self.y_train, self.y_test = \
model_selection.train_test_split(self.X, self.y, test_size=test_size, random_state=random_state)
elif cv is not None:
self.validation_method = "cv"
if task == "regression":
self.kfold = model_selection.KFold(n_splits=cv, shuffle=True, random_state=random_state)  # shuffle=True is required for random_state to have an effect
elif task == "classification":
self.kfold = model_selection.StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
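Only the __init__ is shown above, so the class name is unknown; the sketch below assumes it is called Task purely for illustration. Passing test_size selects the train_test_split path, while passing cv selects the k-fold path.

from sklearn.datasets import load_iris

iris = load_iris()
holdout = Task('iris-holdout', iris.data, iris.target, task='classification', test_size=0.25)
folds = Task('iris-cv', iris.data, iris.target, task='classification', cv=5)
print(holdout.validation_method)  # 'train_test_split'
print(folds.validation_method)    # 'cv'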
def crate_pre_train_model(x_,y_):
x_train, x_test, y_train, y_test = train_test_split(x_, y_, test_size=0.1, random_state=1)
dtrain = xgb.DMatrix( x_train, label=y_train)
dtest = xgb.DMatrix( x_test, label=y_test)
evallist = [(dtrain,'train'),(dtest,'eval')]
param = {'objective':'reg:linear','max_depth':3 }
param['nthread'] = 64
#param['min_child_weight'] = 15
#param['subsample'] = 1
#param['num_class'] = 7
plst = list(param.items())  # xgb.train expects a dict or a list of (key, value) pairs
num_round = 5000
bst = xgb.train( plst, dtrain, num_round,
evallist,early_stopping_rounds=100,
#obj=logregobj,
feval=evalerror
)
return bst
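crate_pre_train_model() passes feval=evalerror, but evalerror is not defined in this snippet. A minimal RMSE-style metric matching xgboost's custom-metric contract (feval(preds, dtrain) returning a (name, value) pair) might look like the sketch below; this is an assumption about the missing helper, not its actual implementation.

import numpy as np

def evalerror(preds, dtrain):
    # xgboost calls feval(preds, dtrain) and expects (metric_name, metric_value) back
    labels = dtrain.get_label()
    return 'rmse', float(np.sqrt(np.mean((preds - labels) ** 2)))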
# %% main
Source file: LinearRegression.py (project: software-suite-movie-market-analysis, author: 93lorenzo)
def readData():
vector = []
labels = []
indice = 0
for elem in gson:
try:
actors = gson.get(elem).get("actors")
directors = gson.get(elem).get("director")
writers = gson.get(elem).get("writer")
imdbRating = int(float(gson.get(elem).get("imdbRating")))
mediaAct, mediaDir, mediaWri = calcolaMedie(actors, directors, writers)
vect = [1,mediaAct, mediaDir, mediaWri]
vector.append(vect)
labels.append(int(imdbRating))  ## cast to discrete classes ##
except Exception:
continue
data = np.array(vector)
labels = np.array(labels)
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, train_size=0.4)
return train_data, train_labels, test_data, test_labels
Source file: LogisticRegression.py (project: software-suite-movie-market-analysis, author: 93lorenzo)
def readData(self):
vector = []
labels = []
indice = 0
for elem in gson:
actors = gson.get(elem).get("actors")
directors = gson.get(elem).get("director")
writers = gson.get(elem).get("writer")
imdbRating = int(float(gson.get(elem).get("imdbRating")))
mediaAct, mediaDir, mediaWri = self.calcolaMedie(actors, directors, writers)
vect = [1,mediaAct, mediaDir, mediaWri]
vector.append(vect)
labels.append(int(imdbRating))  ## cast to discrete classes ##
data = np.array(vector)
labels = np.array(labels)
train_data,test_data,train_labels,test_labels = train_test_split(data,labels, train_size= 0.1)
return train_data, train_labels,test_data,test_labels
def metrics_equal():
dataset_path = dpu.generate_equal_dataset()
dataset = dpu.load(dataset_path)
mm = SGDCModelManager()
mm.x_train, mm.x_test, mm.y_train, mm.y_test = train_test_split(dataset['inputs'], dataset['outputs'], random_state=42)
mm.train()
predicts = mm.predict(mm.x_test)
report = classification_report(mm.y_test, predicts)
return jsonify(status=200, message=report)
def probabilities_equal():
dataset_path = dpu.generate_equal_dataset()
dataset = dpu.load(dataset_path)
mm = SGDCModelManager()
mm.x_train, mm.x_test, mm.y_train, mm.y_test = train_test_split(dataset['inputs'], dataset['outputs'], random_state=42)
mm.train()
probabilities = mm.probabilities(mm.x_test)
result = []
for i in range(len(mm.y_test)):
result.append({
'probabilities': list(probabilities[i]),
'category': mm.y_test[i]
})
return jsonify(status=200, result=result)
def _preload_files_single_volunteer(dataset_dir, speaker_id, view_id, utterance_types):
all_videos = path.join(_current_path, 'splits/allVideos.txt')
u_list = _gen_utterance_list(utterance_types)
with open(all_videos, 'r') as f:
contents = f.read().splitlines()
video_list = [path.join(dataset_dir, line)
for line in contents
if 's' + str(speaker_id) + '_' in line
if 'v' + str(view_id) in line
if any(u in line for u in u_list)]
from sklearn.model_selection import train_test_split
train, test = train_test_split(video_list, test_size=0.30, random_state=0)
return train, test
def get_dataset(dataset_path='Data/Train_Data'):
# Getting all data from data path:
try:
X = np.load('Data/npy_train_data/X.npy')
Y = np.load('Data/npy_train_data/Y.npy')
except Exception:  # cached .npy arrays not found; rebuild them from the image folders
labels = listdir(dataset_path) # Geting labels
X = []
Y = []
for label in labels:
datas_path = dataset_path+'/'+label
for data in listdir(datas_path):
img = get_img(datas_path+'/'+data)
X.append(img)
Y.append(int(label))
# Create dateset:
X = np.array(X).astype('float32')/255.
Y = np.array(Y).astype('float32')
Y = to_categorical(Y, 2)
if not os.path.exists('Data/npy_train_data/'):
os.makedirs('Data/npy_train_data/')
np.save('Data/npy_train_data/X.npy', X)
np.save('Data/npy_train_data/Y.npy', Y)
X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)
return X, X_test, Y, Y_test
def _preprocess_PBEs(self, PBE_idx=None):
"""used for most types of shuffles"""
# compute PBEs
self.PBEs = self._st.bin(ds=self._ds)
if self.PBEs.n_epochs == 1:
raise ValueError("spike train is continuous, and does not have more than one event!")
if PBE_idx is not None:
self._trainidx, self._testidx = PBE_idx # tuple unpacking
else:
# split into train and test data
if self._random_state is not None:
self._trainidx, self._testidx = train_test_split(np.arange(self.PBEs.n_epochs), test_size=self._test_size, random_state=self._random_state)
else:
self._trainidx, self._testidx = train_test_split(np.arange(self.PBEs.n_epochs), test_size=self._test_size, random_state=1)
self._trainidx.sort()
self._testidx.sort()
self.PBEs_train = self.PBEs[self._trainidx]
self.PBEs_test = self.PBEs[self._testidx]
def load_data():
id2label = {}
label2id = {}
label_path = osp.abspath( osp.join(get_dataset_base(), "uci_yeast", "yeast.label") )
with open(label_path) as f:
for row in f:
cols = row.strip().split(" ")
id2label[int(cols[0])] = cols[1]
label2id[cols[1]] = int(cols[0])
data_path = osp.abspath( osp.join(get_dataset_base(), "uci_yeast", "yeast.data") )
with open(data_path) as f:
rows = f.readlines()
n_datas = len(rows)
X = np.zeros((n_datas, 8), dtype=np.float32)
y = np.zeros(n_datas, dtype=np.int32)
for i, row in enumerate(rows):
cols = re.split(" +", row.strip())
#print(list(map(float, cols[1:1+8])))
X[i,:] = list(map(float, cols[1:1+8]))
y[i] = label2id[cols[-1]]
train_idx, test_idx = train_test_split(range(n_datas), random_state=0, train_size=0.7, stratify=y)
return (X[train_idx], y[train_idx]), (X[test_idx], y[test_idx])
def get_train_data(corpus, count=None, **kwargs):  # count is used below but was missing from the signature
X = []
y = []
documents = corpus.iter_documents()
if count:
documents = islice(documents, count)
for document in tqdm(documents):
try:
text = document.raw()
sents = document.raw_sents()
labels = text2labels(text, sents)
features = sent2features(text)
X.append(features)
y.append(labels)
except Exception as exc:
# TODO:
pass
return train_test_split(X, y, **kwargs)
def get_pos_train_data(corpus, count=None, **kwargs):
X = []
y = []
documents = corpus.iter_documents()
if count:
documents = islice(documents, count)
for document in tqdm(documents):
sents = document.iter_tagged_sents()
for sent in sents:
tokens = []
labels = []
for token, tags in sent:
tags = tags.split(',')
tokens.append(token)
labels.append(tags[0]) # TODO:
X.append(sent2posfeatures(tokens))
y.append(labels)
return train_test_split(X, y, **kwargs)
def get_train_data(corpus, count=None, **kwargs):
X = []
y = []
documents = corpus.iter_documents()
if count:
documents = islice(documents, count)
for document in tqdm(documents):
try:
text = document.raw()
words = document.words()
labels = text2labels(text, words)
features = list(text2features(text))
X.append(features)
y.append(labels)
except Exception as exc:
# TODO:
continue
return train_test_split(X, y, **kwargs)
def train_test_split_per_class(X, y, train_size=None, test_size=None):
sh = np.array(X.shape)
num_classes = len(np.bincount(y))
sh[0] = 0
X_train_arr = np.zeros(sh, dtype=X.dtype)
X_test_arr = np.zeros(sh, dtype=X.dtype)
y_train_arr = np.zeros((0), dtype=y.dtype)
y_test_arr = np.zeros((0), dtype=y.dtype)
for i in range(num_classes):
X_train, X_test, y_train, y_test = train_test_split(X[y==i], y[y==i],
train_size=train_size,
test_size=test_size)
X_train_arr = np.append(X_train_arr, X_train, axis=0)
X_test_arr = np.append(X_test_arr, X_test, axis=0)
y_train_arr = np.append(y_train_arr, y_train)
y_test_arr = np.append(y_test_arr, y_test)
return X_train_arr, X_test_arr, y_train_arr, y_test_arr
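A quick illustrative check of train_test_split_per_class() on the iris data: every class should appear in both halves in the requested proportion. The iris loader import is the only assumption beyond the function above.

import numpy as np
from sklearn.datasets import load_iris

iris = load_iris()
X_tr, X_te, y_tr, y_te = train_test_split_per_class(iris.data, iris.target, test_size=0.2)
print(np.bincount(y_tr))  # roughly [40 40 40]
print(np.bincount(y_te))  # roughly [10 10 10]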
def outlier_identification(self, model, x_train, y_train):
# Split the training data into an additional held-out test set
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
print('\nOutlier shapes')
print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
model.fit(x_train_split, y_train_split)
y_predicted = model.predict(x_test_split)
residuals = np.absolute(y_predicted - y_test_split)
rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
outliers_mask = residuals >= rmse_pred_vs_actual
# outliers_mask = np.insert(np.zeros((np.shape(y_train_split)[0],), dtype=np.int), np.shape(y_train_split)[0],
# outliers_mask)
outliers_mask = np.concatenate([np.zeros((np.shape(y_train_split)[0],), dtype=bool), outliers_mask])
not_an_outlier = outliers_mask == 0
# Recombine the two halves into a full training set; the split was random, so ordering does not matter
x_out = np.insert(x_train_split, np.shape(x_train_split)[0], x_test_split, axis=0)
y_out = np.insert(y_train_split, np.shape(y_train_split)[0], y_test_split, axis=0)
return x_out[not_an_outlier, ], y_out[not_an_outlier, ]
def predicted_vs_actual_sale_price(self, x_train, y_train, title_name):
# Split the training data into an additional held-out test set
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
0.3, 0.6, 1],
max_iter=50000, cv=10)
# lasso = RidgeCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
# 0.3, 0.6, 1], cv=10)
lasso.fit(x_train_split, y_train_split)
y_predicted = lasso.predict(X=x_test_split)
plt.figure(figsize=(10, 5))
plt.scatter(y_test_split, y_predicted, s=20)
rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
plt.xlabel('Actual Sale Price')
plt.ylabel('Predicted Sale Price')
plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
plt.tight_layout()
def predicted_vs_actual_sale_price_xgb(self, xgb_params, x_train, y_train, seed, title_name):
# Split the training data into an additional held-out test set
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
dtest_split = xgb.DMatrix(x_test_split)
res = xgb.cv(xgb_params, dtrain_split, num_boost_round=1000, nfold=4, seed=seed, stratified=False,
early_stopping_rounds=25, verbose_eval=10, show_stdv=True)
best_nrounds = res.shape[0] - 1
print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
y_predicted = gbdt.predict(dtest_split)
plt.figure(figsize=(10, 5))
plt.scatter(y_test_split, y_predicted, s=20)
rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
plt.xlabel('Actual Sale Price')
plt.ylabel('Predicted Sale Price')
plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
plt.tight_layout()
def fit(self, X, y):
if self.use_mspe:
lgb_train = lgb.Dataset(X, y,
weight=np.ones(X.shape[0]),
free_raw_data=False)
lgb_test = lgb.Dataset(X, y, reference=lgb_train,
weight=np.ones(X.shape[0]),
free_raw_data=False)
self.gbm = lgb.train(
self.kwargs,
lgb_train,
num_boost_round=10,
fobj=mspe,
feval=evalerror_lgbm,
valid_sets=lgb_test)
else:
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3)
#lgb_test = lgb.Dataset(X, y, reference=lgb_train,
# weight=np.ones(X.shape[0]),
# free_raw_data=False)
# Fit on the training split and early-stop against the held-out split
self.gbm.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_test, y_test)], verbose=False)
# print("gbm best_iteration =", self.gbm.best_iteration)
def validate_formula(formula, training_data, column_being_predicted, cross_val_n=3, validation_size=.10):
'''
Accept a formula in the StatsModels.formula.api style, some training data and
some test values that must match the value being predicted by the formula.
returns: trained_model, cross_scores
'''
cross_val_scores = []
for _ in range(cross_val_n):
X_train, X_test, _, _ = train_test_split(
training_data,
training_data[column_being_predicted],
test_size=validation_size
)
model = smf.ols(formula=formula, data=X_train).fit()
test_values = X_test[column_being_predicted]
score = root_mean_log_squared_error(model, X_test, test_values)
cross_val_scores.append(score)
return (model, cross_val_scores)
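An illustrative call to validate_formula() on a synthetic DataFrame; the column names and the formula are made up, and the root_mean_log_squared_error helper used inside the function is assumed to be defined in the same module.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({'sqft': rng.uniform(500, 3000, 200)})
df['price'] = 100 * df['sqft'] + rng.normal(0, 5000, 200)
model, scores = validate_formula('price ~ sqft', df, 'price', cross_val_n=3)
print(np.mean(scores))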
def test_predict_uncertainty_returns_dict_for_one_value():
np.random.seed(0)
df_boston_train, df_boston_test = utils.get_boston_regression_dataset()
column_descriptions = {
'MEDV': 'output'
, 'CHAS': 'categorical'
}
df_boston_train, uncertainty_data = train_test_split(df_boston_train, test_size=0.5)
ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)
ml_predictor.train(df_boston_train, perform_feature_selection=True, train_uncertainty_model=True, uncertainty_data=uncertainty_data)
test_list = df_boston_test.to_dict('records')
for item in test_list:
prediction = ml_predictor.predict_uncertainty(item)
assert isinstance(prediction, dict)
def test_score_uncertainty():
np.random.seed(0)
df_boston_train, df_boston_test = utils.get_boston_regression_dataset()
column_descriptions = {
'MEDV': 'output'
, 'CHAS': 'categorical'
}
df_boston_train, uncertainty_data = train_test_split(df_boston_train, test_size=0.5)
ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)
ml_predictor.train(df_boston_train, perform_feature_selection=True, train_uncertainty_model=True, uncertainty_data=uncertainty_data)
uncertainty_score = ml_predictor.score_uncertainty(df_boston_test, df_boston_test.MEDV)
print('uncertainty_score')
print(uncertainty_score)
assert uncertainty_score > -0.2