def outlier_prediction(x_train, y_train):
    """Drop the training samples that an isolation forest labels as outliers.

    An ``IsolationForest`` (``max_samples=100``, seeded with 42) is fitted on
    ``x_train`` using all features and then asked to label those same rows.
    ``predict`` yields 1 for an inlier and -1 for an outlier; only the rows
    labelled 1 are kept.

    Args:
        x_train: Feature matrix. It is indexed with a boolean row mask, so it
            is assumed to be a NumPy array -- TODO confirm against callers.
        y_train: Targets aligned row-for-row with ``x_train`` (same
            assumption about the container type).

    Returns:
        Tuple ``(x_train_modified, y_train_modified)`` containing only the
        rows that were predicted to be inliers.
    """
    rng = np.random.RandomState(42)
    clf_all_features = IsolationForest(max_samples=100, random_state=rng)
    clf_all_features.fit(x_train)
    # Label every training sample using all features.
    y_pred_train = clf_all_features.predict(x_train)
    # Vectorised comparison instead of np.array(map(...)): on Python 3 map()
    # is lazy, so wrapping it in np.array() yields a 0-d object array that
    # cannot be used as a row mask.
    inlier_mask = np.asarray(y_pred_train) == 1
    x_train_modified = x_train[inlier_mask, ]
    y_train_modified = y_train[inlier_mask, ]
    return x_train_modified, y_train_modified
python类IsolationForest()的实例源码
def updateWindow(window,buf,maxContainSize):
if len(buf) >= maxContainSize:#??buf??
print "buffer full "
window = clusteringReminMost(window)
print "window size after clustering without adding buffer :",len(window)
for i in buf:
window.append(i)
ilf = IsolationForest(n_estimators=60)
ilf.fit(window)
print "isolation update finished"
else: #???????buf????
print "higher than threads"
for i in buf:
window.append(i)
ilf = IsolationForest(n_estimators=60)
ilf.fit(window)
print "isolation update finished"
return window,ilf
def init(idlist,d,dblack,outcome,winsize=200,sleeptime = 5):
#????
window = []
while True:
print "fetching at %s" %ctime()
data = getdata()
loadvalue(data, d,dblack)
outvalue = extract(d,idlist)
window.append(outvalue)
if len(window) > winsize:
break
sleep(sleeptime)
#?????
ilf = IsolationForest(n_estimators=60)
ilf.fit(window)
print ilf.predict(window)
for i in ilf.predict(window):
outcome.append(i)
#??
return ilf,window
onlinedetectWithlittleData.py 文件源码
项目:onlineDetectForHadoop
作者: DawnsonLi
项目源码
文件源码
阅读 21
收藏 0
点赞 0
评论 0
def updateWindow(window,buf,maxContainSize):
if len(buf) >= maxContainSize:#??buf??
print "buffer full "
window = clusteringReminMost(window)
print "window size after clustering without adding buffer :",len(window)
for i in buf:
window = window.append(i)
ilf = IsolationForest(n_estimators=100,verbose=2,)
ilf.fit(window)
print "isolation update finished"
else: #???????buf????
print "higher than threads"
for i in buf:
window = window.append(i)
ilf = IsolationForest(n_estimators=100,verbose=2,)
ilf.fit(window)
print "isolation update finished"
return window,ilf
birchForChangeWindowSize.py 文件源码
项目:onlineDetectForHadoop
作者: DawnsonLi
项目源码
文件源码
阅读 21
收藏 0
点赞 0
评论 0
def updateWindow(window,buf,maxContainSize):
if len(buf) >= maxContainSize:#??buf??
print window################################################
print "buffer full "
window = clusteringReminMost(window)
print "window size after clustering without adding buffer :",len(window)
for i in buf:
window.append(i)
#print i
ilf = IsolationForest(n_estimators=100)
ilf.fit(window)
print "isolation update finished"
else: #???????buf????
print "higher than threads"
for i in buf:
window.append(i)
ilf = IsolationForest(n_estimators=100)
ilf.fit(window)
print "isolation update finished"
return window,ilf
def updateWindow(window,buf,maxContainSize):
if len(buf) >= maxContainSize:#??buf??
print "buffer full "
window = clusteringReminMost(window)
print "window size after clustering without adding buffer :",len(window)
for i in buf:
window.append(i)
ilf = IsolationForest(n_estimators=100,verbose=2,)
ilf.fit(window)
print "isolation update finished"
else: #???????buf????
print "higher than threads"
for i in buf:
window.append(i)
ilf = IsolationForest(n_estimators=100,verbose=2,)
ilf.fit(window)
print "isolation update finished"
return window,ilf
def updateWindow(window,buf,maxContainSize):
if len(buf) >= maxContainSize:#??buf??
print "buffer full "
for i in buf:
window.append(i)
ilf = IsolationForest(n_estimators=100,contamination=0.01)
ilf.fit(window)
print "isolation update finished"
else: #???????buf????
print "higher than threads"
for i in buf:
window.append(i)
ilf = IsolationForest(n_estimators=100,contamination=0.01)
ilf.fit(window)
print "isolation update finished"
return window,ilf
def init(idlist,d,dblack,outcome,winsize=200,sleeptime = 5):
#????
window = []
while True:
print "fetching at %s" %ctime()
data = getdata()
loadvalue(data, d,dblack)
outvalue = extract(d,idlist)
window.append(outvalue)
if len(window) > winsize:
break
sleep(sleeptime)
#?????
ilf = IsolationForest(n_estimators=100,contamination=0.01)
ilf.fit(window)
print ilf.predict(window)
for i in ilf.predict(window):
outcome.append(i)
#??
return ilf,window
def updateWindow(l_sys, l_namenode, l_FS, l_RPC,cont):
ilf = IsolationForest(n_estimators=100, contamination=cont)
query = 'select * from ganglia where w_fs >0 and w_namenode>0 and w_rpc >0 limit 1024;' # ???? ???
client = DataFrameClient(host='127.0.0.1', port=8086, username='root', password='root', database='testdb')
result = client.query(query, chunked=False)
data = result['ganglia']
d_sys = data[l_sys]
d_namenode = data[l_namenode]
d_FS = data[l_FS]
d_RPC = data[l_RPC]
ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
ilf_FS = IsolationForest(n_estimators=100, contamination=cont)
ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)
ilf_sys.fit(d_sys)
ilf_namenode.fit(d_namenode)
ilf_FS.fit(d_FS)
ilf_RPC.fit(d_RPC)
print "update finished"
return ilf_sys,ilf_namenode,ilf_FS,ilf_RPC
def transform(self, X, **transform_params):
    """Return ``X`` with the rows labelled as outliers removed.

    A new ``IsolationForest`` is fitted on the columns selected by
    ``_columns_to_apply`` (falling back to ``_numeric_columns`` when that
    returns ``None``). Rows predicted as inliers (label 1) are kept, both in
    the analysed columns and in the remaining columns from ``_rest_columns``.

    Args:
        X: Input frame; assumed to be a pandas ``DataFrame`` since the result
            is assembled with ``pd.concat`` -- TODO confirm.
        **transform_params: Accepted for interface compatibility; unused.

    Returns:
        ``X`` unchanged when it has fewer than ``1 / contamination`` rows,
        otherwise the column-wise concatenation of the filtered analysed
        columns and the filtered remaining columns.
    """
    # With fewer than 1/contamination rows, contamination * n_rows is below 1.
    if X.shape[0] < 1/self.contamination:
        return X
    self.isolation_forest = IsolationForest(contamination=self.contamination,
                                            n_estimators=self.n_estimators,
                                            n_jobs=self.n_jobs)
    to_analyze = self._columns_to_apply(X)
    if to_analyze is None:
        to_analyze = self._numeric_columns(X)
    rest = self._rest_columns(X, to_analyze)
    self.isolation_forest.fit(to_analyze)
    labels = self.isolation_forest.predict(to_analyze)
    # Filter with a boolean mask rather than a temporary '_outlier' column:
    # writing that column into the frames returned by the helpers could leak
    # it into (or clobber a same-named column of) the caller's data.
    inlier_mask = labels == 1
    to_analyze = to_analyze[inlier_mask]
    rest = rest[inlier_mask]
    if self.verbose:
        print('%s Now has %s' % (self.class_name, to_analyze.shape[0]))
    return pd.concat((to_analyze, rest), axis=1)
def test_iforest_error():
    """Check that deficient ``max_samples`` settings are reported by ``fit``."""
    X = iris.data
    # Out-of-range numeric values must be rejected.
    for bad_max_samples in (-1, 0.0, 2.0):
        invalid = IsolationForest(max_samples=bad_max_samples)
        assert_raises(ValueError, invalid.fit, X)
    # The dataset has fewer than 256 samples: asking explicitly for more
    # samples than available must warn, while the default 'auto' must not.
    oversized = IsolationForest(max_samples=1000)
    assert_warns_message(UserWarning,
                         "max_samples will be set to n_samples for estimation",
                         oversized.fit, X)
    automatic = IsolationForest(max_samples='auto')
    assert_no_warnings(automatic.fit, X)
    # An unknown string is rejected as well.
    unknown = IsolationForest(max_samples='foobar')
    assert_raises(ValueError, unknown.fit, X)
def test_iforest_performance():
    """Test Isolation Forest performs well"""
    # Generate train/test data
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    # Only the first 100 samples are used for training. The earlier
    # np.r_[X + 2, X - 2] assignment was overwritten immediately (a dead
    # store) and has been dropped.
    X_train = X[:100]
    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    # 0 marks the 20 held-out regular samples, 1 the 20 generated outliers.
    y_test = np.array([0] * 20 + [1] * 20)
    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)
    # NOTE(review): the AUC check needs y_pred to grow with abnormality. In
    # released scikit-learn, predict() returns +1 for inliers and -1 for
    # outliers, which would invert the ranking -- confirm which API version
    # this test targets (newer tests use -clf.decision_function(X_test)).
    y_pred = clf.predict(X_test)
    # check that the ranking separates outliers from regular samples
    assert_greater(roc_auc_score(y_test, y_pred), 0.98)
def __init__(self, _id, _config):
    """Initialise the base class and store the configured sample count.

    ``_config['nb_samples']`` is converted with ``int`` and kept as
    ``self._nb_samples``.
    """
    super(IsolationForest, self).__init__(_id, _config)
    configured_samples = _config['nb_samples']
    self._nb_samples = int(configured_samples)
def get_default_config():
    """Return the default configuration mapping for this module.

    The mapping holds the class name under ``'module'`` and the module-level
    ``N_SAMPLES`` constant under ``'nb_samples'``.
    """
    defaults = {'module': IsolationForest.__name__}
    defaults['nb_samples'] = N_SAMPLES
    return defaults
def _get_best_detector(self, train):
    """Fit a default-parameter ``ensemble.IsolationForest`` on ``train``.

    Returns:
        The fitted detector.
    """
    fitted = ensemble.IsolationForest()
    fitted.fit(train)
    return fitted
def setUp(self):
    """Create the ``IsolationForest`` module under test with a fake config."""
    super(TestIsolationForest, self).setUp()
    fake_config = {"module": "fake", "nb_samples": 1000}
    self.if_sml = isolation_forest.IsolationForest("fakeid", fake_config)
def test_learn_structure(self):
    """``learn_structure`` must return an ``ensemble.IsolationForest``."""
    learned = self.if_sml.learn_structure(self.get_testing_data())
    self.assertIsInstance(learned, ensemble.IsolationForest)
User_Interface.py 文件源码
项目:yttresearch-machine-learning-algorithms-analysis
作者: gdemos01
项目源码
文件源码
阅读 29
收藏 0
点赞 0
评论 0
def exportPresentationData(classifier,action):
    """Prompt for a data directory and run the selected classifier on it.

    ``classifier`` is converted with ``int`` and selects one of eleven
    estimators. Choices 5 (``svm.LinearSVC``) and 8 (``IsolationForest``) are
    run through ``classify_type2``; all others go through ``classify``. Any
    other number does nothing after the prompt.

    Args:
        classifier: Value convertible by ``int`` (1-11 are recognised).
        action: Forwarded unchanged to ``classify`` / ``classify_type2``.
    """
    data_dir = input('Give Data Directory: ')
    choice = int(classifier)
    # Each entry is built lazily, so only the chosen estimator is created.
    recipes = {
        1: lambda: (GradientBoostingClassifier(), classify),
        2: lambda: (LogisticRegression(), classify),
        3: lambda: (KNeighborsClassifier(n_neighbors=5), classify),
        4: lambda: (DecisionTreeClassifier(), classify),
        5: lambda: (svm.LinearSVC(), classify_type2),
        6: lambda: (RandomForestClassifier(), classify),
        7: lambda: (ExtraTreesClassifier(), classify),
        8: lambda: (IsolationForest(), classify_type2),
        9: lambda: (AdaBoostClassifier(n_estimators=100), classify),
        10: lambda: (BaggingClassifier(DecisionTreeClassifier()), classify),
        11: lambda: (
            VotingClassifier(
                estimators=[('abdt', GradientBoostingClassifier()),
                            ('gbdt', AdaBoostClassifier())],
                voting='soft'),
            classify),
    }
    recipe = recipes.get(choice)
    if recipe is None:
        return
    clf, runner = recipe()
    runner(data_dir, clf, action)
Exporter.py 文件源码
项目:yttresearch-machine-learning-algorithms-analysis
作者: gdemos01
项目源码
文件源码
阅读 30
收藏 0
点赞 0
评论 0
def exportPresentationData(classifier,action,dir):
    """Run the selected classifier on the data found in ``dir``.

    ``classifier`` is converted with ``int`` and selects one of eleven
    estimators. Choices 5 (``svm.LinearSVC``) and 8 (``IsolationForest``) are
    run through ``classify_type2``; all others go through ``classify``. Any
    other number does nothing.

    Args:
        classifier: Value convertible by ``int`` (1-11 are recognised).
        action: Forwarded unchanged to ``classify`` / ``classify_type2``.
        dir: Data directory forwarded to ``classify`` / ``classify_type2``.
    """
    choice = int(classifier)
    # Each entry is built lazily, so only the chosen estimator is created.
    recipes = {
        1: lambda: (GradientBoostingClassifier(), classify),
        2: lambda: (LogisticRegression(), classify),
        3: lambda: (KNeighborsClassifier(n_neighbors=5), classify),
        4: lambda: (DecisionTreeClassifier(), classify),
        5: lambda: (svm.LinearSVC(), classify_type2),
        6: lambda: (RandomForestClassifier(), classify),
        7: lambda: (ExtraTreesClassifier(), classify),
        8: lambda: (IsolationForest(), classify_type2),
        9: lambda: (AdaBoostClassifier(n_estimators=100), classify),
        10: lambda: (BaggingClassifier(DecisionTreeClassifier()), classify),
        11: lambda: (
            VotingClassifier(
                estimators=[('abdt', GradientBoostingClassifier()),
                            ('gbdt', AdaBoostClassifier())],
                voting='soft'),
            classify),
    }
    recipe = recipes.get(choice)
    if recipe is None:
        return
    clf, runner = recipe()
    runner(dir, clf, action)
onlinedetectWithlittleData.py 文件源码
项目:onlineDetectForHadoop
作者: DawnsonLi
项目源码
文件源码
阅读 23
收藏 0
点赞 0
评论 0
def init(idlist,d,dblack,winsize=50):
data = getdata()
loadvalue(data, d,dblack)
outvalue = extract(d,idlist)
print len(outvalue)
reshapevalue = np.array(outvalue).reshape(1,-1)
window = DataFrame(reshapevalue)
buf = []#################
while True:
print "fetching at %s" %ctime()
data = getdata()
loadvalue(data, d,dblack)
outvalue = extract(d,idlist)
reshapevalue = np.array(outvalue).reshape(1,-1)
window = window.append(DataFrame(reshapevalue))#??dataframe???1row * xcolums
buf.append(DataFrame(reshapevalue))
print len(window)
if len(window) > winsize:
break
sleep(5)
ilf = IsolationForest(n_estimators=100,verbose=2,)
ilf.fit(window)
print ilf.predict(window)
print "__________________"
for i in buf:
print ilf.predict(i)
return ilf,window
def updateWindow(buf, cont):
ilf = IsolationForest(n_estimators=100, contamination=cont)
ilf.fit(buf) # ??buf??????
print "isolation update finished"
return ilf
def init(l_sys, l_namenode, l_FS, l_RPC, d, dwhite, winsize=200, sleeptime=15, cont=0.01):
# ????
win_sys = []
win_namenode = []
win_FS = []
win_RPC = []
while True:
print "fetching at %s" % ctime()
data = getdata()
loadvalue(data, d, dwhite)
o_sys, o_namenode, o_FS, o_RPC = extract(d, l_sys, l_namenode, l_FS, l_RPC)
# ??????????
win_sys.append(o_sys)
win_namenode.append(o_namenode)
win_FS.append(o_FS)
win_RPC.append(o_RPC)
if len(win_sys) > winsize: # ????????????
break
sleep(sleeptime)
# ?????
ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
ilf_FS = IsolationForest(n_estimators=100, contamination=cont)
ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)
# ??fit
ilf_sys.fit(win_sys)
ilf_namenode.fit(win_namenode)
ilf_FS.fit(win_FS)
ilf_RPC.fit(win_RPC)
print ilf_sys.predict(win_sys)
print ilf_namenode.predict(win_namenode)
print ilf_FS.predict(win_FS)
print ilf_RPC.predict(win_RPC)
# ??????????????
return ilf_sys, ilf_namenode, ilf_FS, ilf_RPC
def updateWindow(buf,cont):
ilf = IsolationForest(n_estimators=100,contamination=cont)
ilf.fit(buf)#??buf??????
print "isolation update finished"
return ilf
def init(l_sys,l_namenode,l_FS,l_RPC,l_queue,d,dwhite,winsize=200,sleeptime = 15,cont=0.01):
#????
win_sys = []
win_namenode = []
win_FS = []
win_RPC =[]
win_queue = []
while True:
print "fetching at %s" %ctime()
data = getdata()
loadvalue(data, d,dwhite)
o_sys,o_namenode,o_FS,o_RPC,o_queue = extract(d,l_sys,l_namenode,l_FS,l_RPC,l_queue)
#??????????
win_sys.append(o_sys)
win_namenode.append(o_namenode)
win_FS.append(o_FS)
win_RPC.append(o_RPC)
win_queue.append(o_queue)
if len(win_sys) > winsize:#????????????
break
sleep(sleeptime)
#?????
ilf_sys = IsolationForest(n_estimators=100,contamination=cont)
ilf_namenode = IsolationForest(n_estimators=100,contamination=cont)
ilf_FS = IsolationForest(n_estimators=100,contamination=cont)
ilf_RPC = IsolationForest(n_estimators=100,contamination=cont)
ilf_queue = IsolationForest(n_estimators=100,contamination=cont)
#??fit
ilf_sys.fit(win_sys)
ilf_namenode.fit(win_namenode)
ilf_FS.fit(win_FS)
ilf_RPC.fit(win_RPC)
ilf_queue.fit(win_queue)
#??????????????
return ilf_sys,ilf_namenode,ilf_FS,ilf_queue,ilf_RPC
def updateWindow(buf,cont):
ilf = IsolationForest(n_estimators=100,contamination=cont)
ilf.fit(buf)#??buf??????
print "isolation update finished"
return ilf
def updateWindow(l_sys, l_namenode, l_FS, l_RPC,cont,limit):
ilf = IsolationForest(n_estimators=100, contamination=cont)
client = DataFrameClient(host='127.0.0.1', port=8086, username='root', password='root', database='testdb')
#???
data_sys = sampleWithDecay(client,limit,'select * from ganglia where w_system >0 ORDER BY time DESC limit 1500')#????limit????????
d_sys = data_sys[l_sys]
data_fs = sampleWithDecay(client, limit, 'select * from ganglia where w_fs >0 ORDER BY time DESC limit 1500')
d_FS = data_fs[l_FS]
data_namenode = sampleWithDecay(client, limit, 'select * from ganglia where w_namenode >0 ORDER BY time DESC limit 1500')
d_namenode = data_namenode[l_namenode]
data_rpc = sampleWithDecay(client, limit, 'select * from ganglia where w_rpc >0 ORDER BY time DESC limit 1500')
d_RPC = data_rpc[l_RPC]
ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
ilf_FS = IsolationForest(n_estimators=100, contamination=cont)
ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)
#?????????
ilf_sys.fit(d_sys)
ilf_namenode.fit(d_namenode)
ilf_FS.fit(d_FS)
ilf_RPC.fit(d_RPC)
print "update finished"
return ilf_sys,ilf_namenode,ilf_FS,ilf_RPC
def updateWindow(l_sys, l_namenode, l_FS, l_RPC,cont,limit):
ilf = IsolationForest(n_estimators=100, contamination=cont)
client = DataFrameClient(host='127.0.0.1', port=8086, username='root', password='root', database='testdb')
#???
data_sys = sampleWithDecay(client,limit,'select * from ganglia where w_system >0 ORDER BY time DESC')
d_sys = data_sys[l_sys]
data_fs = sampleWithDecay(client, limit, 'select * from ganglia where w_fs >0 ORDER BY time DESC')
d_FS = data_fs[l_FS]
data_namenode = sampleWithDecay(client, limit, 'select * from ganglia where w_namenode >0 ORDER BY time DESC')
d_namenode = data_namenode[l_namenode]
data_rpc = sampleWithDecay(client, limit, 'select * from ganglia where w_rpc >0 ORDER BY time DESC')
d_RPC = data_rpc[l_RPC]
ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
ilf_FS = IsolationForest(n_estimators=100, contamination=cont)
ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)
#?????????
ilf_sys.fit(d_sys)
ilf_namenode.fit(d_namenode)
ilf_FS.fit(d_FS)
ilf_RPC.fit(d_RPC)
print "update finished"
return ilf_sys,ilf_namenode,ilf_FS,ilf_RPC
def init(l_sys, l_namenode, l_FS, l_RPC, sleeptime=15, cont=0.01,limit = 300):
# ?????
ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
ilf_FS = IsolationForest(n_estimators=50, contamination=cont)
ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)
#??????????
client = DataFrameClient(host='127.0.0.1', port=8086, username='root', password='root', database='testdb')
data_sys = sampleWithDecay(client, limit, 'select * from ganglia where w_system >0 ORDER BY time DESC')
d_sys = data_sys[l_sys]
data_fs = sampleWithDecay(client, limit, 'select * from ganglia where w_fs >0 ORDER BY time DESC')
d_FS = data_fs[l_FS]
data_namenode = sampleWithDecay(client, limit, 'select * from ganglia where w_namenode >0 ORDER BY time DESC')
d_namenode = data_namenode[l_namenode]
data_rpc = sampleWithDecay(client, limit, 'select * from ganglia where w_rpc >0 ORDER BY time DESC')
d_RPC = data_rpc[l_RPC]
print len(d_sys)
print len(d_FS)
print len(d_namenode)
print len(d_RPC)
# ??fit
ilf_sys.fit(d_sys)
ilf_namenode.fit(d_namenode)
ilf_FS.fit(d_FS)
ilf_RPC.fit(d_RPC)
print ilf_FS.predict(d_FS)
return ilf_sys, ilf_namenode, ilf_FS, ilf_RPC
def updateWindow(buf,cont):
ilf = IsolationForest(n_estimators=100,contamination=cont)
ilf.fit(buf)#??buf??????
print "isolation update finished"
return ilf
def test_iforest():
    """Smoke-test fit/predict over a small grid of Isolation Forest settings."""
    X_train = np.array([[0, 1], [1, 2]])
    X_test = np.array([[2, 1], [1, 1]])
    search_space = {"n_estimators": [3],
                    "max_samples": [0.5, 1.0, 3],
                    "bootstrap": [True, False]}
    grid = ParameterGrid(search_space)
    # Warnings are suppressed for the whole sweep; the test only checks that
    # every combination fits and predicts without raising.
    with ignore_warnings():
        for params in grid:
            model = IsolationForest(random_state=rng, **params)
            model.fit(X_train).predict(X_test)