def test_iforest_sparse():
    """Check IForest for various parameter settings on sparse input."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "bootstrap": [True, False]})

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        for params in grid:
            # Trained on sparse format
            sparse_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train_sparse)
            sparse_results = sparse_classifier.predict(X_test_sparse)
            # Trained on dense format
            dense_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train)
            dense_results = dense_classifier.predict(X_test)

            assert_array_equal(sparse_results, dense_results)
def test_recalculate_max_depth():
    """Check that max_depth is recalculated when max_samples is reset to n_samples."""
    X = iris.data
    clf = IsolationForest().fit(X)
    for est in clf.estimators_:
        # max_depth defaults to ceil(log2(max_samples)); here max_samples == n_samples
        assert_equal(est.max_depth, int(np.ceil(np.log2(X.shape[0]))))
def test_max_samples_attribute():
    X = iris.data
    clf = IsolationForest().fit(X)
    assert_equal(clf.max_samples_, X.shape[0])

    # max_samples larger than n_samples is clipped to n_samples with a warning
    clf = IsolationForest(max_samples=500)
    assert_warns_message(UserWarning,
                         "max_samples will be set to n_samples for estimation",
                         clf.fit, X)
    assert_equal(clf.max_samples_, X.shape[0])

    # a float max_samples is interpreted as a fraction of n_samples
    clf = IsolationForest(max_samples=0.4).fit(X)
    assert_equal(clf.max_samples_, 0.4 * X.shape[0])
def test_iforest_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Test IsolationForest (rng is the module-level check_random_state(0))
    clf = IsolationForest(random_state=rng)
    clf.fit(X)
    # use the continuous anomaly score; predict() only returns +1/-1 labels,
    # for which the min/max comparison below would not hold
    pred = -clf.decision_function(X)

    # assert detect outliers:
    assert_greater(np.min(pred[-2:]), np.max(pred[:-2]))
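# For context: IsolationForest.predict() returns +1 for inliers and -1 for
# outliers, while decision_function() returns a continuous score where lower
# means more abnormal. A minimal self-contained sketch of the same check;
# the function name and random_state below are illustrative, not taken from
# the original test suite:
def _iforest_score_sketch():
    import numpy as np
    from sklearn.ensemble import IsolationForest
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]
    clf = IsolationForest(random_state=0).fit(X)
    scores = -clf.decision_function(X)  # higher = more abnormal
    # every outlier (the last two rows) should score above every inlier
    assert np.min(scores[-2:]) > np.max(scores[:-2])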
def isolationForest(self, settings, mname, data):
    '''
    :param settings: -> settings dictionary
    :param mname: -> name of serialized cluster
    :param data: -> dataframe with data
    :return: -> isolation forest instance
    :example settings: -> {n_estimators:100, max_samples:100, contamination:0.1, bootstrap:False,
                           max_features:1.0, n_jobs:1, random_state:None, verbose:0}
    '''
    # normalise string-typed settings coming from the configuration
    if settings['random_state'] == 'None':
        settings['random_state'] = None
    if isinstance(settings['bootstrap'], str):
        settings['bootstrap'] = str2Bool(settings['bootstrap'])
    if isinstance(settings['verbose'], str):
        settings['verbose'] = str2Bool(settings['verbose'])
    if settings['max_samples'] != 'auto':
        settings['max_samples'] = int(settings['max_samples'])

    for k, v in settings.items():
        logger.info('[%s] : [INFO] IsolationForest %s set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v)
        print("IsolationForest %s set to %s" % (k, v))
    try:
        clf = IsolationForest(n_estimators=int(settings['n_estimators']),
                              max_samples=settings['max_samples'],
                              contamination=float(settings['contamination']),
                              bootstrap=settings['bootstrap'],
                              max_features=float(settings['max_features']),
                              n_jobs=int(settings['n_jobs']),
                              random_state=settings['random_state'],
                              verbose=settings['verbose'])
    except Exception as inst:
        logger.error('[%s] : [ERROR] Cannot instantiate isolation forest with %s and %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
        print("Error while instantiating isolation forest with %s and %s" % (type(inst), inst.args))
        sys.exit(1)
    try:
        clf.fit(data)
    except Exception as inst:
        logger.error('[%s] : [ERROR] Cannot fit isolation forest model with %s and %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
        sys.exit(1)
    predict = clf.predict(data)
    print("Anomaly Array:")
    print(predict)
    self.__serializemodel(clf, 'isoforest', mname)
    return clf
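# Hypothetical usage of the method above; the owning class and the concrete
# settings values are illustrative, not taken from the original module:
#   engine = SomeDetectionEngine()
#   settings = {'n_estimators': '100', 'max_samples': 'auto',
#               'contamination': '0.1', 'bootstrap': 'False',
#               'max_features': '1.0', 'n_jobs': '1',
#               'random_state': 'None', 'verbose': '0'}
#   clf = engine.isolationForest(settings, 'isoModel1', df)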
def detect(self, method, model, data):
    '''
    :param method: -> method name
    :param model: -> trained clusterer
    :param data: -> dataframe with data
    :return: -> dictionary that contains the list of anomalous timestamps
    '''
    smodel = self.__loadClusterModel(method, model)
    anomalieslist = []
    if not smodel:
        dpredict = 0
    else:
        if data.shape[0]:
            if isinstance(smodel, IsolationForest):
                print("Detected IsolationForest model")
                print("Contamination -> %s" % smodel.contamination)
                print("Max_Features -> %s" % smodel.max_features)
                print("Max_Samples -> %s" % smodel.max_samples_)
                print("Threshold -> %s" % smodel.threshold_)
                try:
                    dpredict = smodel.predict(data)
                    print("IsolationForest Prediction Array -> %s" % str(dpredict))
                except Exception as inst:
                    logger.error('[%s] : [ERROR] Error while predicting with isolation forest model, with %s and %s',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
                    dpredict = 0
            elif isinstance(smodel, DBSCAN):
                print("Detected DBSCAN model")
                print("Leaf_size -> %s" % smodel.leaf_size)
                print("Algorithm -> %s" % smodel.algorithm)
                print("EPS -> %s" % smodel.eps)
                print("Min_Samples -> %s" % smodel.min_samples)
                print("N_jobs -> %s" % smodel.n_jobs)
                try:
                    dpredict = smodel.fit_predict(data)
                except Exception as inst:
                    logger.error('[%s] : [ERROR] Error while fitting DBSCAN model, with %s and %s',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                                 inst.args)
                    dpredict = 0
        else:
            dpredict = 0
            logger.warning('[%s] : [WARN] Dataframe empty with shape (%s,%s)',
                           datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(data.shape[0]),
                           str(data.shape[1]))
            print("Empty dataframe received with shape (%s,%s)" % (str(data.shape[0]),
                                                                   str(data.shape[1])))
    print("dpredict type is %s" % type(dpredict))
    if not isinstance(dpredict, int):
        # -1 marks an anomalous sample; map each one back to its timestamp
        anomalyarray = np.argwhere(dpredict == -1)
        for an in anomalyarray:
            anomalies = {}
            anomalies['utc'] = int(data.iloc[an[0]]['key'])
            anomalies['hutc'] = ut2hum(int(data.iloc[an[0]]['key']))
            anomalieslist.append(anomalies)
    anomaliesDict = {}
    anomaliesDict['anomalies'] = anomalieslist
    logger.info('[%s] : [INFO] Detected anomalies with model %s using method %s are -> %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), model, method, str(anomaliesDict))
    return anomaliesDict
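# Shape of the dictionary returned by detect(); the timestamp values below
# are made up for illustration:
#   {'anomalies': [{'utc': 1464773580, 'hutc': '2016-06-01 10:13:00'}, ...]}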
def online_detect():
    df = pd.read_csv('ganglia.csv')  # load the historical ganglia metrics
    maxContainSize = 500
    window = df[1000:]
    ilf = IsolationForest(n_estimators=100, verbose=2)
    ilf.fit(window)
    print(ilf.predict(window))
    anomalyNum = 0
    allAnomaly = 0
    outcome = []
    label = []
    k = 3  # number of recent samples considered by warn()/analyseWarn()
    d = {}
    buf = []
    idlist, namelist = loadname()
    savename(namelist, idlist)
    print("initial finished")
    counter = 1
    while True:
        print("fetching at %s" % ctime())
        data = getdata()
        loadvalue(data, d)
        outvalue = extract(d, idlist)
        reshapevalue = np.array(outvalue).reshape(1, -1)
        predictValue = ilf.predict(reshapevalue)
        print("predict:", predictValue)
        a = int(predictValue)
        outcome.append(a)
        label.append(a)
        buf.append(DataFrame(reshapevalue))  # buffer the sample as a 1-row DataFrame
        if a == -1:
            anomalyNum += 1
            allAnomaly += 1
        # check whether the last k predictions amount to a warning
        if warn(buf, label, k):
            label[-1] = 1  # mark the latest sample as handled once the warning is raised
            analyseWarn(buf, outcome, k, namelist)  # report which metrics caused the warning
            updateWindow(window, buf, maxContainSize)
        if detectUpdate(buf, 0.87, maxContainSize, anomalyNum):  # 0.087
            del ilf
            window, ilf = updateWindow(window, buf, maxContainSize)
            anomalyNum = 0
            del buf
            buf = []
        counter += 1
        if counter % 5000 == 0:
            break
        sleep(15)
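# online_detect() relies on helpers defined elsewhere in the repository
# (getdata, loadvalue, extract, warn, analyseWarn, updateWindow,
# detectUpdate). A minimal sketch of what the sliding-window retraining step
# could look like, assuming a pandas DataFrame window and a buffer of 1-row
# DataFrames; this is an assumption, not the original implementation:
def updateWindow_sketch(window, buf, maxContainSize):
    import pandas as pd
    from sklearn.ensemble import IsolationForest
    # append the buffered samples, keep only the newest maxContainSize rows
    window = pd.concat([window] + buf).tail(maxContainSize)
    # retrain the forest on the refreshed window
    ilf = IsolationForest(n_estimators=100)
    ilf.fit(window)
    return window, ilf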
def init(l_sys, l_namenode, l_FS, l_RPC, d, dwhite, winsize=200, sleeptime=15, cont=0.01, limit=300):
    win_sys = []
    win_namenode = []
    win_FS = []
    win_RPC = []
    while True:
        print("fetching at %s" % ctime())
        data = getdata()
        loadvalue(data, d, dwhite)
        o_sys, o_namenode, o_FS, o_RPC = extract(d, l_sys, l_namenode, l_FS, l_RPC)
        # accumulate the freshly fetched samples into the warm-up windows
        win_sys.append(o_sys)
        win_namenode.append(o_namenode)
        win_FS.append(o_FS)
        win_RPC.append(o_RPC)
        if len(win_sys) > winsize:  # stop once the warm-up window is full
            break
        sleep(sleeptime)
    # one isolation forest per metric group
    ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
    ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
    ilf_FS = IsolationForest(n_estimators=100, contamination=cont)
    ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)
    # fetch historical samples for each metric group from InfluxDB
    client = DataFrameClient(host='127.0.0.1', port=8086, username='root', password='root', database='testdb')
    data_sys = sampleWithDecay(client, limit, 'select * from ganglia where w_system >0 ORDER BY time DESC limit 1500')  # decayed sampling of the history
    d_sys = data_sys[l_sys]
    data_fs = sampleWithDecay(client, limit, 'select * from ganglia where w_fs >0 ORDER BY time DESC limit 1500')
    d_FS = data_fs[l_FS]
    data_namenode = sampleWithDecay(client, limit, 'select * from ganglia where w_namenode >0 ORDER BY time DESC limit 1500')
    d_namenode = data_namenode[l_namenode]
    data_rpc = sampleWithDecay(client, limit, 'select * from ganglia where w_rpc >0 ORDER BY time DESC limit 1500')
    d_RPC = data_rpc[l_RPC]
    # convert the warm-up windows to DataFrames and append them to the history
    append_sys = pd.DataFrame(win_sys, columns=l_sys)
    append_namenode = pd.DataFrame(win_namenode, columns=l_namenode)
    append_FS = pd.DataFrame(win_FS, columns=l_FS)
    append_RPC = pd.DataFrame(win_RPC, columns=l_RPC)
    out_sys = pd.concat([d_sys, append_sys])
    out_namenode = pd.concat([d_namenode, append_namenode])
    out_FS = pd.concat([d_FS, append_FS])
    out_RPC = pd.concat([d_RPC, append_RPC])
    # fit each forest on its combined history plus warm-up window
    ilf_sys.fit(out_sys)
    ilf_namenode.fit(out_namenode)
    ilf_FS.fit(out_FS)
    ilf_RPC.fit(out_RPC)
    print(ilf_sys.predict(win_sys))
    print(ilf_namenode.predict(win_namenode))
    print(ilf_FS.predict(win_FS))
    print(ilf_RPC.predict(win_RPC))
    return ilf_sys, ilf_namenode, ilf_FS, ilf_RPC
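# Hypothetical call; the metric-name lists and the two dictionaries come
# from helpers elsewhere in the module (loadname()-style functions) and are
# only illustrative here:
#   ilf_sys, ilf_namenode, ilf_FS, ilf_RPC = init(
#       l_sys, l_namenode, l_FS, l_RPC, d={}, dwhite={},
#       winsize=200, sleeptime=15, cont=0.01, limit=300)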