def loadDataSet(filename):
    """Load the second worksheet of an Excel file as a feature/target matrix pair.

    The sheet is read without a header row (columns are addressed by their
    integer position) and the first row of the sheet is skipped.  Rows that
    contain any missing value are discarded, column 2 is rescaled and used
    as the regression target, and a fixed set of columns is passed through
    helper functions defined elsewhere in this module before being returned.

    NOTE(review): the original per-column comments were lost to an encoding
    error, so the business meaning of each column is not recoverable from
    this function -- confirm against the spreadsheet.

    As a side effect the function prints the value counts of column 15
    (before and after mapping) and draws a seaborn regression pair plot of
    every selected feature against the target.

    Args:
        filename: Path (or any object accepted by ``pandas.read_excel``)
            of the workbook to load.

    Returns:
        A ``(data, target)`` tuple of ``numpy.matrix`` objects: ``data`` has
        one row per retained spreadsheet row and one column per feature
        (columns 3, 4, 6, 9, 11, 14, 15, 16, 17, in that order); ``target``
        is the rescaled column 2 as a single-column matrix.
    """
    import seaborn as sns

    target_col = 2
    feature_cols = [3, 4, 6, 9, 11, 14, 15, 16, 17]
    target_scale = 100000000

    # An integer sheet_name is a zero-based position, so 1 selects the
    # second worksheet.
    df = pd.read_excel(filename, sheet_name=1, header=None, skiprows=1)

    # Keep only fully populated rows.  (A previous "at least half the
    # columns present" pass and a trailing fillna(0) were both subsumed by
    # this stricter filter, so they have been dropped.)
    df = df.dropna(how='any')

    # Rescale the target, then order the rows by it.
    df[target_col] = df[target_col] / target_scale
    df = df.sort_values(target_col).reset_index(drop=True)

    # `standard` is an external helper -- presumably a normalisation step;
    # confirm against its definition.
    df[3] = standard(df[3])
    df[4] = standard(df[4])
    df[6] = standard(df[6])

    # Columns 9 and 11 are converted element-wise by the external `map_01`.
    df[9] = df[9].apply(map_01)
    df[11] = df[11].apply(map_01)

    # The encoded categorical columns are currently not part of the
    # returned features; the calls are kept in case `transcoding` has side
    # effects or validates its input -- TODO confirm and remove if not.
    transcoding(df[5])
    transcoding(df[8])
    transcoding(df[12])

    df[14] = standard(df[14])

    # Debug output: distribution of column 15 before and after mapping.
    print(df.groupby(15).size())
    df[15] = standard(df[15].apply(map_rate))
    print(df.groupby(15).size())

    df[16] = standard(df[16].apply(map_sub_rate))
    df[17] = standard(df[17])

    target = df[target_col]
    data = df[feature_cols]

    # Visual sanity check: one regression panel per feature vs. the target.
    sns.pairplot(
        df,
        x_vars=[3, 17, 4, 14, 15, 16, 6, 9, 11],
        y_vars=target_col,
        height=5,
        aspect=0.8,
        kind='reg',
    )

    # np.asmatrix is the function np.mat used to alias; the matrix return
    # type is kept so existing callers keep working.
    return np.asmatrix(data), np.asmatrix(target).T
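# 评论列表 (comment list) -- web-page navigation text, not part of the code
# 文章目录 (table of contents) -- web-page navigation text, not part of the code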