def preproc(self):
    """Add engineered feature columns to ``self.df`` in place.

    Reads the columns ``minutes_remaining``, ``seconds_remaining``,
    ``period``, ``matchup``, ``loc_x``, ``loc_y``, ``game_date``,
    ``action_type``, ``combined_shot_type``, ``opponent``, ``game_id``,
    ``season`` and ``shot_distance``.

    Adds, in this order: ``time_remaining``, ``last_5_sec``,
    ``latter_half``, ``first_period``, ``latter_period``, ``last_period``,
    ``last_quarter``, ``last_moment``, ``away``, ``secondsFromStart``,
    ``secondsFromGameStart``, ``shotLocationCluster``, ``homeGame``,
    ``game_year``, ``game_month``, ``game_day`` and one ``<name>_num``
    integer code column for each of ``action_type``,
    ``combined_shot_type``, ``opponent``, ``game_id`` and ``season``.
    Finally ``shot_distance`` is capped at 45.

    The ``*_num`` codes are assigned in sorted order of the distinct
    values, so they are reproducible from run to run.

    Relies on the module-level names ``pd`` (pandas) and ``mixture``
    (``sklearn.mixture``).

    Returns:
        None. ``self.df`` is modified in place.
    """
    df = self.df

    # --- Clock-based flags -------------------------------------------------
    df["time_remaining"] = df["minutes_remaining"] * 60 + df["seconds_remaining"]
    remaining = df["time_remaining"]
    df["last_5_sec"] = remaining < 5
    df["latter_half"] = remaining < 360
    df["first_period"] = df["period"] == 1
    df["latter_period"] = df["period"] > 2
    df["last_period"] = df["period"] == 4
    df["last_quarter"] = remaining < 180

    # NOTE(review): the special-cased value 14 is kept from the original
    # code; its rationale is not visible in this file.
    threshold = 3
    anomaly = 14
    # Vectorised equivalent of a row-wise `apply`.
    df["last_moment"] = (remaining < threshold) | (remaining == anomaly)

    df["away"] = df["matchup"].str.contains("@")

    # --- Elapsed-time features ---------------------------------------------
    # Elapsed seconds measured against a 12 * 60 second clock (720 minus
    # time_remaining), whatever the period number is.
    df["secondsFromStart"] = (
        60 * (11 - df["minutes_remaining"]) + (60 - df["seconds_remaining"])
    )
    period = df["period"]
    # Periods 1-4 contribute 12 * 60 seconds each; later periods contribute
    # 5 * 60 each. The base for later periods is 3 * 12 * 60 (not 4 * 12 * 60)
    # because `secondsFromStart` above is taken against a 720 second clock,
    # which is 420 seconds more than a 300 second clock would give.
    regulation_offset = (period <= 4).astype(int) * (period - 1) * 12 * 60
    extra_offset = (period > 4).astype(int) * ((period - 4) * 5 * 60 + 3 * 12 * 60)
    df["secondsFromGameStart"] = (
        regulation_offset + extra_offset + df["secondsFromStart"]
    )

    # --- Shot-location clusters --------------------------------------------
    numGaussians = 13
    # `.ix` was removed from pandas; `.loc` is the label-based equivalent.
    shot_xy = df.loc[:, ["loc_x", "loc_y"]]
    if hasattr(mixture, "GaussianMixture"):
        # `mixture.GMM` was removed from scikit-learn; GaussianMixture is
        # its replacement (no `params` argument, k-means based init).
        gaussianMixtureModel = mixture.GaussianMixture(
            n_components=numGaussians,
            covariance_type="full",
            init_params="kmeans",
            random_state=1,
            n_init=3,
            verbose=0,
        )
    else:
        # Older scikit-learn: keep the original estimator and arguments.
        gaussianMixtureModel = mixture.GMM(
            n_components=numGaussians,
            covariance_type="full",
            params="wmc",
            init_params="wmc",
            random_state=1,
            n_init=3,
            verbose=0,
        )
    gaussianMixtureModel.fit(shot_xy)
    df["shotLocationCluster"] = gaussianMixtureModel.predict(shot_xy)

    # 1 when the matchup string has no '@', else 0.
    df["homeGame"] = (df["matchup"].str.find("@") < 0).astype(int)

    # --- Date parts --------------------------------------------------------
    # Sliced straight from the string, so this works for any DataFrame
    # index (no positional label lookup, no index alignment).
    game_date = df["game_date"]
    df["game_year"] = game_date.str[:4].astype(int)
    df["game_month"] = game_date.str[5:7].astype(int)
    df["game_day"] = game_date.str[-2:].astype(int)

    # --- Integer codes for categorical columns -----------------------------
    # factorize(sort=True) numbers the distinct values in sorted order in a
    # single pass; the result is a plain array, so it is assigned
    # positionally regardless of the DataFrame index.
    for column in (
        "action_type",
        "combined_shot_type",
        "opponent",
        "game_id",
        "season",
    ):
        df[column + "_num"] = pd.factorize(df[column], sort=True)[0]

    # Cap long shots at 45 without chained assignment, which may write to
    # a temporary copy instead of the frame.
    df["shot_distance"] = df["shot_distance"].clip(upper=45)

    # The original left these two steps disabled; they stay disabled so the
    # method keeps every column and still returns None.
    # del self.df["team_id"], self.df["team_name"], self.df["game_event_id"], self.df["lat"], self.df["lon"]
    # return self.df
# Comment list
# Table of contents