def pre_process(self, drop=True, title_to_onehot=True, norm_fare=True):
    """Load the CSV at ``self.file_name`` and derive numeric feature columns.

    Processing steps, in order:

    1. Read the CSV and replace the cell values ``'male'``/``'female'``
       with ``0``/``1`` wherever they occur in the frame.
    2. Fill missing ``Age`` and ``Fare`` with the respective column median.
    3. Add ``FamilySize = SibSp + Parch + 1``.
    4. Encode ``Embarked`` as ``S -> 0``, ``C -> 1``, ``Q -> 2``; missing
       values become ``0``.
    5. Optionally normalise ``Fare`` per ``Embarked`` group.
    6. Extract a title from ``Name`` and store its integer code in ``Title``.
    7. Optionally drop the raw columns and one-hot encode ``Title``.

    The title frequencies and (when ``drop`` is true) the remaining column
    names are printed to stdout.

    Args:
        drop: When true, remove the ``Name``, ``SibSp``, ``Parch``,
            ``Ticket``, ``Cabin`` and ``Embarked`` columns.
        title_to_onehot: When true, call
            ``self.encode(data, 'Title', [1, ..., 10])`` and then drop the
            ``Title`` column. ``encode`` is defined outside this block; its
            return value is ignored, so it is presumably expected to add
            its columns to ``data`` in place -- confirm against its
            definition.
        norm_fare: When true, divide every ``Fare`` by the mean ``Fare`` of
            the rows sharing the same ``Embarked`` code and sort the rows
            by ``PassengerId``.

    Returns:
        The processed ``pandas.DataFrame``.
    """
    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    title_pattern = re.compile(r' ([A-Za-z]+)\.')

    # Rare titles deliberately share a code with a more common title.
    title_mapping = {
        "Dona": 1, "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5,
        "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9,
        "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7,
        "Ms": 2,
    }

    def get_title(name):
        """Return the letters between a space and the first following '.'.

        Returns an empty string when *name* is not a string (e.g. a missing
        value read as NaN) or contains no such pattern.
        """
        if not isinstance(name, str):
            return ""
        match = title_pattern.search(name)
        return match.group(1) if match else ""

    def normalize_fare(frame):
        """Scale ``Fare`` by the mean fare of each ``Embarked`` group.

        Only rows whose ``Embarked`` code is 0, 1 or 2 are kept. The result
        is a new frame sorted by ``PassengerId``; *frame* is not modified.
        """
        # Copy first so the assignment below never writes into a view.
        kept = frame[frame['Embarked'].isin((0, 1, 2))].copy()
        group_mean = kept.groupby('Embarked')['Fare'].transform('mean')
        kept['Fare'] = kept['Fare'] / group_mean
        # DataFrame.sort() no longer exists; sort_values() is its successor.
        return kept.sort_values('PassengerId')

    data = pd.read_csv(self.file_name).replace('male', 0).replace('female', 1)

    # Assign the filled column back instead of ``fillna(inplace=True)`` on a
    # column selection, which does not reliably update the parent frame.
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['Embarked'] = (
        data['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2}).fillna(0)
    )

    if norm_fare:
        data = normalize_fare(data)

    # Get all the titles and print how often each one occurs.
    titles = data["Name"].apply(get_title)
    print(titles.value_counts())

    # Map each title to its integer code. Titles absent from the table
    # (including the empty string for "no title found") fall back to code 1
    # so that the column is purely numeric.
    data['Title'] = titles.map(title_mapping).fillna(1).astype(int)

    if drop:
        data = data.drop(
            ['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1
        )
        print(data.keys())

    if title_to_onehot:
        self.encode(data, 'Title', list(range(1, 11)))
        data = data.drop(['Title'], axis=1)

    return data
评论列表
文章目录