def master_clean(df, column, html, email, punc, non_ascii, stopwords, number,
                 remove_nonenglish, stemorlem):
    """Run the selected text-cleaning steps over one column of ``df``.

    Each enabled step replaces ``df[column]`` with the result of applying the
    matching helper element-wise (``Series.apply``). Steps run in a fixed
    order: punctuation, HTML, e-mail, non-ASCII, stop words, numbers,
    non-English, then stemming or lemmatizing.

    Args:
        df: Frame holding the text; it is modified in place.
        column: Label of the column to clean.
        html: If truthy, apply ``remove_html``.
        email: If truthy, apply ``remove_email``.
        punc: If truthy, apply ``remove_punc``.
        non_ascii: If truthy, apply ``remove_non_ascii``.
        stopwords: If truthy, apply ``remove_stop``.
        number: If truthy, apply ``remove_numbers``.
        remove_nonenglish: If truthy, apply ``nonenglish``.
        stemorlem: ``'stem'`` applies ``stemmer``, ``'lem'`` applies
            ``lemmatizer``; any other value skips both.

    Returns:
        The same ``df`` object that was passed in.
    """

    def _apply(func):
        # Series.apply already yields a Series that can be assigned straight
        # back to the column, so no intermediate frame is needed.
        df[column] = df[column].apply(func)

    # NOTE(review): punctuation is stripped before HTML/e-mail removal; if
    # remove_html / remove_email rely on characters such as '<', '>' or '@',
    # they may no longer match -- confirm against the helpers before reordering.
    if punc:
        _apply(remove_punc)
    if html:
        _apply(remove_html)
    if email:
        _apply(remove_email)
    if non_ascii:
        _apply(remove_non_ascii)
    if stopwords:
        _apply(remove_stop)
    if number:
        _apply(remove_numbers)
    # Test the caller's flag, not the `nonenglish` helper itself: the helper
    # name is either always truthy or undefined, so the flag was being ignored.
    if remove_nonenglish:
        _apply(nonenglish)

    if stemorlem == 'stem':
        _apply(stemmer)
    elif stemorlem == 'lem':
        _apply(lemmatizer)
    return df
# Comment list
# Table of contents