def read_scraped_jason(filename):
    """Load scraped posts from a JSON file into a cleaned DataFrame.

    Processing steps, in order:

    1. Read ``filename`` with ``pd.read_json``.
    2. Apply ``unlist`` (defined elsewhere in this module) to every cell of
       every column.
       NOTE(review): presumably this unwraps single-element lists produced
       by the scraper -- confirm against ``unlist``.
    3. Truncate each ``date`` value to its first 10 characters and parse the
       result with ``pd.to_datetime``.
    4. Drop rows whose ``keywords`` value duplicates an earlier row.
    5. Sort by ``date``.
    6. Drop the ``body`` and ``title`` columns.
    7. Turn empty-string ``keywords`` into NaN and drop every row that
       contains any NaN.

    Args:
        filename: Path (or any other input accepted by ``pd.read_json``) of
            the scraped JSON data. It must yield ``date``, ``keywords``,
            ``body`` and ``title`` columns.

    Returns:
        pandas.DataFrame: The cleaned frame, sorted by ``date``, without the
        ``body`` and ``title`` columns.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    df = pd.read_json(filename)
    for column in df.columns:
        df[column] = df[column].apply(unlist)

    # Keep only the first 10 characters of the date (year/month/day) so the
    # trailing part of the scraped value is ignored when parsing.
    df['date'] = pd.to_datetime(df['date'].apply(lambda x: x[:10]))

    # Remove duplicate posts, if any; rows are considered duplicates when
    # their keywords match.
    df = df.drop_duplicates(subset=['keywords'])

    # Sort the dataframe by post date.
    df = df.sort_values(by='date')

    # Use the `columns=` keyword: passing the axis positionally
    # (`df.drop('body', 1)`) is rejected by pandas >= 2.0.
    df = df.drop(columns=['body', 'title'])

    # Assign the result back instead of a chained `inplace=True` replace on
    # the column selection, which may not update `df` under Copy-on-Write.
    df['keywords'] = df['keywords'].replace('', np.nan)

    # With empty keywords now NaN, this removes those rows along with any
    # other row that has a missing value.
    df = df.dropna()
    return df
# Comment list
# Table of contents