def read_yelp(file_name='yelp_academic_dataset_review.json'):
    """Load reviews from a one-record-per-line file into a DataFrame.

    Each non-blank line of ``file_name`` is parsed as one record that must
    provide the keys ``'stars'`` and ``'text'``. The text of every record
    is split into sentences with ``sent_tokenize``.

    Relies on ``pd`` and ``sent_tokenize`` being available at module level
    (presumably pandas and NLTK's ``sent_tokenize`` -- confirm against the
    file's imports).

    Args:
        file_name: Path of the review file to read.

    Returns:
        A DataFrame with the columns ``stars``, ``text``, ``length``
        (number of sentences in ``text``) and ``text_split`` (the list of
        sentences), in that order, one row per record.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If a record lacks ``'stars'`` or ``'text'``.
        ValueError, SyntaxError: If a line is neither valid JSON nor a
            Python literal.
    """
    import ast
    import json

    records = []
    # The context manager guarantees the handle is closed; iterating the
    # file avoids holding every raw line in memory next to the parsed data.
    with open(file_name, encoding='utf-8') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # SECURITY: this used to be eval(line), which executes arbitrary
            # code embedded in the data file and also rejects the JSON
            # literals true/false/null. Parse as data instead.
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                # Backward compatibility: lines written as Python literals
                # (e.g. single-quoted dicts) were accepted by eval().
                records.append(ast.literal_eval(line))

    stars = [record['stars'] for record in records]
    texts = [record['text'] for record in records]

    df = pd.DataFrame()
    df['stars'] = stars
    df['text'] = texts

    # Compute the number of sentences in each document.
    sentences = [sent_tokenize(text) for text in texts]
    sentence_counts = [len(parts) for parts in sentences]

    # Corpus statistics recorded by the original author:
    #   2225188 reviews in total
    #   2089287 with length <= 20
    #   1654640 with length <= 10
    # The original note says only length <= 7 is considered; that filter is
    # NOT applied here -- every record is returned.
    df['length'] = sentence_counts
    df['text_split'] = sentences
    return df
# Web-page residue from the original post (not part of the program):
# "Comment list" / "Table of contents"