def tweets(word_len, sent_len, train_valid_ratio=(5, 1)):
    """Load the tweets CSV and encode its text and labels for training.

    Reads ``tweets_large.csv`` from the current working directory, encodes
    the ``text`` column through ``CharNumberEncoder`` and the ``label``
    column through ``CatNumberEncoder`` followed by ``make_one_hot``.

    Args:
        word_len: Forwarded to ``CharNumberEncoder`` as ``word_len``
            (presumably the number of characters kept per word -- confirm
            against the encoder).
        sent_len: Forwarded to ``CharNumberEncoder`` as ``sent_len``
            (presumably the number of words kept per text -- confirm
            against the encoder).
        train_valid_ratio: Not used by this function; kept so the
            signature stays compatible with existing callers. The default
            is an immutable tuple rather than a list so it cannot be
            shared and mutated across calls.

    Returns:
        A tuple ``(encode_X, encode_y, nclass)`` where ``encode_X`` is the
        result of ``CharNumberEncoder.make_char_embed()``, ``encode_y`` is
        the result of ``make_one_hot`` applied to the encoded labels, and
        ``nclass`` is the number of distinct encoded label values.
    """
    df = pandas.read_csv('tweets_large.csv')
    field = 'text'
    label = 'label'
    # Tokens are maximal runs of word characters.
    tokenizer = RegexpTokenizer(r'\w+')

    # Encode characters into numbers.
    encoder = CharNumberEncoder(
        df[field].values,
        tokenizer=tokenizer,
        word_len=word_len,
        sent_len=sent_len,
    )
    encoder.build_char_map()
    encode_X = encoder.make_char_embed()

    # Encode categories into a one-hot array.
    cat_encoder = CatNumberEncoder(df[label])
    cat_encoder.build_cat_map()
    encode_y = cat_encoder.make_cat_embed()
    # The class count is taken from the distinct encoded label values
    # before they are expanded to one-hot form.
    nclass = len(np.unique(encode_y))
    encode_y = make_one_hot(encode_y, nclass)

    return encode_X, encode_y, nclass
# Comment list
# Table of contents