def sample_split(dbname,num_train,num_test):
    """Sample sentiment sentences from MongoDB and pickle a train/test split.

    Runs a ``$sample`` aggregation for ``num_train + num_test`` documents on
    the ``sentiment_sentences`` collection of database ``dbname`` (using a
    default-constructed ``MongoClient``), prints the relative frequency of
    every aspect and every sentiment value seen in the sample, shuffles the
    samples, and writes two pickle files into the current working directory:

    * ``sentidata_train_raw.pkl`` -- the first ``num_train`` shuffled samples
    * ``sentidata_test_raw.pkl``  -- all remaining samples

    Each sample is the tuple ``(sent.words, sent.sentiment)`` taken from
    ``Sentence.from_dict(document)``.

    Args:
        dbname: name of the MongoDB database to read from.
        num_train: number of samples written to the training file.
        num_test: number of additional samples requested for the test file.

    Returns:
        None. Results are printed and written to disk.

    Note:
        If the cursor yields fewer than ``num_train + num_test`` documents,
        the test file receives whatever is left after the first
        ``num_train`` samples (possibly nothing).
    """
    client = MongoClient()
    try:
        sentisent_collection = client[dbname].sentiment_sentences

        # ---- load and count
        aspect_dist = nltk.FreqDist()
        sentiment_dist = nltk.FreqDist()
        all_samples = []

        cursor = sentisent_collection.aggregate(
            [{'$sample': {'size': num_train + num_test}}])
        for d in cursor:
            sent = Sentence.from_dict(d)
            all_samples.append((sent.words, sent.sentiment))
            aspect_dist[sent.aspect] += 1
            sentiment_dist[int(sent.sentiment)] += 1
    finally:
        # Release the connection even if reading or parsing a document fails.
        client.close()

    # ---- show statistics
    # Single-argument parenthesised print emits the same text on Python 2
    # and Python 3.
    for k in aspect_dist:
        print('[{}]: {}'.format(k, aspect_dist.freq(k)))
    for k in sentiment_dist:
        print('[{}]: {}'.format(k, sentiment_dist.freq(k)))

    # ---- shuffle so the split below does not depend on cursor order
    random.shuffle(all_samples)

    # ---- split
    def _dump(filename, data):
        # Binary mode is required for pickle output.
        with open(filename, "wb") as outf:
            cPickle.dump(data, outf)

    _dump("sentidata_train_raw.pkl", all_samples[:num_train])
    _dump("sentidata_test_raw.pkl", all_samples[num_train:])
extract_samples_for_sentiments.py 文件源码
python
阅读 22
收藏 0
点赞 0
评论 0
评论列表
文章目录