def main():
    """Convert the exported Slack JSON data into two pickled, parallel lists.

    Reads ``users.json``, ``channels.json`` and ``privateChannels.json`` from
    ``../slack-data/``, runs the messages through the preprocessing helpers
    (flatten, discard, stem, balance) and writes:

    * ``messages.pkl`` -- the list of processed messages
    * ``authors.pkl``  -- for each message, the value returned by
      ``user_index_by_id`` for its author (same order as ``messages.pkl``)

    Both output files are created in the current working directory.
    """
    # load files
    # TODO: json loading is different every time, use object_pairs_hook?
    # https://docs.python.org/3/library/json.html#json.load
    with open('../slack-data/users.json', 'r', encoding='utf-8') as users_json:
        users = json.load(users_json)
    with open('../slack-data/channels.json', 'r', encoding='utf-8') as channels_json:
        channels = json.load(channels_json)
    with open('../slack-data/privateChannels.json', 'r', encoding='utf-8') as private_channels_json:
        private_channels = json.load(private_channels_json)

    # merge channels with private channels
    channels = channels + private_channels
    # merge from "per-channel" to "per-user" messages collection
    users_messages = flatten_messages(channels)
    # remove users with not enough messages as over-sampling their messages
    # can lead to overfitting
    users_messages = discard_insufficient_data_users(users_messages, users)
    # stem words in messages
    users_messages = stem_messages(users_messages)
    # make all remaining users have an equal number of messages
    users_messages = balance_messages(users_messages)

    # Build two parallel lists: messages_output[i] was written by the user
    # whose index is authors_output[i].
    messages_output = []
    authors_output = []
    for user_id, messages in users_messages.items():
        for message in messages:
            # NOTE(review): this lookup depends only on user_id; if
            # user_index_by_id is a pure lookup it could be hoisted out of
            # the inner loop -- confirm against its definition.
            authors_output.append(user_index_by_id(user_id, users))
            messages_output.append(message)

    # Use context managers so the output files are flushed and closed
    # deterministically instead of relying on garbage collection.
    with open('messages.pkl', 'wb') as messages_file:
        pickle.dump(messages_output, messages_file)
    with open('authors.pkl', 'wb') as authors_file:
        pickle.dump(authors_output, authors_file)

    print('Saved a total of ' + str(len(messages_output)) + ' processed messages')
json-to-pkl.py 文件源码
python
阅读 29
收藏 0
点赞 0
评论 0
评论列表
文章目录