from nltk.tokenize import TweetTokenizer


def parsedata(lines, word_list, split_word_list, emoji_dict, normalize_text=False, split_hashtag=False,
              ignore_profiles=False, lowercase=False, replace_emoji=True):
    """Parse tab-separated tweet records into (label, text, dimensions, context, author) tuples."""
    data = []
    tokenizer = TweetTokenizer()  # build once and reuse across lines
    for i, line in enumerate(lines):
        # lightweight progress indicator
        if i % 100 == 0:
            print(str(i) + '...', end='', flush=True)
        try:
            # convert the line to lowercase
            if lowercase:
                line = line.lower()

            # split the record into tab-separated fields
            token = line.split('\t')

            # label
            label = int(token[1].strip())

            # tweet text
            target_text = tokenizer.tokenize(token[2].strip())
            # filter text
            target_text = filter_text(target_text, word_list, split_word_list, emoji_dict, normalize_text,
                                      split_hashtag, ignore_profiles, replace_emoji=replace_emoji)

            # awc dimensions
            dimensions = []
            if len(token) > 3 and token[3].strip() != 'NA':
                dimensions = [dimension.split('@@')[1] for dimension in token[3].strip().split('|')]
            # context tweet, filtered with the same settings as the target text
            context = []
            if len(token) > 4:
                if token[4] != 'NA':
                    context = tokenizer.tokenize(token[4].strip())
                    context = filter_text(context, word_list, split_word_list, emoji_dict, normalize_text,
                                          split_hashtag, ignore_profiles, replace_emoji=replace_emoji)
            # author
            author = 'NA'
            if len(token) > 5:
                author = token[5]

            # keep only records whose filtered text is non-empty
            if len(target_text) != 0:
                data.append((label, target_text, dimensions, context, author))
        except:
            raise
    print('')
    return data
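
For context, here is a minimal usage sketch. It assumes `filter_text` and the three lookup structures (`word_list`, `split_word_list`, `emoji_dict`) are available from the surrounding module; the sample records and placeholder arguments are purely illustrative and follow the tab-separated layout the parser reads (id, label, tweet text, optional dimensions, context tweet, author):

# Hypothetical records in the expected layout:
# id \t label \t tweet text \t dimensions ('NA' or 'tag@@value|...') \t context ('NA' or text) \t author
sample_lines = [
    '1\t1\tGreat, another Monday #sarcasm\tNA\tNA\tuser123',
    '2\t0\tHad a lovely weekend hiking\tNA\tNA\tuser456',
]

# Empty placeholders stand in for the real word lists and emoji dictionary.
parsed = parsedata(sample_lines, word_list={}, split_word_list={}, emoji_dict={},
                   normalize_text=True, split_hashtag=True)

for label, text, dimensions, context, author in parsed:
    print(label, text, author)

Note that `lowercase=True` also lowercases the `NA` sentinels, so the `!= 'NA'` guards no longer match and an `NA` dimensions field would raise an IndexError in the list comprehension; the sketch leaves `lowercase` at its default for that reason.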