def read_data(file=file_path):
    """Load the annotated CSV and, on first run, dump its vocabulary to words.txt.

    Args:
        file: Source passed straight to ``pd.read_csv``. Defaults to the
            module-level ``file_path``.

    Returns:
        The ``DataFrame`` read from ``file``, restricted to the columns
        listed in ``col_names``.

    Side effects:
        If ``words.txt`` does not exist in the current working directory, it
        is created and receives the ``print``-ed list of every lower-cased
        token from the de-duplicated ``Message`` column that ``is_word``
        accepts. An existing ``words.txt`` is never overwritten.
    """
    col_names = [
        'System-Id', 'Message', 'drug-offset-start', 'drug-offset-end',
        'sideEffect-offset-start', 'sideEffect-offset-end', 'WM1', 'WM2',
        'relType',
    ]
    data_frame = pd.read_csv(file, skipinitialspace=True, usecols=col_names)

    # The token list exists only to seed words.txt, so skip the whole
    # tokenization pass when that file is already present.
    if not os.path.isfile("words.txt"):
        # dropna(): an empty Message cell is read as NaN (a float), which the
        # tokenizer cannot process.
        messages = data_frame['Message'].dropna().drop_duplicates()
        tokenizer = TweetTokenizer()
        words = []
        for mssg in messages:
            words.extend(
                token.lower()
                for token in tokenizer.tokenize(mssg)
                if is_word(token)
            )
        # Explicit UTF-8 so non-ASCII tokens do not fail on platforms whose
        # default locale encoding cannot represent them.
        with open("words.txt", "w", encoding="utf-8") as text_file:
            print(words, file=text_file)

    return data_frame
# TODO use space splitter and then strip the word
# TODO change regex to [a-z0-9].+
# (Web-page residue, not part of the program: "Comment list" / "Table of contents".)