data_helpers.py 文件源码-python代码片段

data_helpers.py 文件源码

python

阅读 30 收藏 0 点赞 0 评论 0

项目：Relation_Extraction 作者: wadhwasahil 项目源码文件源码

def read_data(file=file_path):
    """Load the annotated CSV and, if needed, cache its word tokens on disk.

    The CSV is read with pandas, keeping only the columns listed in
    ``col_names``.  When ``words.txt`` does not yet exist in the current
    working directory, every distinct ``Message`` is tokenized with
    ``TweetTokenizer``; tokens accepted by ``is_word`` are lower-cased and
    the resulting list is written to ``words.txt`` as one printed Python
    list (its ``repr`` followed by a newline).

    Args:
        file: Source handed to ``pandas.read_csv``.  Defaults to the
            module-level ``file_path``.

    Returns:
        The ``pandas.DataFrame`` restricted to the columns in ``col_names``.
    """
    col_names = ['System-Id', 'Message', 'drug-offset-start', 'drug-offset-end', 'sideEffect-offset-start',
                 'sideEffect-offset-end', 'WM1', 'WM2', 'relType']
    data_frame = pd.read_csv(file, skipinitialspace=True, usecols=col_names)

    # The token list is only ever used to create words.txt, so skip the
    # whole tokenization pass when that file is already present.
    if os.path.isfile("words.txt"):
        return data_frame

    # Empty 'Message' cells are loaded by pandas as NaN (a float); drop them
    # so only real text reaches the tokenizer.
    messages = data_frame['Message'].dropna().drop_duplicates()
    tokenizer = TweetTokenizer()
    words = []
    for message in messages:
        words.extend(
            token.lower() for token in tokenizer.tokenize(message) if is_word(token)
        )

    # Explicit UTF-8 so non-ASCII tokens do not depend on the platform's
    # default encoding when the list is written out.
    with open("words.txt", "w", encoding="utf-8") as text_file:
        print(words, file=text_file)
    return data_frame


# TODO use space splitter and then strip the word
# TODO change regex to [a-z0-9].+