simplePyLex.py source code

python

Project: token-rnn-tensorflow    Author: aalmendoza
from pygments import lex
from pygments.token import Token

def get_tokenization(lexedWoComments, lexer):
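    # Builds a whitespace-separated token string and a parallel list of
    # Pygments token-type names from a (comment-free) token stream.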
    tokenized_string = ''
    token_types = []
    curr_line_empty = True
    for t in lexedWoComments:
        token_type = str(t[0])
        token = t[1]
        token_stripped = token.strip()

        # Pygments will sometimes lex many tokens as one
        # This can occur with preprocessor directives and definitions in C
        # In this case, we need to lex that whole line
        num_tokens = len(token.split())
        if num_tokens > 1:
            # Need to manually lex each space-separated token on occasions
            # when Pygments doesn't lex properly
            line_split = token.split()
            line_lexed = []
            for temp_token in line_split:
                token_lexed = list(lex(temp_token, lexer))
                for lexed in token_lexed:
                    if lexed[1] != "\n":
                        line_lexed.append(lexed)
            line_lexed.append((Token.Text, '\n'))
            line_code, line_types = get_tokenization(line_lexed, lexer)
            tokenized_string += line_code
            token_types += line_types
            curr_line_empty = True
            continue

        if '\n' in token:
            if curr_line_empty:
                # The original check used `or`, which is always true; the
                # intent is "neither plain text nor a preprocessor token"
                if (t[0] != Token.Text and t[0] != Token.Comment.Preproc) and token_stripped != '':
                    tokenized_string += token_stripped + "\n"
                    token_types.append(token_type)
            else:
                tokenized_string += token_stripped + "\n"

                # Edge case for stray "\" in code
                if token_stripped == "\\":
                    token_types.append(token_type)
            curr_line_empty = True
        elif t[0] != Token.Text and len(token_stripped) > 0:
            curr_line_empty = False
            tokenized_string += token + ' '
            token_types.append(token_type)

    assert len(tokenized_string.split()) == len(token_types), "{0} != {1}".format(len(tokenized_string.split()), len(token_types))
    return tokenized_string, token_types
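
Below is a minimal usage sketch, not part of the original file: it feeds the function a small C snippet whose #define body Pygments emits as a single multi-word Comment.Preproc token, which exercises the manual re-lexing branch above. The sample source string and variable names are illustrative, and comment stripping is omitted for brevity.

# --- Usage sketch (illustrative) ---
from pygments.lexers import get_lexer_by_name

c_source = '#define MAX 10\nint main() { return MAX; }\n'
c_lexer = get_lexer_by_name('c')
tokens = list(lex(c_source, c_lexer))  # real use would strip comments first
code, types = get_tokenization(tokens, c_lexer)
print(code)   # e.g. '# define MAX 10\nint main ( ) { return MAX ; } \n'
print(types)  # one Pygments token-type string per emitted token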