from pygments import lex
from pygments.token import Token


def get_tokenization(lexedWoComments, lexer):
    tokenized_string = ''
    token_types = []
    curr_line_empty = True
    for t in lexedWoComments:
        token_type = str(t[0])
        token = t[1]
        token_stripped = token.strip()

        # Pygments will sometimes lex many tokens as one.
        # This can occur with preprocessor directives and definitions in C.
        # In that case, we need to re-lex the whole line.
        num_tokens = len(token.split())
        if num_tokens > 1:
            # Manually lex each space-separated token on the occasions
            # when Pygments doesn't lex the line properly.
            line_split = token.split()
            line_lexed = []
            for temp_token in line_split:
                token_lexed = list(lex(temp_token, lexer))
                for lexed in token_lexed:
                    if lexed[1] != "\n":
                        line_lexed.append(lexed)
            line_lexed.append((Token.Text, '\n'))
            line_code, line_types = get_tokenization(line_lexed, lexer)
            tokenized_string += line_code
            token_types += line_types
            curr_line_empty = True
            continue

        if '\n' in token:
            if curr_line_empty:
                # Only emit the token if the line carries real content.
                if (t[0] != Token.Text or t[0] != Token.Comment.Preproc) and token_stripped != '':
                    tokenized_string += token_stripped + "\n"
                    token_types.append(token_type)
            else:
                tokenized_string += token_stripped + "\n"
                # Edge case for a stray "\" in the code.
                if token_stripped == "\\":
                    token_types.append(token_type)
            curr_line_empty = True
        elif t[0] != Token.Text and len(token_stripped) > 0:
            curr_line_empty = False
            tokenized_string += token + ' '
            token_types.append(token_type)

    assert len(tokenized_string.split()) == len(token_types), \
        "{0} != {1}".format(len(tokenized_string.split()), len(token_types))
    return tokenized_string, token_types
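
The function expects lexedWoComments to be an iterable of Pygments (token_type, text) pairs with comments already removed, plus the lexer that produced them (so mis-lexed lines can be re-lexed). Below is a minimal usage sketch under that assumption; the C fragment, the CLexer choice, and the comment-filtering step are illustrative and not part of the original snippet.

from pygments import lex
from pygments.lexers import CLexer
from pygments.token import Token

# Illustrative input: a tiny C fragment (assumption, not from the original code).
code = "int main() {\n    int x = 0; /* counter */\n    return x;\n}\n"
lexer = CLexer()

# Drop comment tokens before calling get_tokenization, as the parameter
# name lexedWoComments suggests.
lexed_wo_comments = [(tok_type, text) for tok_type, text in lex(code, lexer)
                     if tok_type not in Token.Comment]

tokenized_string, token_types = get_tokenization(lexed_wo_comments, lexer)
print(tokenized_string)    # whitespace-separated token stream, line breaks preserved
print(len(token_types))    # matches the number of emitted tokens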