from pygments import lex
from pygments.token import Token

# The helpers used below (tokensExceptTokenType, fixTypes, modifyStrings, etc.)
# are defined elsewhere in this file.
def tokenize_code(code, lexer, language, literal_option):
    tokens = lex(code, lexer)
    tokensList = list(tokens)
    # Strip comments (keeping preprocessor directives) and docstrings
    lexedWoComments = tokensExceptTokenType(tokensList, Token.Comment, retainedTypes=[Token.Comment.Preproc, Token.Comment.PreprocFile])
    lexedWoComments = tokensExceptTokenType(lexedWoComments, Token.Literal.String.Doc)
    # Alter the pygments lexer types to be more comparable between our languages
    lexedWoComments = fixTypes(lexedWoComments, language)
    lexedWoComments = convertNamespaceTokens(lexedWoComments, language)
    lexedWoComments = fix_preprocessor_defs(lexedWoComments, lexer)
    # Strip comments again in case re-lexing the preprocessor definitions reintroduced any
    lexedWoComments = tokensExceptTokenType(lexedWoComments, Token.Comment, retainedTypes=[Token.Comment.Preproc, Token.Comment.PreprocFile])
    if literal_option == 0:
        lexedWoComments = modifyStrings(lexedWoComments, underscoreString)
    elif literal_option == 1:
        lexedWoComments = modifyStrings(lexedWoComments, singleStringToken)
    elif literal_option == 2:
        lexedWoComments = modifyStrings(lexedWoComments, spaceString)
    elif literal_option == 3:
        lexedWoComments = modifyStrings(lexedWoComments, singleStringToken)
        lexedWoComments = collapseStrings(lexedWoComments)
        lexedWoComments = modifyNumbers(lexedWoComments, singleNumberToken)
    return get_tokenization(lexedWoComments, lexer)
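For readers unfamiliar with pygments, lex(code, lexer) yields (token_type, value) pairs, and each helper in the pipeline above consumes and returns a list of such pairs. A quick self-contained illustration (the output shown in the comments is approximate):

from pygments import lex
from pygments.lexers import CLexer

for tok_type, value in lex('int x = 42; // answer', CLexer()):
    print(tok_type, repr(value))
# Prints pairs such as:
#   Token.Keyword.Type 'int'
#   Token.Name 'x'
#   Token.Literal.Number.Integer '42'
#   Token.Comment.Single '// answer'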
# Parameters of tokenize_code:
# code: source code string to be tokenized
# lexer: pygments lexer for the source language
# language: programming language of the source, e.g. "c"
# literal_option:
#   0 -> replace all spaces in strings with _
#   1 -> replace all strings with a <str> tag
#   2 -> add spaces to the ends of the strings
#   3 -> collapse strings to <str> and collapse numbers to a single number token as well
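To make the options concrete, here is a minimal usage sketch. It assumes tokenize_code and its helpers are importable (the module name tokenizer below is hypothetical), and the behavior described in the comments follows from the option descriptions above rather than from a specific run:

from pygments.lexers import CLexer

from tokenizer import tokenize_code  # hypothetical module name

source = 'int main() { printf("hello world"); return 0; }'

# literal_option=0: spaces inside string literals become underscores,
# so "hello world" survives as the single token "hello_world"
print(tokenize_code(source, CLexer(), "c", 0))

# literal_option=1: every string literal is replaced by a <str> tag
print(tokenize_code(source, CLexer(), "c", 1))

# literal_option=3: strings collapse to <str>, and numeric literals
# (the 0 in "return 0") collapse to a single number token as well
print(tokenize_code(source, CLexer(), "c", 3))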