from pygments import lex
from pygments.token import Token

# The helpers used below (tokensExceptTokenType, fixTypes, modifyStrings, etc.)
# are defined elsewhere in this file.
def tokenize_code(code, lexer, language, literal_option):
    tokens = lex(code, lexer)
    tokensList = list(tokens)
    # Strip comments (keeping preprocessor directives) and docstrings
    lexedWoComments = tokensExceptTokenType(tokensList, Token.Comment, retainedTypes=[Token.Comment.Preproc, Token.Comment.PreprocFile])
    lexedWoComments = tokensExceptTokenType(lexedWoComments, Token.Literal.String.Doc)
    # Alter the pygments lexer types to be more comparable between our languages
    lexedWoComments = fixTypes(lexedWoComments, language)
    lexedWoComments = convertNamespaceTokens(lexedWoComments, language)
    lexedWoComments = fix_preprocessor_defs(lexedWoComments, lexer)
    # Strip comments again in case re-lexing the preprocessor definitions reintroduced any
    lexedWoComments = tokensExceptTokenType(lexedWoComments, Token.Comment, retainedTypes=[Token.Comment.Preproc, Token.Comment.PreprocFile])
    if literal_option == 0:
        lexedWoComments = modifyStrings(lexedWoComments, underscoreString)
    elif literal_option == 1:
        lexedWoComments = modifyStrings(lexedWoComments, singleStringToken)
    elif literal_option == 2:
        lexedWoComments = modifyStrings(lexedWoComments, spaceString)
    elif literal_option == 3:
        lexedWoComments = modifyStrings(lexedWoComments, singleStringToken)
        lexedWoComments = collapseStrings(lexedWoComments)
        lexedWoComments = modifyNumbers(lexedWoComments, singleNumberToken)
    return get_tokenization(lexedWoComments, lexer)
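For readers unfamiliar with pygments, lex(code, lexer) yields (token_type, value) pairs, and each helper in the pipeline above consumes and returns a list of such pairs. A quick self-contained illustration (the output shown in the comments is approximate):

from pygments import lex
from pygments.lexers import CLexer

for tok_type, value in lex('int x = 42; // answer', CLexer()):
    print(tok_type, repr(value))
# Prints pairs such as:
#   Token.Keyword.Type 'int'
#   Token.Name 'x'
#   Token.Literal.Number.Integer '42'
#   Token.Comment.Single '// answer'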
# Parameters of tokenize_code:
# code: source code string to be tokenized
# lexer: pygments lexer for the source language
# language: programming language of the source, e.g. "c"
# literal_option:
#   0 -> replace all spaces in strings with _
#   1 -> replace all strings with a <str> tag
#   2 -> add spaces to the ends of the strings
#   3 -> collapse strings to <str> and collapse numbers to a single number token as well
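To make the options concrete, here is a minimal usage sketch. It assumes tokenize_code and its helpers are importable (the module name tokenizer below is hypothetical), and the behavior described in the comments follows from the option descriptions above rather than from a specific run:

from pygments.lexers import CLexer

from tokenizer import tokenize_code  # hypothetical module name

source = 'int main() { printf("hello world"); return 0; }'

# literal_option=0: spaces inside string literals become underscores,
# so "hello world" survives as the single token "hello_world"
print(tokenize_code(source, CLexer(), "c", 0))

# literal_option=1: every string literal is replaced by a <str> tag
print(tokenize_code(source, CLexer(), "c", 1))

# literal_option=3: strings collapse to <str>, and numeric literals
# (the 0 in "return 0") collapse to a single number token as well
print(tokenize_code(source, CLexer(), "c", 3))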