def initializeData(data, min_df=110, max_df=11000):
    """Tokenize, lemmatize and count terms for a corpus of documents.

    Two passes over the corpus:
      1. Count, for every lemmatized alphabetic non-stopword term, how many
         documents contain it (document frequency).
      2. Keep only terms whose document frequency lies in [min_df, max_df]
         (inclusive) and build per-document term-count dictionaries
         restricted to those surviving terms.

    Parameters
    ----------
    data : sequence of str
        Raw text documents.
    min_df, max_df : int
        Inclusive document-frequency bounds for keeping a term.
        Defaults preserve the previously hard-coded 110 / 11000.

    Returns
    -------
    tuple
        (termDictList, termCountList, termDocCountDict, termSet) where
        termDictList[i] maps term -> occurrence count within document i,
        termCountList[i] is document i's total count of kept terms,
        termDocCountDict maps each surviving term -> document frequency,
        termSet is the set of surviving terms.
    """
    wnl = WordNetLemmatizer()
    stop_words = text.ENGLISH_STOP_WORDS

    termDictList = []      # per-document term -> count mappings
    termDocCountDict = {}  # term -> number of documents containing it
    termSet = set()        # terms that pass the document-frequency filter
    termCountList = []     # per-document total count of kept terms

    # Pass 1: document frequency for each candidate term.
    for document in data:
        # Tokenize into a set so each term counts at most once per document.
        for word in set(word_tokenize(document.lower())):
            if not word.isalpha():
                continue
            term = wnl.lemmatize(word)
            if term not in stop_words:
                termDocCountDict[term] = termDocCountDict.get(term, 0) + 1

    # Pass 2: per-document counts, restricted to terms whose document
    # frequency falls within [min_df, max_df].  Out-of-range terms are
    # deleted from termDocCountDict on first sight, so their later
    # occurrences fail the membership test and are skipped cheaply.
    for document in data:
        termDict = {}
        termCount = 0
        for word in word_tokenize(document.lower()):
            if not word.isalpha():
                continue
            term = wnl.lemmatize(word)
            if term in stop_words or term not in termDocCountDict:
                continue
            if min_df <= termDocCountDict[term] <= max_df:
                termSet.add(term)
                termCount += 1
                termDict[term] = termDict.get(term, 0) + 1
            else:
                del termDocCountDict[term]
        termDictList.append(termDict)
        termCountList.append(termCount)
    return (termDictList, termCountList, termDocCountDict, termSet)
# function
# Stray blog-scrape residue removed from live code (would raise NameError):
# "评论列表" (comment list) / "文章目录" (article table of contents)