def initializeData(data, min_df=110, max_df=11000):
    """Tokenize, lemmatize and count terms for a corpus of documents.

    Two passes over the corpus:
      1. Count, for every lemmatized alphabetic non-stopword term, how many
         documents contain it (document frequency).
      2. Keep only terms whose document frequency lies in [min_df, max_df]
         (inclusive) and build per-document term-count dictionaries
         restricted to those surviving terms.

    Parameters
    ----------
    data : sequence of str
        Raw text documents.
    min_df, max_df : int
        Inclusive document-frequency bounds for keeping a term.
        Defaults preserve the previously hard-coded 110 / 11000.

    Returns
    -------
    tuple
        (termDictList, termCountList, termDocCountDict, termSet) where
        termDictList[i] maps term -> occurrence count within document i,
        termCountList[i] is document i's total count of kept terms,
        termDocCountDict maps each surviving term -> document frequency,
        termSet is the set of surviving terms.
    """
    wnl = WordNetLemmatizer()
    stop_words = text.ENGLISH_STOP_WORDS

    termDictList = []      # per-document term -> count mappings
    termDocCountDict = {}  # term -> number of documents containing it
    termSet = set()        # terms that pass the document-frequency filter
    termCountList = []     # per-document total count of kept terms

    # Pass 1: document frequency for each candidate term.
    for document in data:
        # Tokenize into a set so each term counts at most once per document.
        for word in set(word_tokenize(document.lower())):
            if not word.isalpha():
                continue
            term = wnl.lemmatize(word)
            if term not in stop_words:
                termDocCountDict[term] = termDocCountDict.get(term, 0) + 1

    # Pass 2: per-document counts, restricted to terms whose document
    # frequency falls within [min_df, max_df].  Out-of-range terms are
    # deleted from termDocCountDict on first sight, so their later
    # occurrences fail the membership test and are skipped cheaply.
    for document in data:
        termDict = {}
        termCount = 0
        for word in word_tokenize(document.lower()):
            if not word.isalpha():
                continue
            term = wnl.lemmatize(word)
            if term in stop_words or term not in termDocCountDict:
                continue
            if min_df <= termDocCountDict[term] <= max_df:
                termSet.add(term)
                termCount += 1
                termDict[term] = termDict.get(term, 0) + 1
            else:
                del termDocCountDict[term]
        termDictList.append(termDict)
        termCountList.append(termCount)
    return (termDictList, termCountList, termDocCountDict, termSet)
# function
# Stray blog-scrape residue removed from live code (would raise NameError):
# "评论列表" (comment list) / "文章目录" (article table of contents)