def load_20news(setName):
newsgroups_subset = fetch_20newsgroups(subset=setName, remove=('headers', 'footers')) #, 'quotes'
totalLineNum = 0
readDocNum = 0
print "Loading 20 newsgroup %s data..." %setName
setDocNum = len(newsgroups_subset.data)
orig_docs_name = []
orig_docs_cat = []
orig_docs_words = []
catNum = len(newsgroups_subset.target_names)
cats_docsWords = [ [] for i in xrange(catNum) ]
cats_docNames = [ [] for i in xrange(catNum) ]
emptyFileNum = 0
for d, text in enumerate(newsgroups_subset.data):
if d % 50 == 49 or d == setDocNum - 1:
print "\r%d %d\r" %( d + 1, totalLineNum ),
text = text.encode("utf-8")
lines = text.split("\n")
if len(text) == 0 or len(lines) == 0:
emptyFileNum += 1
continue
readDocNum += 1
totalLineNum += len(lines)
catID = newsgroups_subset.target[d]
category = newsgroups_subset.target_names[catID]
text = " ".join(lines)
wordsInSentences, wc = extractSentenceWords(text)
filename = newsgroups_subset.filenames[d]
filename = os.path.basename(filename)
orig_docs_words.append( wordsInSentences )
orig_docs_name.append(filename)
orig_docs_cat.append(catID)
cats_docsWords[catID].append(wordsInSentences)
cats_docNames[catID].append(filename)
print "Done. %d docs read, %d empty docs skipped. Totally %d lines" %(readDocNum, emptyFileNum, totalLineNum)
return setDocNum, orig_docs_words, orig_docs_name, orig_docs_cat, \
cats_docsWords, cats_docNames, newsgroups_subset.target_names
# NOTE(review): removed stray scraped-webpage text ("评论列表" / "文章目录",
# i.e. "comment list" / "article table of contents") — page-navigation residue
# from a copy-paste, not valid Python.