corpusLoader.py 文件源码-python代码片段

def load_20news(setName):
    newsgroups_subset = fetch_20newsgroups(subset=setName, remove=('headers', 'footers')) #, 'quotes'
    totalLineNum = 0
    readDocNum = 0
    print "Loading 20 newsgroup %s data..." %setName

    setDocNum = len(newsgroups_subset.data)
    orig_docs_name = []
    orig_docs_cat = []
    orig_docs_words = []

    catNum = len(newsgroups_subset.target_names)
    cats_docsWords = [ [] for i in xrange(catNum) ]
    cats_docNames = [ [] for i in xrange(catNum) ]

    emptyFileNum = 0

    for d, text in enumerate(newsgroups_subset.data):
        if d % 50 == 49 or d == setDocNum - 1:
            print "\r%d %d\r" %( d + 1, totalLineNum ),
        text = text.encode("utf-8")
        lines = text.split("\n")
        if len(text) == 0 or len(lines) == 0:
            emptyFileNum += 1
            continue

        readDocNum += 1
        totalLineNum += len(lines)

        catID = newsgroups_subset.target[d]
        category = newsgroups_subset.target_names[catID]

        text = " ".join(lines)

        wordsInSentences, wc = extractSentenceWords(text)
        filename = newsgroups_subset.filenames[d]
        filename = os.path.basename(filename)
        orig_docs_words.append( wordsInSentences )
        orig_docs_name.append(filename)
        orig_docs_cat.append(catID)
        cats_docsWords[catID].append(wordsInSentences)
        cats_docNames[catID].append(filename)

    print "Done. %d docs read, %d empty docs skipped. Totally %d lines" %(readDocNum, emptyFileNum, totalLineNum)
    return setDocNum, orig_docs_words, orig_docs_name, orig_docs_cat, \
                cats_docsWords, cats_docNames, newsgroups_subset.target_names