# Module-level imports required by the method below.
import re
import string
from glob import glob

import nltk
from nltk.stem import WordNetLemmatizer

def __tokenizeWholeCorpora(self, pathToCorpora):
    print 'Start tokenizing the corpora: %s' % (pathToCorpora)
    punct = re.compile('[%s]' % re.escape(string.punctuation))
    wnl = WordNetLemmatizer()
    doc_count = 0
    train_set = []
    doc_mapping = {}
    link_mapping = {}

    for f in glob(pathToCorpora + '/*'):
        with open(f, 'r') as filereader:
            article = filereader.readlines()
        # Each document file is expected to hold the link, the title and the
        # body text on its first three lines; skip files that are shorter.
        try:
            link = article[0]
            title = article[1]
            text = article[2].lower()
        except IndexError:
            continue
        # Skip documents shorter than min_length
        if len(text) < self.min_length:
            continue
        text = punct.sub("", text)          # Remove all punctuation
        tokens = nltk.word_tokenize(text)   # Tokenize the whole text
        # Lemmatize every token and keep it only if it is not a stopword
        train_set.append([wnl.lemmatize(word) for word in tokens
                          if word not in self.stopword])
        # Map the running document id to its title and link
        doc_mapping[doc_count] = title
        link_mapping[doc_count] = link
        doc_count += 1
        if doc_count % 10000 == 0:
            print 'Have processed %i documents' % (doc_count)

    print 'Finished tokenizing the corpora: %s' % (pathToCorpora)
    return doc_count, train_set, doc_mapping, link_mapping
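For reference, the same per-document preprocessing (punctuation removal, NLTK tokenization, WordNet lemmatization, stop-word filtering) can be tried on a single string outside the class. This is a minimal sketch, assuming NLTK is installed and its 'punkt' and 'wordnet' data packages have been downloaded; the small stop-word set here is an ad-hoc stand-in for self.stopword.

# Minimal sketch of the preprocessing applied to each document above.
# Assumes nltk.download('punkt') and nltk.download('wordnet') have been run once.
import re
import string

import nltk
from nltk.stem import WordNetLemmatizer

stopword = set(['the', 'a', 'of', 'and', 'is'])   # stand-in for self.stopword
punct = re.compile('[%s]' % re.escape(string.punctuation))
wnl = WordNetLemmatizer()

text = 'The cats are sitting on the mats, watching the birds.'.lower()
text = punct.sub('', text)                 # strip punctuation
tokens = nltk.word_tokenize(text)          # split into word tokens
doc = [wnl.lemmatize(w) for w in tokens if w not in stopword]
print(doc)   # e.g. ['cat', 'are', 'sitting', 'on', 'mat', 'watching', 'bird']

The resulting list of token lists (train_set) is what the method accumulates for every document, alongside the doc_mapping and link_mapping dictionaries that let a document id be traced back to its title and link later on.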