# Module-level imports required by the method below.
import re
import string
from glob import glob

import nltk
from nltk.stem import WordNetLemmatizer

def __tokenizeWholeCorpora(self, pathToCorpora):
    print 'Start tokenizing the corpora: %s' % (pathToCorpora)
    punct = re.compile('[%s]' % re.escape(string.punctuation))
    wnl = WordNetLemmatizer()
    doc_count = 0
    train_set = []
    doc_mapping = {}
    link_mapping = {}

    for f in glob(pathToCorpora + '/*'):
        with open(f, 'r') as filereader:
            article = filereader.readlines()
        # Each document file is expected to hold the link, the title and the
        # body text on its first three lines; skip files that are shorter.
        try:
            link = article[0]
            title = article[1]
            text = article[2].lower()
        except IndexError:
            continue
        # Skip documents shorter than min_length
        if len(text) < self.min_length:
            continue
        text = punct.sub("", text)          # Remove all punctuation
        tokens = nltk.word_tokenize(text)   # Tokenize the whole text
        # Lemmatize every token and keep it only if it is not a stopword
        train_set.append([wnl.lemmatize(word) for word in tokens
                          if word not in self.stopword])
        # Map the running document id to its title and link
        doc_mapping[doc_count] = title
        link_mapping[doc_count] = link
        doc_count += 1
        if doc_count % 10000 == 0:
            print 'Have processed %i documents' % (doc_count)

    print 'Finished tokenizing the corpora: %s' % (pathToCorpora)
    return doc_count, train_set, doc_mapping, link_mapping
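For reference, the same per-document preprocessing (punctuation removal, NLTK tokenization, WordNet lemmatization, stop-word filtering) can be tried on a single string outside the class. This is a minimal sketch, assuming NLTK is installed and its 'punkt' and 'wordnet' data packages have been downloaded; the small stop-word set here is an ad-hoc stand-in for self.stopword.

# Minimal sketch of the preprocessing applied to each document above.
# Assumes nltk.download('punkt') and nltk.download('wordnet') have been run once.
import re
import string

import nltk
from nltk.stem import WordNetLemmatizer

stopword = set(['the', 'a', 'of', 'and', 'is'])   # stand-in for self.stopword
punct = re.compile('[%s]' % re.escape(string.punctuation))
wnl = WordNetLemmatizer()

text = 'The cats are sitting on the mats, watching the birds.'.lower()
text = punct.sub('', text)                 # strip punctuation
tokens = nltk.word_tokenize(text)          # split into word tokens
doc = [wnl.lemmatize(w) for w in tokens if w not in stopword]
print(doc)   # e.g. ['cat', 'are', 'sitting', 'on', 'mat', 'watching', 'bird']

The resulting list of token lists (train_set) is what the method accumulates for every document, alongside the doc_mapping and link_mapping dictionaries that let a document id be traced back to its title and link later on.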