def processFile(self, file_path_and_name):
    """Read a document, extract its <TEXT>...</TEXT> body, and return it as a
    list of ``sentence`` objects (one per Punkt-detected sentence, each holding
    the stemmed, punctuation-stripped tokens plus the original sentence text).

    Parameters:
        file_path_and_name: path to the source document on disk.

    Returns:
        list of ``sentence`` objects. If the file cannot be read, or contains
        no <TEXT> section, a single empty sentinel sentence is returned so
        callers always receive a non-empty list (matching the original
        IOError behavior).
    """
    # Keep the try body minimal: only the open/read can raise IOError.
    try:
        # 'with' closes the handle deterministically (the original leaked it).
        with open(file_path_and_name, 'r') as f:
            text = f.read()
    except IOError:
        # %-formatting prints identically under Python 2 and 3.
        print('Oops! File not found %s' % file_path_and_name)
        return [sentence(file_path_and_name, [], [])]

    # Pull the body between the <TEXT> tags; DOTALL lets '.' span newlines.
    match = re.search(r"<TEXT>.*</TEXT>", text, re.DOTALL)
    if match is None:
        # No <TEXT> section: degrade gracefully instead of crashing with
        # AttributeError on match.group(0) (which callers never caught).
        return [sentence(file_path_and_name, [], [])]

    body = match.group(0)
    body = re.sub("<TEXT>\n", "", body)
    body = re.sub("\n</TEXT>", "", body)
    # Collapse newlines and runs of spaces into single spaces.
    body = re.sub("\n", " ", body)
    body = re.sub(" +", " ", body)

    # Punkt sentence splitter (English model shipped with NLTK).
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    lines = sent_tokenizer.tokenize(body.strip())

    porter = nltk.PorterStemmer()
    # Stemmed tokens to discard (punctuation and the possessive clitic).
    drop_tokens = ('.', '`', ',', '?', "'", '!', '"', "''", "'s")

    sentences = []
    for sent in lines:
        OG_sent = sent[:]  # preserve the original casing/spacing for display
        words = nltk.word_tokenize(sent.strip().lower())
        stemmed = [porter.stem(word) for word in words]
        # List comprehension instead of filter(): the original compared a
        # filter() result to [], which is always True on Python 3 because
        # filter() returns an iterator there.
        stemmed = [tok for tok in stemmed if tok not in drop_tokens]
        if stemmed:
            sentences.append(sentence(file_path_and_name, stemmed, OG_sent))
    return sentences
# Source: LexRank.py (scraped blog-page metadata removed — it was not valid Python)