import re

import nltk

import sentence  # project-local module defining the sentence class


def processFile(file_name):
    # read the file at the provided path; the with-statement closes it afterwards
    with open(file_name, 'r') as f:
        text_0 = f.read()
    # extract the content inside the TEXT tag and remove the tags
    text_1 = re.search(r"<TEXT>.*</TEXT>", text_0, re.DOTALL)
    text_1 = re.sub("<TEXT>\n", "", text_1.group(0))
    text_1 = re.sub("\n</TEXT>", "", text_1)
    # collapse newlines/extra spaces and replace all types of quotation
    # marks by plain double quotes
    text_1 = re.sub("\n", " ", text_1)
    text_1 = re.sub("[\u201c\u201d]", "\"", text_1)  # curly double quotes
    text_1 = re.sub("''", "\"", text_1)
    text_1 = re.sub("``", "\"", text_1)
    text_1 = re.sub(" +", " ", text_1)
    # segment the text into a list of sentences with the punkt tokenizer
    sentence_token = nltk.data.load('tokenizers/punkt/english.pickle')
    lines = sentence_token.tokenize(text_1.strip())

    sentences = []
    # set up the Porter stemmer
    porter = nltk.PorterStemmer()
    # model each sentence in the file as a sentence object
    for line in lines:
        # keep the original words of the sentence before stemming
        originalWords = line[:]
        line = line.strip().lower()

        # word tokenization
        sent = nltk.word_tokenize(line)

        # stem the words, then drop punctuation and possessive tokens
        stemmedSent = [porter.stem(word) for word in sent]
        stemmedSent = [word for word in stemmedSent
                       if word not in ('.', '`', ',', '?', "'", '!', '"', "''", "'s")]

        # collect non-empty sentences as sentence objects
        if stemmedSent:
            sentences.append(sentence.sentence(file_name, stemmedSent, originalWords))
    return sentences
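
# Usage sketch (not part of the original file; the path below is hypothetical
# and its contents are assumed to sit inside <TEXT>...</TEXT> tags):
#
#   parsed = processFile("documents/sample.txt")
#   print("parsed %d sentences" % len(parsed))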
#---------------------------------------------------------------------------------
# Description : Function to find the term frequencies of the words in the
#               sentences present in the provided document cluster
# Parameters  : sentences, sentences of the document cluster
# Return      : dictionary of word, term frequency score
#---------------------------------------------------------------------------------
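# The body of the function described by the banner above is not included in
# this snippet, so the following is a minimal reconstruction sketch. It
# assumes each sentence object exposes a getStemmedWords() accessor
# (hypothetical name) that returns the sentence's stemmed tokens.
def TFs(sentences):
    # accumulate the raw count of every stemmed word across the cluster
    tfs = {}
    for sent in sentences:
        for word in sent.getStemmedWords():
            tfs[word] = tfs.get(word, 0) + 1
    return tfs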