def processFile(self, file_path_and_name):
    """Read a document, extract its <TEXT>...</TEXT> body, and return it as a
    list of ``sentence`` objects (one per Punkt-detected sentence, each holding
    the stemmed, punctuation-stripped tokens plus the original sentence text).

    Parameters:
        file_path_and_name: path to the source document on disk.

    Returns:
        list of ``sentence`` objects. If the file cannot be read, or contains
        no <TEXT> section, a single empty sentinel sentence is returned so
        callers always receive a non-empty list (matching the original
        IOError behavior).
    """
    # Keep the try body minimal: only the open/read can raise IOError.
    try:
        # 'with' closes the handle deterministically (the original leaked it).
        with open(file_path_and_name, 'r') as f:
            text = f.read()
    except IOError:
        # %-formatting prints identically under Python 2 and 3.
        print('Oops! File not found %s' % file_path_and_name)
        return [sentence(file_path_and_name, [], [])]

    # Pull the body between the <TEXT> tags; DOTALL lets '.' span newlines.
    match = re.search(r"<TEXT>.*</TEXT>", text, re.DOTALL)
    if match is None:
        # No <TEXT> section: degrade gracefully instead of crashing with
        # AttributeError on match.group(0) (which callers never caught).
        return [sentence(file_path_and_name, [], [])]

    body = match.group(0)
    body = re.sub("<TEXT>\n", "", body)
    body = re.sub("\n</TEXT>", "", body)
    # Collapse newlines and runs of spaces into single spaces.
    body = re.sub("\n", " ", body)
    body = re.sub(" +", " ", body)

    # Punkt sentence splitter (English model shipped with NLTK).
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    lines = sent_tokenizer.tokenize(body.strip())

    porter = nltk.PorterStemmer()
    # Stemmed tokens to discard (punctuation and the possessive clitic).
    drop_tokens = ('.', '`', ',', '?', "'", '!', '"', "''", "'s")

    sentences = []
    for sent in lines:
        OG_sent = sent[:]  # preserve the original casing/spacing for display
        words = nltk.word_tokenize(sent.strip().lower())
        stemmed = [porter.stem(word) for word in words]
        # List comprehension instead of filter(): the original compared a
        # filter() result to [], which is always True on Python 3 because
        # filter() returns an iterator there.
        stemmed = [tok for tok in stemmed if tok not in drop_tokens]
        if stemmed:
            sentences.append(sentence(file_path_and_name, stemmed, OG_sent))
    return sentences
# Source: LexRank.py (scraped blog-page metadata removed — it was not valid Python)