import re

import nltk

import sentence  # project-local module defining the sentence class


def processFile(file_name):
    # read the file at the provided path; the with-statement closes it afterwards
    with open(file_name, 'r') as f:
        text_0 = f.read()
    # extract the content inside the TEXT tag and remove the tags
    text_1 = re.search(r"<TEXT>.*</TEXT>", text_0, re.DOTALL)
    text_1 = re.sub("<TEXT>\n", "", text_1.group(0))
    text_1 = re.sub("\n</TEXT>", "", text_1)
    # collapse newlines/extra spaces and replace all types of quotation
    # marks by plain double quotes
    text_1 = re.sub("\n", " ", text_1)
    text_1 = re.sub("[\u201c\u201d]", "\"", text_1)  # curly double quotes
    text_1 = re.sub("''", "\"", text_1)
    text_1 = re.sub("``", "\"", text_1)
    text_1 = re.sub(" +", " ", text_1)
    # segment the text into a list of sentences with the punkt tokenizer
    sentence_token = nltk.data.load('tokenizers/punkt/english.pickle')
    lines = sentence_token.tokenize(text_1.strip())

    sentences = []
    # set up the Porter stemmer
    porter = nltk.PorterStemmer()
    # model each sentence in the file as a sentence object
    for line in lines:
        # keep the original words of the sentence before stemming
        originalWords = line[:]
        line = line.strip().lower()

        # word tokenization
        sent = nltk.word_tokenize(line)

        # stem the words, then drop punctuation and possessive tokens
        stemmedSent = [porter.stem(word) for word in sent]
        stemmedSent = [word for word in stemmedSent
                       if word not in ('.', '`', ',', '?', "'", '!', '"', "''", "'s")]

        # collect non-empty sentences as sentence objects
        if stemmedSent:
            sentences.append(sentence.sentence(file_name, stemmedSent, originalWords))
    return sentences
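
# Usage sketch (not part of the original file; the path below is hypothetical
# and its contents are assumed to sit inside <TEXT>...</TEXT> tags):
#
#   parsed = processFile("documents/sample.txt")
#   print("parsed %d sentences" % len(parsed))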
#---------------------------------------------------------------------------------
# Description : Function to find the term frequencies of the words in the
#               sentences present in the provided document cluster
# Parameters  : sentences, sentences of the document cluster
# Return      : dictionary of word, term frequency score
#---------------------------------------------------------------------------------
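# The body of the function described by the banner above is not included in
# this snippet, so the following is a minimal reconstruction sketch. It
# assumes each sentence object exposes a getStemmedWords() accessor
# (hypothetical name) that returns the sentence's stemmed tokens.
def TFs(sentences):
    # accumulate the raw count of every stemmed word across the cluster
    tfs = {}
    for sent in sentences:
        for word in sent.getStemmedWords():
            tfs[word] = tfs.get(word, 0) + 1
    return tfs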