Python sent_tokenize() example source code
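The snippets below are collected from open-source projects and show nltk.sent_tokenize() in context. For orientation, a minimal call looks like the sketch here; it assumes the NLTK 'punkt' sentence tokenizer data has already been downloaded (e.g. via nltk.download('punkt')).

import nltk

# nltk.download('punkt')  # one-time download of the Punkt sentence tokenizer models
text = "This is the first sentence. Here is a second one."
print(nltk.sent_tokenize(text))
# ['This is the first sentence.', 'Here is a second one.']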

Source: regex.py (project: linkedin_recommend, author: duggalr2)
def tokenize_and_stem(text):
    """
    First tokenize by sentence, then by word, to ensure that punctuation is caught as its own token
    """
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            if 'intern' == token:
                token = ''
            if 'student' == token:
                token = ''
            if 'and' == token:
                token = ''
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens if len(t) > 0]
    return stems
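A minimal way to exercise the helper above; it assumes the module-level stemmer is something like an NLTK SnowballStemmer (the original project may configure a different stemmer) and that the Punkt tokenizer data is installed.

import re
import nltk
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")  # assumption: stand-in for the project's own stemmer

print(tokenize_and_stem("The intern and the student joined the research team."))
# 'intern', 'student' and 'and' are blanked out before stemming; pure punctuation is dropped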
Source: readdata.py (project: Natural-Language-Processing-Python-and-NLTK, author: PacktPublishing)
def preprocessing(text):
    text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]

    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]

    # remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]

    # lowercase the tokens
    tokens = [word.lower() for word in tokens]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text
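A quick, hedged check of the function above. It expects a UTF-8 encoded byte string (hence the decode call) and needs the NLTK stopwords and WordNet data; note that stop-word filtering runs before lowercasing, so capitalized stop words such as 'The' slip through.

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

raw = "The cats were sitting on the mats, watching the birds.".encode("utf8")
print(preprocessing(raw))
# short tokens and lower-case stop words are removed, the rest are lemmatized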
Source: representation.py (project: YelpDataChallenge, author: fujunswufe)
def get_sentence_tokens(text):
    '''
    Given a text (a review), return the list of word tokens for each sentence
    :param text:
    :return:
    '''
    sentences = sent_tokenize(text)

    sent_tokens = []
    for sentence in sentences:
        sent_token = word_tokenize(sentence)
        sent_token = [token for token in sent_token if token.strip() != '' and token not in stopwords]
        sent_tokens.append(sent_token)
    # remove stop words and short tokens

    # stemming: experiments showed that stemming did not help here
    # if (stemming):
    #     stemmer = PorterStemmer()
    #     texts = [[ stemmer.stem(token) for token in text] for text in texts]
    return sent_tokens
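To run get_sentence_tokens on its own, stopwords must be some collection of words; the standard NLTK English stop-word list is assumed here as a stand-in for whatever the project defines.

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords as nltk_stopwords

stopwords = set(nltk_stopwords.words('english'))  # assumed stand-in for the project's own list

print(get_sentence_tokens("Great food. The service was a bit slow, though."))
# one list of word tokens per sentence, with stop words removed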
Source: gender.py (project: atap, author: foxbook)
def parse_gender(text):

    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]

    sents, words = count_gender(sentences)
    total = sum(words.values())

    for gender, count in words.items():
        pcent = (count / total) * 100
        nsents = sents[gender]

        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
        )
Source: freetext.py (project: skills-ml, author: workforce-data-initiative)
def ie_preprocess(self, document):
        """This function takes raw text and chops and then connects the process to break
           it down into sentences"""

        # Pre-processing: expand "e.g." to "exempli gratia" so the sentence tokenizer does not split on it
        document = document.replace("e.g.", "exempli gratia")

        # First split on newlines and asterisk bullet markers
        split = re.split(r'\n|\*', document)

        # Sentence tokenizer
        sentences = []
        for sent in split:
            sents = nltk.sent_tokenize(sent)
            length = len(sents)
            if length == 0:
                continue
            elif length == 1:
                sentences.append(sents[0])
            else:
                sentences.extend(sents)
        return sentences
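A rough way to try the method in isolation; the first parameter (self) is never used in the body, so passing None is enough for a sketch like this.

import re
import nltk

doc = "We clean the data, e.g. removing null rows.\n* Train the models. * Evaluate them."
print(ie_preprocess(None, doc))
# the text is split on newlines and bullet markers first, then each piece is sentence-tokenized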
Source: sent-thoughts-words.py (project: reuters-docsim, author: sujitpal)
def maybe_build_sentences(text_filename, sent_filename):
    sents = []
    if os.path.exists(sent_filename):
        fsent = open(sent_filename, "rb")
        for line in fsent:
            docid, sent_id, sent = line.strip().split("\t")
            sents.append(sent)
        fsent.close()
    else:
        ftext = open(text_filename, "rb")
        fsent = open(sent_filename, "wb")
        for line in ftext:
            docid, text = line.strip().split("\t")
            sent_id = 1
            for sent in nltk.sent_tokenize(text):
                sents.append(sent)
                fsent.write("{:d}\t{:d}\t{:s}\n"
                    .format(int(docid), sent_id, sent))
                sent_id += 1
        fsent.close()
        ftext.close()
    return sents
Source: reader.py (project: YelpDataChallenge, author: fujunswufe)
def get_review_sentences():
    '''
    Read the Yelp reviews and return them after sentence segmentation
    :return:
    '''
    review_file = io.open(FULL_YELP_REVIEW_PATH, 'r', encoding='utf-8')
    count_sentence = 0
    sentences = []

    for line in review_file:
        json_review = json.loads(line.strip())
        text = json_review.get("text").replace('\n','').lower()

        raw_sentences = sent_tokenize(text)
        for raw_sentence in raw_sentences:
            if len(raw_sentence.strip()) > 0:
                sent_tokens = word_tokenize(raw_sentence)
                sentences.append(sent_tokens)
    return sentences
Source: AKE.py (project: NLP-Keyword-Extraction-Ensemble-Method, author: Ashwin-Ravi)
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda wpc: wpc[2] != 'O') if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
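With the NLTK 'punkt', 'averaged_perceptron_tagger' and 'stopwords' resources installed, the chunker can be tried as below; the exact candidates depend on the tagger, so the output is only described.

text = "The quick brown fox jumped over the lazy dog near the old stone bridge."
print(extract_candidate_chunks(text))
# lower-cased noun-phrase candidates such as 'quick brown fox' or 'old stone bridge'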
Source: find_entities.py (project: Medical_NER, author: murhafh)
def print_symptoms_from_page(url = '', model = '', stanford_jar = ''):
    html_reader = HTMLReader(url)
    cleaned_text = html_reader.get_text_from_page()
    symptoms = set()

    st = NERTagger(model, stanford_jar, encoding='utf-8')
    sentences = nltk.sent_tokenize(cleaned_text)
    for sentence in sentences:
        tags = st.tag(nltk.word_tokenize(sentence))
        tag_index = 0
        while tag_index < len(tags):
            if tags[tag_index][1] == 'SYMP':
                symptom = []
                while tag_index < len(tags) and tags[tag_index][1] != 'O':
                    symptom.append(tags[tag_index][0])
                    tag_index += 1
                symptoms.add(' '.join(symptom))
            else:
                tag_index += 1
    print "Found %d symptoms:" % len(symptoms)
    for symptom in symptoms:
        print symptom
Source: key_extractor.py (project: keyphrase-extraction, author: sagarchaturvedi1)
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    ''' This function will extract text of a specific POS sequence rather than just Noun Phrase '''

    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group)
                  for key, group in itertools.groupby(all_chunks, lambda wpc: wpc[2] != 'O') if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
Source: NLTKPreprocessor.py (project: ai-chatbot-framework, author: alfredfrancis)
def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If stopword, ignore token and continue
                # if token in self.stopwords:
                #     continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma
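The generator above depends on attributes of its class (lower, strip, punct) and on a lemmatize method. A hypothetical stand-in object with just those pieces, shown below, is enough to try it outside ai-chatbot-framework; the real NLTKPreprocessor configures these itself.

import string
from nltk import pos_tag, sent_tokenize, wordpunct_tokenize
from nltk.stem import WordNetLemmatizer

class _StubPreprocessor:  # hypothetical stand-in, for demonstration only
    lower = True
    strip = True
    punct = set(string.punctuation)
    def lemmatize(self, token, tag):
        return WordNetLemmatizer().lemmatize(token)  # simplified: ignores the POS tag

print(list(tokenize(_StubPreprocessor(), "The cats are running!")))
# lower-cased, punctuation-free lemmas, e.g. ['the', 'cat', 'are', 'running']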
Source: _names.py (project: memex-dossier-open, author: dossier)
def process(self, fc, context=None):
        text_source = self.config.get('text_source')
        if text_source and text_source in fc:
            text = fc[text_source]
        else:
            return fc
        names = defaultdict(StringCounter)
        for sent in nltk.sent_tokenize(text):
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                if hasattr(chunk, 'label'):
                    label = chunk.label()
                    name = ' '.join(c[0] for c in chunk.leaves())
                    if not isinstance(name, unicode):
                        name = unicode(name, 'utf-8')
                    name = cleanse(name)
                    #print chunk.node, name
                    names[label][name] += 1
        for entity_type, name_counts in names.items():
            fc[entity_type] = name_counts
        return fc
Source: util.py (project: dialog_research, author: wjbianjason)
def generate_vocab(filename,min_fre=5,prefix=""):
    vf = open("../data/"+prefix+"vocab_generate.txt",'w')
    word = {}
    for line in file(filename):
      line = line.strip()
      try:
        sentencesToken = nltk.sent_tokenize(line)
      except:
        continue
      for i in range(len(sentencesToken)):
          tokens = nltk.word_tokenize(sentencesToken[i])
          for token in tokens:
              word.setdefault(token,0)
              word[token] += 1
    for char,num in sorted(word.items(),key=lambda x:x[1],reverse=True):
      if num < min_fre:
        break
      vf.write(char+" "+str(num)+"\n")
Source: kpex.py (project: kpex, author: christophfeinauer)
def extract_chunks(text_string,max_words=3,lemmatize=False):

    # Any number of adjectives followed by any number of nouns and (optionally) again
    # any number of adjectives followed by any number of nouns
    grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'

    # Makes chunks using grammar regex
    chunker = nltk.RegexpParser(grammar)

    # POS-tag the word tokens of every sentence
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text_string))

    # Make chunks from the sentences, using grammar. Output in IOB.
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                        for tagged_sent in tagged_sents))
    # Join phrases based on IOB syntax.
    candidates = [' '.join(w[0] for w in group).lower() for key, group in itertools.groupby(all_chunks, lambda l: l[2] != 'O') if key]

    # Filter by maximum keyphrase length
    candidates = list(filter(lambda l: len(l.split()) <= max_words, candidates))

    # Filter phrases consisting of punctuation or stopwords
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    candidates = list(filter(lambda l: l not in stop_words and not all(c in punct for c in l),candidates))

    # lemmatize
    if lemmatize:
        lemmatizer = nltk.stem.WordNetLemmatizer().lemmatize
        candidates =  [lemmatizer(x) for x in candidates]

    return candidates
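Assuming itertools, nltk and string are imported at module level (as the function's references imply) and the usual NLTK data is present, a quick call looks like this.

import itertools
import string
import nltk

print(extract_chunks("Deep neural networks learn hierarchical feature representations.", max_words=3))
# lower-cased keyphrase candidates of at most three words, stop words and punctuation filtered out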
Source: default.py (project: Python-NLTKWebApp, author: alibolek)
def tokenizer():
    if len(request.vars)!=0:
        user_input=request.vars
        import sys
        reload(sys)
        sys.setdefaultencoding('utf-8')
        if user_input.parameter=="sentence":

            our_output=nltk.sent_tokenize(user_input.input,"english")
            print user_input
            if request.vars.filename != '' and len(request.vars.filename.value) != 0:
                file_input=user_input.filename.value
                file_output=nltk.word_tokenize(file_input,"english")
            print our_output
        else:
            our_output=nltk.word_tokenize(user_input.input,"english")
            if request.vars.filename != '' and len(request.vars.filename.value) != 0:
                file_input=user_input.filename.value
                file_output=nltk.word_tokenize(file_input,"english")


        user_input.output=our_output


    return locals()
Source: data_dictionary_shared.py (project: repeat-aft, author: ripeta)
def extract(text, paper=None, logger=logger):

    search_any = functools.partial(re_util.search_any, logger=logger)
    if not text and paper:
        try:
            text, _ = paper.get_text()
        except pdfutil.pdfutil.MalformedPDF as e:
            return None
    filters = [r'data documentation.*?shared']
    for sentence in nltk.sent_tokenize(text):
        match = search_any(filters, sentence)
        if match:
            source_type = "extracted"
            source_detail = "nltk search v1"
            value_text = sentence
            value_result = "Yes"
            return (value_text, value_result, source_type, source_detail)
    #if no match found:
    source_type = "extracted"
    source_detail = "nltk search v1"
    value_text = "Not Found"
    value_result = "No"
    return (value_text, value_result, source_type, source_detail)
Source: text_mining_source.py (project: repeat-aft, author: ripeta)
def extract(text, paper=None, logger=logger):

    search_any = functools.partial(re_util.search_any, logger=logger)
    if not text and paper:
        try:
            text, _ = paper.get_text()
        except pdfutil.pdfutil.MalformedPDF as e:
            return None
    for sentence in nltk.sent_tokenize(text):
        if search_any([r'data mine.*?source', r'text mine.*?shared'], sentence):
            # yapf: disable
            match = search_any([
                "data mine.*?(\w*\d[\w\d/-]*)",
                "text mine.*?(\w*\d[\w\d/-]*)"
            ], sentence)
            # yapf: enable
            source_type = "extracted"
            source_detail = "nltk search v1"
            value_text = sentence
            try:
                value_result = match.group(1).strip()
                return (value_text, value_result, source_type, source_detail)
            except AttributeError:  # no match was found
                return None
    return None
Source: analysis_process_clear.py (project: repeat-aft, author: ripeta)
def extract(text, paper=None, logger=logger):

    search_any = functools.partial(re_util.search_any, logger=logger)
    if not text and paper:
        try:
            text, _ = paper.get_text()
        except pdfutil.pdfutil.MalformedPDF as e:
            return None
    filters = [r'analys(is|es)']
    for sentence in nltk.sent_tokenize(text):
        match = search_any(filters, sentence)
        if match and search_any([r'algorithm', r'summary', r'outline', r'statistic', r'table|graph', r'following'], sentence):
            source_type = "extracted"
            source_detail = "nltk search v1"
            value_text = sentence
            value_result = "Yes"
            return (value_text, value_result, source_type, source_detail)
    #if no match found:
    source_type = "extracted"
    source_detail = "nltk search v1"
    value_text = "Not Found"
    value_result = "No"
    return (value_text, value_result, source_type, source_detail)
Source: kaggle.py (project: dl-models-for-qa, author: sujitpal)
def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore")
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples
Source: sent-thoughts-parse.py (project: Deep-Learning-with-Keras, author: PacktPublishing)
def maybe_build_vocab(reuters_dir, vocab_file):
    vocab = collections.defaultdict(int)
    if os.path.exists(vocab_file):
        fvoc = open(vocab_file, "rb")
        for line in fvoc:
            word, idx = line.strip().split("\t")
            vocab[word] = int(idx)
        fvoc.close()
    else:
        counter = collections.Counter()
        num_docs_read = 0
        for doc in stream_reuters_documents(reuters_dir):
            if num_docs_read % 100 == 0:
                print("building vocab from {:d} docs"
                    .format(num_docs_read))
            topics = doc["topics"]
            if len(topics) == 0:
                continue
            title = doc["title"]
            body = doc["body"]
            title_body = ". ".join([title, body]).lower()
            for sent in nltk.sent_tokenize(title_body):
                for word in nltk.word_tokenize(sent):
                    counter[word] += 1
            num_docs_read += 1
        # assign vocabulary ids once, after counting words across all documents
        for i, c in enumerate(counter.most_common(VOCAB_SIZE)):
            vocab[c[0]] = i + 1
        print("vocab built from {:d} docs, complete"
            .format(num_docs_read))
        fvoc = open(vocab_file, "wb")
        for k in vocab.keys():
            fvoc.write("{:s}\t{:d}\n".format(k, vocab[k]))
        fvoc.close()
    return vocab
Source: sent-thoughts-parse.py (project: Deep-Learning-with-Keras, author: PacktPublishing)
def build_numeric_text(vocab, text):
    wids = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            wids.append(vocab[word])
    return ",".join([str(x) for x in wids])


##################### main ######################
Source: new_classification.py (project: linkedin_recommend, author: duggalr2)
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word, to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
Source: edu_regex.py (project: linkedin_recommend, author: duggalr2)
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word, to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            if 'and' == token:
                token = ''
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens if len(t) > 0]
    return stems
Source: helperfunctions.py (project: NLP, author: Deamon5550)
def split_sentences(text):
    """
    Returns a list of the sentences in the text that is passed in.
    """
    return sent_tokenize(text)
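Used on its own (sent_tokenize is assumed to be imported from nltk.tokenize, as the call implies):

from nltk.tokenize import sent_tokenize

print(split_sentences("NLTK ships a pretrained Punkt model. It handles many common abbreviations."))
# ['NLTK ships a pretrained Punkt model.', 'It handles many common abbreviations.']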
Source: preprocess_data.py (project: kaggle_redefining_cancer_treatment, author: jorgemf)
def tokenize_documents(documents):
    for document in documents:
        text = document.text
        tokenized_doc = []
        for sent in nltk.sent_tokenize(text):
            tokenized_doc += nltk.word_tokenize(sent)
        document.text = tokenized_doc
training_data.py 文件源码 项目:vanilla-neural-nets 作者: cavaunpeu 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def _tokenize_corpus_into_list_of_tokenized_sentences(cls, corpus):
        tokenized_corpus = nltk.sent_tokenize(corpus)
        tokenized_corpus = [cls._clean_sentence(sentence) for sentence in tokenized_corpus]
        return [nltk.word_tokenize(sentence) for sentence in tokenized_corpus]
Source: keyword_extractor.py (project: resume-optimizer, author: mhbuehler)
def extract(self, text, max_length=3, metric='avg', incl_scores=False):
        """Extract keywords and keyphrases from input text in descending order of score"""
        sentences = nltk.sent_tokenize(text)
        phrase_list = self._generate_candidate_keywords(sentences, max_length=max_length)
        word_scores = self._calculate_word_scores(phrase_list)
        phrase_scores = self._calculate_phrase_scores(phrase_list, word_scores, metric=metric)
        sorted_phrase_scores = sorted(phrase_scores.iteritems(), key=operator.itemgetter(1), reverse=True)
        n_phrases = len(sorted_phrase_scores)

        if incl_scores:
            return sorted_phrase_scores[0:int(n_phrases/self.top_fraction)]
        else:
            return map(lambda x: x[0], sorted_phrase_scores[0:int(n_phrases/self.top_fraction)])

