Python pos_tag() usage examples (source code)

main.py (project: That-s-Fake, author: rajeevdesai)
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def ne_tagging(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join(token for token, pos in i.leaves()))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            # reset at every entity boundary, not only when a new entity was added
            current_chunk = []
    # flush an entity that ends the text
    if current_chunk:
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)
    return continuous_chunk
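A minimal usage sketch (assuming the NLTK punkt, averaged_perceptron_tagger, maxent_ne_chunker and words resources are already downloaded; the sample sentence and the exact output are illustrative only):

text = "Barack Obama visited the United Nations in New York."
print(ne_tagging(text))
# e.g. ['Barack Obama', 'United Nations', 'New York']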
tasks.py (project: QProb, author: quant-trade)
def keyword_extractor(data):
    try:
        #np_extractor = NPExtractor(words_wo_stopwords(strip_tags(data)))
        #result = np_extractor.extract()
        text = words_wo_stopwords(strip_tags(data))

        #TODO this is duplicated job, should be improved
        words = word_tokenize(strip_tags(text))
        tagged = pos_tag(words)
        cleaned = filter_insignificant(tagged)
        text = " ".join(cleaned)
        wc = WordCloudMod().generate(text)
        result = list(wc.keys())[:10]
    except Exception as err:
        print(colored.red("At keywords extraction {}".format(err)))
        result = []

    return result


# TODO definitely can be better if we knew where content is
feature_construction.py (project: Automatic-Question-Generation, author: bwanglzu)
def _identify_pronoun(self, answer):
        """Calculate percentage of pronouns within answer
        - Args:
            answer(str): answer text
        - Returns:
            percentage(float): ratio of pronouns in answer
        """
        text = nltk.word_tokenize(answer)
        post = nltk.pos_tag(text)
        pronoun_list = ['PRP', 'PRP$', 'WP', 'WP$']
        # init variables
        num_pronouns = 0
        num_terms = len(post)
        percentage = 0
        for k, v in post:
            if v in pronoun_list:
                num_pronouns += 1
        if num_terms:
            percentage = float(num_pronouns) / num_terms
        return percentage
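For reference, a quick standalone check of which tags NLTK assigns to pronouns (a sketch; the sentence is made up and the exact tags depend on the tagger model):

import nltk
tokens = nltk.word_tokenize("She gave him her book because it was his favorite.")
pronoun_tags = [tag for _, tag in nltk.pos_tag(tokens) if tag in ('PRP', 'PRP$', 'WP', 'WP$')]
print(len(pronoun_tags), len(tokens))  # e.g. 5 pronouns out of 11 tokens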
feature_construction.py (project: Automatic-Question-Generation, author: bwanglzu)
def _identify_pronoun2(self, sentence):
        """Calculate percentage of pronouns in the sentence that are in the answer
        - Args:
            sentence(str): question sentence 
        - Returns:
            pronoun_in_sentence(list): pronouns in sentence 
            sentence_len(int): length of current sentence 
        """
        text = nltk.word_tokenize(sentence)
        post = nltk.pos_tag(text)
        pronoun_list = ['PRP', 'PRP$', 'WP', 'WP$']
        pronoun_in_sentence = []
        sentence_len = len(post)
        for k, v in post:
            if v in pronoun_list:
                pronoun_in_sentence.append(k)
        return pronoun_in_sentence, sentence_len
feature_construction.py (project: Automatic-Question-Generation, author: bwanglzu)
def _first_tagger_after_answer_span(self, question):
        """Get the first tagger after answer span
        - Args:
            question(string): string of current question 
        - Returns:
            tagger(string): tagger of first term after span
        """
        index = 0
        text = nltk.word_tokenize(question)
        post = nltk.pos_tag(text)
        for idx, t in enumerate(post):
            if t[0] == '_____':
                index = idx + 1
                break
        try:
            return post[index][1]
        except IndexError:
            return 'dummy'
feature_construction.py (project: Automatic-Question-Generation, author: bwanglzu)
def _first_tagger_before_answer_span(self, question):
        """Get the first tagger before answer span
        - Args:
            question(string): string of current question 
        - Returns:
            tagger(string): tagger of first term before span
        """
        index = 0
        text = nltk.word_tokenize(question)
        post = nltk.pos_tag(text)
        for idx, t in enumerate(post):
            if t[0] == "_____":
                index = idx - 1
                break
        try:
            return post[index][1]
        except IndexError:
            return 'dummy'
algorithm.py (project: wntf, author: tonybaloney)
def tag(self, lines):
        '''
        Tokenize and categorise the words in the collection of
        text

        :param lines: The list of strings with the text to match
        :type  lines: ``list`` of ``str``

        :rtype: ``list`` of ``tuple``
        :return: The (token, POS tag) pairs for the tokenized text
        '''
        try:
            tokenized_words = nltk.word_tokenize(lines)
            return nltk.pos_tag(tokenized_words)
        except LookupError as le:
            print("Run install_words.py first")
            raise le
phrase-extraction.py (project: PyRATA, author: nicolashernandez)
def brown_data():
  """return the text_length first tokens of the brown corpus tagged in pyrata format"""
  tokens = brown.words()
  tokens = tokens[:text_length]

  pos_tags = nltk.pos_tag(tokens)

  return [{'raw':w, 'pos':p} for (w, p) in pos_tags]
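A sketch of the context this snippet assumes (the brown import and text_length are defined elsewhere in the original file; the value below is made up):

import nltk
from nltk.corpus import brown

text_length = 1000  # hypothetical; the original sets this elsewhere

data = brown_data()
print(data[0])  # e.g. {'raw': 'The', 'pos': 'DT'}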


# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# TEST 
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""


# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
pos_tag.py (project: StrepHit, author: Wikidata)
def tag_one(self, text, skip_unknown=True, **kwargs):
        """ POS-Tags the given text, optionally skipping unknown lemmas

            :param unicode text: Text to be tagged
            :param bool skip_unknown: Automatically remove unrecognized tags from the result

            Sample usage:

            >>> from strephit.commons.pos_tag import TTPosTagger
            >>> from pprint import pprint
            >>> pprint(TTPosTagger('en').tag_one(u'sample sentence to be tagged fycgvkuhbj'))
            [Tag(word=u'sample', pos=u'NN', lemma=u'sample'),
             Tag(word=u'sentence', pos=u'NN', lemma=u'sentence'),
             Tag(word=u'to', pos=u'TO', lemma=u'to'),
             Tag(word=u'be', pos=u'VB', lemma=u'be'),
             Tag(word=u'tagged', pos=u'VVN', lemma=u'tag')]
        """
        return self._postprocess_tags(make_tags(self.tagger.tag_text(text, **kwargs)),
                                      skip_unknown)
translate.py (project: DogeGen, author: MemeTrash)
def _get_base_doge_words(self, eng_text):
        """
        Get all base words from text to make doge phrases from.
        eg. 'Hello there, I am happy' -> ['hello', 'be', 'happy']

        Args:
            eng_text (str): Text to get words from.

        Returns:
            list[str]: List of lower case words to use from text.
        """
        phrase_no_punct = "".join([ch for ch in eng_text if ch not in string.punctuation])
        tagged_words = nltk.pos_tag([w.lower() for w in phrase_no_punct.split(' ') if w.isalpha()])
        chosen_words = []
        for word, tag in tagged_words:
            if tag[0] in ['N', 'V', 'J']:
                # make noun singular
                if tag[0] == 'N':
                    word = self._lemmatizer.lemmatize(word, pos='n')
                # make verb infinitive
                elif tag[0] == 'V':
                    word = self._lemmatizer.lemmatize(word, pos='v')
                chosen_words.append(word.encode('ascii', 'ignore'))  # lemmatize makes word unicode
        return list(set(chosen_words))
translate.py (project: DogeGen, author: MemeTrash)
def _get_doge_descriptors(self, word_ls):
        """
        Get descriptors for a set of doge words.
        eg. ['person', 'run'] -> ['much', 'very']

        Args:
            word_ls (list[str]): List of words to use.

        Returns:
            list[str]: List of doge descriptors, eg. 'much', 'very', in order.
        """
        tagged_words = nltk.pos_tag(word_ls)
        chosen_descriptors = []
        for word, tag in tagged_words:
            possible_descs = [MUCH, MANY, SUCH, SO, VERY]
            if tag[0] == 'J':
                possible_descs.remove(VERY)
                possible_descs.remove(SO)
            if len(chosen_descriptors) >= 2:
                allowed_descriptors = [s for s in possible_descs if s not in chosen_descriptors[-2:]]
            else:
                allowed_descriptors = [s for s in possible_descs if s not in chosen_descriptors]
            chosen_descriptors.append(random.choice(allowed_descriptors))
        return chosen_descriptors
keyphrase.py (project: minke, author: DistrictDataLabs)
def extract_candidate_words(sents, tags=GOODTAGS, tagged=False, **kwargs):
    """
    Extracts key words based on a list of good part of speech tags.
    If the sentences are already tokenized and tagged, pass in: tagged=True
    """
    normalizer = Normalizer(**kwargs)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.wordpunct_tokenize(sent))

        # Identify only good words by their tag
        for token, tag in sent:
            if tag in tags:
                for token in normalizer.normalize([token]):
                    yield token
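GOODTAGS and Normalizer come from the surrounding minke module; a self-contained approximation of the same idea, keeping only adjective- and noun-tagged tokens, might look like this (a sketch, not the project's actual definitions):

import nltk

GOODTAGS_SKETCH = frozenset(['JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS'])

def candidate_words_sketch(sents, tags=GOODTAGS_SKETCH):
    # Tag each raw sentence and keep lowercased tokens whose POS is in tags
    for sent in sents:
        for token, tag in nltk.pos_tag(nltk.wordpunct_tokenize(sent)):
            if tag in tags:
                yield token.lower()

print(list(candidate_words_sketch(["The quick brown fox jumps over the lazy dog."])))
# e.g. ['quick', 'brown', 'fox', 'lazy', 'dog']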


##########################################################################
## Key phrase by text scoring mechanisms
##########################################################################
normalize.py (project: minke, author: DistrictDataLabs)
def normalize(self, words):
        """
        Normalizes a list of words.
        """
        # Add part of speech tags to the words
        words = nltk.pos_tag(words)

        for word, tag in words:
            if self.lower: word = word.lower()
            if self.strip: word = word.strip()

            if word not in self.stopwords:
                if not all(c in self.punct for c in word):
                    if self.lemmatize:
                        # WordNetLemmatizer expects a WordNet POS, not a Penn Treebank tag
                        wn_tag = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}.get(tag[0], 'n')
                        word = self.lemmatizer.lemmatize(word, wn_tag)

                    yield word
lanprosForVCF.py (project: PhenVar, author: NCBI-Hackathons)
def tagged_abstracts(RS_pmids_tokenizedabstracts_dict):
    """ Takes a dict of tokenized abstracts
    and POS-tags them using the NLTK module.
    Input dictionary: key is the RS ID, value is a dictionary where key is the pmid and value is a list of tokens"""
    RS_pmids_taggedabstracts_dict = {}
    for each_RS in RS_pmids_tokenizedabstracts_dict:
        pmids_taggedabstracts = {}
        pmids_tokenizedabstracts = RS_pmids_tokenizedabstracts_dict[each_RS]
        for pmid in pmids_tokenizedabstracts:
            taggedabstracts_list = []
            for token in pmids_tokenizedabstracts[pmid]:
                # pos_tag expects a list of tokens, not a bare string
                tagged = nltk.pos_tag([token])[0]
                taggedabstracts_list.append(tagged)
            pmids_taggedabstracts[pmid] = taggedabstracts_list
        RS_pmids_taggedabstracts_dict[each_RS] = pmids_taggedabstracts
    return RS_pmids_taggedabstracts_dict
dependencygraph.py (project: one-day-with-cling, author: mariana-scorp)
def from_sentence(sent):
        tokens = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(tokens)

        dg = DependencyGraph()
        for (index, (word, tag)) in enumerate(tagged):
            dg.nodes[index + 1] = {
                'word': word,
                'lemma': '_',
                'ctag': tag,
                'tag': tag,
                'feats': '_',
                'rel': '_',
                'deps': defaultdict(),
                'head': '_',
                'address': index + 1,
            }
        dg.connect_graph()

        return dg
lemmatizer.py (project: adaware-nlp, author: mhw32)
def prepare_sentence(words,
                     vectorizer=None,
                     lemmatizer=None,
                     max_words=78,
                     return_output=True):
    X = np.ones((max_words, 300))*ZERO_EPSILON
    if return_output:
        y = np.ones((max_words, 300))*ZERO_EPSILON
        raw_pos = [p[1] for p in pos_tag(words)]
        pos     = [str(treebank_to_simple(p, default=wordnet.NOUN)) for p in raw_pos]
        lemmas  = [str(lemmatizer(w, pos=p)) for (w,p) in zip(words, pos)]

    num_words = len(words) if len(words) <= max_words else max_words

    for word_i in range(num_words):
        word_vector = vectorizer(words[word_i])
        X[word_i, :] = word_vector

        if return_output:
            lemma_vector = lemmas[word_i]
            y[word_i, :] = vectorizer(lemma_vector)

    if return_output:
        return X, y
    return X
candidates.py (project: atap, author: foxbook)
def extract_candidate_phrases(sents, grammar=GRAMMAR, tagged=False):

    # Create the chunker that uses our grammar
    chunker = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.word_tokenize(sent))

        # Parse the sentence, converting the parse tree into a tagged sequence
        sent = normalize(sent)
        if not sent: continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract phrases and rejoin them with space
        phrases = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'
            ) if key
        ]

        for phrase in phrases:
            yield phrase
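GRAMMAR, normalize, RegexpParser and tree2conlltags come from the surrounding module and from nltk.chunk; a plausible noun-phrase grammar for this kind of keyphrase chunker (an assumption, not necessarily the one the project uses) would be:

GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'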
shakespeare.py (project: PoemGenerator, author: eugenet12)
def get_counts():
    global unigrams
    global bigrams
    global sentences

    for i in xrange(1, NUM_FILES+1):
        if i in SKIP:
            continue
        with open("Shakespeare_parsed/%03d" % i) as f:
            for line in f:
                tokens = get_tokens(line)
                tokens = [t.lower() for t in tokens]
                if len(tokens) == 0:
                    continue
                tags = nltk.pos_tag(tokens)
                sentences.append(tokens)
                prev_word = ""
                for token in tokens:
                    unigrams[token] += 1
                    if not prev_word == "":
                        bigrams[(prev_word,token)] += 1
                    prev_word = token

    top10_uni = unigrams.most_common()[:10]
    top10_bi = bigrams.most_common()[:10]
contexts.py (project: KDDCUP2016, author: hugochan)
def tag_contexts(doc_id):

    global tags
    if not tags :
        tags = nltk.data.load("help/tagsets/upenn_tagset.pickle")

    words = defaultdict(Counter)
    count = Counter()
    for context in get_contexts(doc_id) :
        for word, tag in nltk.pos_tag(tokenize(context)) :
            words[tag].update([word])

            count.update([tag])


    tag_common_words = {tag : ' '.join(zip(*tag_words.most_common(10))[0]) for tag, tag_words in words.items() }

    for tag, freq in count.most_common(15) :
        print "%4d\t%45s\t%s" % (freq, tags[tag][0], tag_common_words[tag])
reddit_NN_entities.py (project: Hanhan_Play_With_Social_Media, author: hanhanwu)
def get_NN_entities(post):
    sentences = nltk.tokenize.sent_tokenize(post)
    token_sets = [nltk.tokenize.word_tokenize(s) for s in sentences]
    pos_tagged_token_sets = [nltk.pos_tag(t) for t in token_sets]
    pos_tagged_tokens = [t for v in pos_tagged_token_sets for t in v]

    all_entities = []
    previous_pos = None
    current_entities = []
    for (entity, pos) in pos_tagged_tokens:
        if previous_pos == pos and pos.startswith('NN'):
            current_entities.append(entity.lower())
        elif pos.startswith('NN'):
            if current_entities != []:
                all_entities.append(' '.join(current_entities))
            current_entities = [entity.lower()]
        previous_pos = pos
    # flush an entity that ends the post
    if current_entities:
        all_entities.append(' '.join(current_entities))
    return all_entities
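A minimal usage sketch (the post text is made up; the exact grouping depends on the tagger's output):

post = "The kitchen sink overflowed and the garden hose leaked."
print(get_NN_entities(post))
# e.g. ['kitchen sink', 'garden hose']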
NLTKPreprocessor.py (project: ai-chatbot-framework, author: alfredfrancis)
def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If stopword, ignore token and continue
                # if token in self.stopwords:
                #     continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma
pre_process.py (project: jenova, author: dungba88)
def clean_text(raw_text, filtered_word_types):
    """Clean raw text for bag-of-words model"""
    # Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text)

    # Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # stem words
    stemmer = PorterStemmer()
    stemmed_words = list(map(stemmer.stem, words))

    # Remove stop words if requested
    if filtered_word_types is not None:
        tagged_text = nltk.pos_tag(stemmed_words)
        stemmed_words = [w for w, wtype in tagged_text if wtype not in filtered_word_types]

    # join together
    return " ".join(stemmed_words)
data_helpers.py (project: Question-Answering-NNs, author: nbogdan)
def get_lemmas(sent, lemmatizer):
    stop_words = []
    res = []
    for word in sent:
        pos = get_wordnet_pos(nltk.pos_tag([word])[0][1])
        if pos == '':
            lemma = lemmatizer.lemmatize(word)
        else:
            lemma = lemmatizer.lemmatize(word, pos)
        #if(type(lemma) == unicode):
        #    lemma = lemma.encode('ascii', 'ignore')

        if lemma.isdigit():
            res.append('number')
        else:
            res.append(lemma)
    return res
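get_wordnet_pos is defined elsewhere in the project; a common implementation of such a Penn-to-WordNet mapping (an assumption, not necessarily the original one) is:

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # Map a Penn Treebank tag to the WordNet POS constant expected by the lemmatizer
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    return ''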
qstn_classifier_trainer.py (project: chitti, author: bhuvi8)
def pos_tag_questions(qstn_list):
    res = []
    count = 0 
    for i in qstn_list:
        r = []
        i = i.split(':')
        r.append(i[0])
        r.append(i[1].split()[0])
        i = i[1].split()
        del i[0]
        sent = nltk.word_tokenize(' '.join(i))
        r.append(nltk.pos_tag(sent))
        res.append(tuple(r))
        count += 1
        if (count % 100) == 0:
            print ("processed : " + str(count) )
    return res

# experiment with different features to get better accuracy
# also don't forget to include the same feature extractor in process_grammar.py
__init__.py (project: rss_skill, author: forslund)
def __init__(self):
        super(RssSkill, self).__init__('RssSkill')
        self._is_reading_headlines = False
        self.feeds = {}
        self.cached_items = {}
        self.cache_time = {}
        try:
            # quick check that the POS tagger model is available
            pos_tag(['advance'])
        except LookupError:
            logger.debug('Tagger not installed... Trying to download')
            dler = Downloader()
            if not dler.download('averaged_perceptron_tagger'):
                logger.debug('Trying alternative source...')
                dler = Downloader(ALT_NLTK_DATA)
                dler.download('averaged_perceptron_tagger',
                              raise_on_error=True)
generate_stem_pos_tag.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def pos_tag_text(line,
                 token_pattern=token_pattern,
                 exclude_stopword=stopwords,
                 encode_digit=False):
    token_pattern = re.compile(token_pattern, flags = re.UNICODE | re.LOCALE)
    for name in ["question1", "question2"]:
        l = line[name]
        ## tokenize
        tokens = [x.lower() for x in token_pattern.findall(l)]
        ## stem
        #tokens=l.lower().split()
        #print tokens
        tokens = stem_tokens(tokens, english_stemmer)
        line[name+'_stem']=' '.join(tokens)
        #print tokens
        if exclude_stopword:
            tokens = [x for x in tokens if x not in stopwords]
        tags = pos_tag(tokens)
        tags_list = [t for w,t in tags]
        tags_str = " ".join(tags_list)
        #print tags_str
        line[name+'_pos_tag'] = tags_str
    return line[[ u'question1_stem', u'question1_pos_tag', u'question2_stem',
       u'question2_pos_tag']]
generate_neighbor_pos.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def get_pos_tag(qind):
    q = index_q[qind]
    wl = str(q).lower().split()
    pos_l = nltk.pos_tag(wl)
    q1_pos = []
    for pos in pos_l:
        q1_pos.append(pos[1])
    return q1_pos

# def get_ner_tag(qind):
#     q = index_q[qind]
#     wl = str(q).lower().split()
#     ner_l = nltk.ne_chunk(wl)
#     q1_ner = []
#     for pos in ner_l:
#         q1_ner.append(pos[1])
#     return q1_ner
generate_ngram_pos_link.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def getPOSLinks(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    text = nltk.word_tokenize(text)
    pos = nltk.pos_tag(text)
    links = []
    link = []
    active = False
    for w in pos:
        part = w[1]
        word = w[0]
        if(not active and (part[:2] == "DT" or part == "WP" or part == "VB" or part == "IN")):
            active = True
        if(active):
            link.append(wordnet_lemmatizer.lemmatize(word))
        #extract main body
        if(active and (part == "PRP" or part[:2] == "NN" or part == "." )):
            active = False
            links.append(" ".join(link))
            link = []
    return links
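A quick usage sketch (the question is illustrative; the extracted links depend on the tagger and lemmatizer output):

print(getPOSLinks("What is the capital of France?"))
# e.g. ['What is the capital', 'of France']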
tag.py (project: twitter-trends-summarizer, author: yuva29)
def tag(path, filename):
    print("Tagging "+path)
    WRITE_HANDLER = open(PREPROCESSED_DATA + filename.strip() + "_features", 'w')
    for line in open(path, 'r'):    
        tokens = line.split()
        if(len(tokens) == 0):
            continue
        tags = pos_tag(tokens) # tag

        features = list()
        for token in tags:
            tok = token[0]
            tag = token[1]
            if tok.lower() not in stop_words:
                features.append(tok+":"+tag)                
        if(len(features)>0):
            WRITE_HANDLER.write(str(features)+'\n\n')
        else: ## EMPTY lines
            WRITE_HANDLER.write('\n\n')
answer_engine.py (project: Question-Answering-System, author: AdityaAS)
def _analyze_query(self):
        tagged = nltk.pos_tag(self.ir_query)
        ir_query_tagged = []
        for word, pos in tagged:
            # map the Penn Treebank tag to a WordNet POS; None if nothing matches
            pos = {
                pos.startswith('N'): wordnet.NOUN,
                pos.startswith('V'): wordnet.VERB,
                pos.startswith('J'): wordnet.ADJ,
                pos.startswith('R'): wordnet.ADV,
                }.get(True, None)
            if pos:
                synsets = wordnet.synsets(word, pos=pos)
            else:
                synsets = wordnet.synsets(word)
            ir_query_tagged.append((word, synsets))

        # Add additional special hidden term
        ir_query_tagged.append(('cause', [wordnet.synset('cause.v.01')]))
        self.ir_query_tagged = ir_query_tagged

