Python pos_tag() usage examples (source code)

PreLabelWithPosPreprocessor.py (project: StanfordNER, author: pandahuang)
def combine_pos_tag(self, pos_tag):
        noun = ['NN', 'NNS', 'NNP', 'NNPS']
        adjective = ['JJ', 'JJR', 'JJS']
        adverb = ['RB', 'RBR', 'RBS']
        verb = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
        wh = ['WDT', 'WP', 'WRB']
        if pos_tag in noun:
            return 'NN'
        elif pos_tag in adjective:
            return 'JJ'
        elif pos_tag in adverb:
            return 'RB'
        elif pos_tag in verb:
            return 'VB'
        elif pos_tag in wh:
            return 'WP'
        else:
            return pos_tag
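
For comparison, the same coarse-grained mapping can be expressed as a dict lookup. A minimal standalone sketch (names are illustrative; assumes the NLTK tokenizer and tagger models are downloaded):

import nltk

# Collapse fine-grained Penn Treebank tags into the coarse classes used above.
COARSE = {**{t: 'NN' for t in ('NN', 'NNS', 'NNP', 'NNPS')},
          **{t: 'JJ' for t in ('JJ', 'JJR', 'JJS')},
          **{t: 'RB' for t in ('RB', 'RBR', 'RBS')},
          **{t: 'VB' for t in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')},
          **{t: 'WP' for t in ('WDT', 'WP', 'WRB')}}

tokens = nltk.word_tokenize("The quick brown foxes were running happily")
print([(w, COARSE.get(t, t)) for w, t in nltk.pos_tag(tokens)])
# e.g. [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ('foxes', 'NN'),
#       ('were', 'VB'), ('running', 'VB'), ('happily', 'RB')]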
grammar.py (project: alan, author: camtaylor)
def branch(words):
  """
    This is the initial filter for our input sentence.
    It tokenizes the words and tags them with parts of speech,
    then passes the tagged tokens to one of three functions:
    a sentence is either declarative(), interrogative(), or imperative().

    Args:
      words (String): The words input by the user
    Returns:
      String: response from whichever of the three functions handles that type of sentence.
  """
  parts_of_speech = nltk.pos_tag(nltk.word_tokenize(words))
  # First character of the first token's POS tag: 'W' marks wh-words, 'V' verbs.
  leading_word = parts_of_speech[0][1][0]
  if leading_word == 'W':
    return interrogative(parts_of_speech[1:])
  elif leading_word == "V":
    return imperative(parts_of_speech)
  else:
    return declarative(parts_of_speech)
nltkmgr.py (project: sia-cog, author: deepakkumar1984)
import json

import jsonpickle
import nltk
from nltk.corpus import stopwords


def tokenize(data, language="english", filterStopWords=False, tagging=False):
    result = {}
    tags = []
    filterChars = [",", ".", "?", ";", ":", "'", "!", "@", "#", "$", "%", "&", "*", "(", ")", "+", "{", "}", "[", "]", "\\", "|"]
    sent_token = nltk.tokenize.sent_tokenize(data, language)
    word_token = nltk.tokenize.word_tokenize(data, language)
    word_token = [w for w in word_token if w not in filterChars]
    if filterStopWords is True:
        stop_words = set(stopwords.words(language))
        word_token = [w for w in word_token if w not in stop_words]

    if tagging is True:
        tags = nltk.pos_tag(word_token)

    result = {"sent_token": sent_token, "word_token": word_token, "pos_tag": tags}
    return json.loads(jsonpickle.encode(result, unpicklable=False))
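
A hypothetical call (assumes the NLTK 'punkt' and tagger models are downloaded); note that jsonpickle serializes the (word, tag) tuples as lists:

out = tokenize("NLTK makes tagging simple. It ships with several taggers.",
               filterStopWords=True, tagging=True)
print(out["pos_tag"][:2])  # e.g. [['NLTK', 'NNP'], ['makes', 'VBZ']]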
tmura.py (project: RePhraser, author: MissLummie)
def change_sentence(self):
        text = nltk.tokenize.word_tokenize(self._sentence)
        changed = False
        for cur in nltk.pos_tag(text):
            # Note: 'RPR' is not a Penn Treebank tag; 'PRP' (personal pronoun)
            # is assumed to be the intended tag here.
            if cur[1] in ("NN", "NNP", "PRP"):
                foundedTmura = self.getFromDB(cur[0])
                if foundedTmura is None:
                    foundedTmura = getTmura(cur[0])
                    if foundedTmura != "not found":
                        self.add2DB(cur[0], foundedTmura)
                if foundedTmura != "not found" and not changed:
                    # str.find() returns -1 (truthy) when the substring is
                    # absent, so an explicit containment test is needed.
                    if "OR" in foundedTmura:
                        foundedTmura = foundedTmura.replace('OR', 'or')

                    if randrange(2) == 0:
                        rep = cur[0] + ", " + foundedTmura + ", "
                    else:
                        rep = cur[0] + "(" + foundedTmura + ") "

                    self._sentence = self._sentence.replace(cur[0], rep)
                    changed = True
        return self._sentence
raw_analysis.py (project: AirbnbReviewAnalyzer, author: mrsata)
def analysis(reviews_collection_text):
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        raw_data = f.read()
    comments = raw_data.splitlines(True)
    data = raw_data.replace('\n', ' ')
    data_lower = data.lower()
    tokens_with_punc = word_tokenize(data_lower)
    tokens = RegexpTokenizer(r'\w+').tokenize(data_lower)
    print("--- Most frequent tokens ---\n",
        FreqDist(tokens_with_punc).most_common(15))
    print("--- Tokens without punctuation ---\n",
        FreqDist(tokens).most_common(15))
    stop = set(stopwords.words('english'))
    words = [word for word in tokens if word not in stop]
    print("--- Most frequent words ---\n", FreqDist(words).most_common(15))
    tagged = pos_tag(words)
    nouns = [word for word, pos in tagged if (pos == 'NN')]
    print("--- Most frequent nouns ---\n", FreqDist(nouns).most_common(15))
    adjts = [word for word, pos in tagged if (pos == 'JJ')]
    print("--- Most frequent adjective ---\n", FreqDist(adjts).most_common(15))
    tokns = [RegexpTokenizer(r'\w+').tokenize(comment) for comment in comments]
    lxdst = [lexical_density(token) for token in tokns if len(token) > 0]
    # Note: the denominator counts every comment, including those whose empty
    # token list contributed nothing to the sum.
    avgld = sum(lxdst) / len(comments)
    print("--- Average lexical density ---\n", avgld)
qa.py (project: NLP_question_answering_system_project, author: Roshrini)
def whereRules(sentenceOriginal):
    score = 0
    sentence = sentenceOriginal.lower()

    # RULE 1 (disabled): boost sentences whose named-entity chunks contain a location
    # for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentenceOriginal))):
    #         if type(chunk) is nltk.tree.Tree:
    #             if 'LOCATION' in chunk.label() or 'GPE' in chunk.label():
    #                 score += 10

    # RULE 2
    # Note: `word in sentence` is a substring test, so short entries like 'in'
    # can also match inside longer words.
    for word in LOCPREP:
        if word in sentence:
            score += 4

    # RULE 3
    for word in LOCATION:
        if word in sentence:
            score += 6

    return score
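
LOCPREP and LOCATION are word lists defined elsewhere in qa.py; purely illustrative stand-ins show how the scoring plays out:

# Illustrative stand-ins; the project's real lists are not in this excerpt.
LOCPREP = ['in', 'at', 'near', 'inside']
LOCATION = ['city', 'country', 'river', 'street']

print(whereRules("He lives in a small city near the river."))  # 20 with these lists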

GitCommitBear.py (project: coala-bears, author: coala)
def check_imperative(self, paragraph):
        """
        Check the given sentence/s for Imperatives.

        :param paragraph:
            The input paragraph to be tested.
        :return:
            A tuple of 2 elements (invalid word, part of speech) if the
            first word is not an imperative verb, else ``None``.
        """
        words = nltk.word_tokenize(nltk.sent_tokenize(paragraph)[0])
        # VBZ : Verb, 3rd person singular present, like 'adds', 'writes' etc.
        # VBD : Verb, past tense, like 'added', 'wrote' etc.
        # VBG : Verb, present participle, like 'adding', 'writing'
        # Prepending 'I' gives the tagger a pronoun context so the first real
        # word is read as a verb form; the slice [1:2][0] then retrieves that
        # word together with its tag.
        word, tag = nltk.pos_tag(['I'] + words)[1:2][0]
        if(tag.startswith('VBZ') or
           tag.startswith('VBD') or
           tag.startswith('VBG') or
           word.endswith('ing')):  # Handle special case for VBG
            return (word, tag)
        else:
            return None
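
A hypothetical session, assuming `bear` is an instance of the enclosing class:

print(bear.check_imperative('Adds a new parser feature'))  # e.g. ('Adds', 'VBZ')
print(bear.check_imperative('Add a new parser feature'))   # None: imperative mood, so valid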
build_tildetalk.py (project: pinhook-tilde, author: archangelic)
def word_split(self, sentence):
        words = re.split(self.word_split_pattern, sentence)
        words = [w for w in words if len(w) > 0]
        words = ["::".join(tag) for tag in nltk.pos_tag(words)]
        return words
tildetalk.py and shakespeare.py (project: pinhook-tilde, author: archangelic) contain an identical word_split implementation.
postcards_chuck_norris.py (project: postcards, author: abertschi)
def _find_nouns(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(tokens)
        nouns = [word for word, pos in tagged
                 if pos in ('NN', 'NNP', 'NNS', 'NNPS')]

        filter_keywords = ['chuck', 'norris', 'quot']
        filtered = [i for i in nouns if not any(f in i.lower() for f in filter_keywords)]
        return filtered
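
A hypothetical call, assuming `pc` is an instance of the enclosing class:

print(pc._find_nouns("Chuck Norris counted to infinity twice"))
# e.g. ['infinity'] -- 'Chuck' and 'Norris' are dropped by the keyword filter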
feature_construction.py (project: Automatic-Question-Generation, author: bwanglzu)
def _count_token_with_match(self, answer, match):
        """Count answer match FLAG 
        """
        text = nltk.word_tokenize(answer)
        post = nltk.pos_tag(text)
        count = 0
        for k, v in post:
            if v == match:
                count += 1
        return count
reader.py (project: ask_data_science, author: AngelaVC)
def is_noun(word):
    POS = nltk.pos_tag([word])[0][1]
    return POS.startswith('NN')
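
Single-word tagging gives the tagger no context, so results are only indicative (assumes `import nltk` and the tagger models):

print(is_noun('river'))    # likely True  ('river' -> 'NN')
print(is_noun('quickly'))  # likely False ('quickly' -> 'RB')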
tweet.py (project: SocialNPHS, author: SocialNPHS)
def get_tweet_tags(tweet):
    """ Break up a tweet into individual word parts """
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    # replace handles with real names
    for n, tok in enumerate(tokens):
        if tok.startswith('@'):
            handle = tok.strip("@")
            if handle in user.students:
                # If we have a database entry for the mentioned user, we can
                # easily substitute a full name.
                usr = user.NPUser(handle)
                tokens[n] = usr.fullname
            else:
                # If there is no database entry, we use the user's alias. While
                # this is the full name in many cases, it is often not reliable.
                usr = api.get_user(handle)
                tokens[n] = usr.name
    tagged = nltk.pos_tag(tokens)
    # In nltk, if a teacher's name is written with a period after an
    # abbreviated prefix, it is awkwardly broken up into 3 tags
    for n, tag in enumerate(tagged):
        # If there is the weird period after the prefix,
        if tag[1] == '.':
            # and it is in fact splitting up a person's name,
            if tagged[n - 1][1] == 'NNP' and tagged[n + 1][1] == 'NNP':
                if tagged[n - 1][0] in ['Mr', 'Ms', 'Mrs', 'Mx']:
                    # combine it into the actual name,
                    tagged[n - 1] = ('{}. {}'.format(tagged[n - 1][0],
                                                     tagged[n + 1][0]), 'NNP')
                    # and then remove the extra tags.
                    del tagged[n + 1]
                    del tagged[n]
    return tagged
xmlannotations.py (project: open-sesame, author: Noahs-ARK)
def normalize_tokens(self):
        if len(self.stindices) != len(self.enindices):
            sys.stderr.write("\t\tIssue: overlapping tokenization for multiple tokens\n")
            return
        start = {}
        idx = 0
        for s in sorted(self.stindices):
            self.stindices[s] = idx
            start[idx] = s
            idx += 1
        end = {}
        idx = 0
        for t in sorted(self.enindices):
            self.enindices[t] = idx
            end[idx] = t
            if idx > 0 and end[idx - 1] > start[idx]:
                sys.stderr.write("\t\tIssue: overlapping tokenization of neighboring tokens\n")
                return
            token = self.text[start[idx] : t + 1].strip()
            if " " in token:
                sys.stderr.write("\t\tIssue: incorrect tokenization "  + token + "\n")
                return
            if token == "": continue
            self.tokens.append(token)
            idx += 1
        try:
            self.nltkpostags = [ele[1] for ele in pos_tag(self.tokens)]
            for idx in range(len(self.tokens)):
                tok = self.tokens[idx]
                if self.nltkpostags[idx].startswith("V"):
                    self.nltklemmas.append(lemmatizer.lemmatize(tok, pos='v'))
                else:
                    self.nltklemmas.append(lemmatizer.lemmatize(tok))
        except IndexError:
            print(self.tokens)
            print(pos_tag(self.tokens))
        return True
pos_tagger.py (project: tokenquery, author: ramtinms)
def tag(self, tokens):
        """
            add pos tags to token objects

            :param tokens: list of token objects
            :type tokens: list(Token)
            :return: label augmented list of Token objects
            :rtype: list(Token)
        """
        tags = pos_tag([token.get_text() for token in tokens])
        for token, tag in zip(tokens, tags):
            token.add_a_label('pos', tag[1])
        return tokens
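
The Token class belongs to tokenquery and is not shown here; a minimal stand-in inferred from the two calls above (get_text and add_a_label) could look like this:

class Token:
    # Assumed stand-in; the real tokenquery Token class may differ.
    def __init__(self, text):
        self._text = text
        self._labels = {}

    def get_text(self):
        return self._text

    def add_a_label(self, key, value):
        self._labels[key] = value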
BM25.py (project: QAServer, author: fssqawj)
def pos(text):
    tokens = nltk.word_tokenize(text)
    wordpos = nltk.pos_tag(tokens)
    return wordpos
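
A quick call (assumes `import nltk` and the required models):

print(pos("Time flies like an arrow"))
# e.g. [('Time', 'NNP'), ('flies', 'VBZ'), ('like', 'IN'), ('an', 'DT'), ('arrow', 'NN')]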
page-rank.py (project: Education-Explorer, author: imbiswas)
def __tagPartsOfSpeech(words):
    return [pair[1] for pair in nltk.pos_tag(words)]
test_pos_taggers.py (project: StrepHit, author: Wikidata)
def tag(text, tt_home):
    # Default NLTK's tokenizer
    # TreebankWordTokenizer + PunktSentenceTokenizer
    nltk_start = time()
    tokens = word_tokenize(text)
    # Default NLTK's POS tagger
    # ?
    # Use tagset='universal' for universal tagset
    nltk_tagged = pos_tag(tokens)
    nltk_end = time()
    nltk_execution = nltk_end - nltk_start
    logger.info("NLTK took %f seconds" % nltk_execution)

    # TreeTagger wrapper
    # Tokenization: ?
    # Default language: English
    # English: trained on Penn treebank
    # Default flags: -token -lemma -sgml -quiet -no-unknown
    tt_start = time()
    tt = TreeTagger(TAGDIR=tt_home)
    raw_tags = tt.tag_text(text)
    tt_end = time()
    tt_execution = tt_end - tt_start
    tt_tagged = make_tags(raw_tags)
    logger.info("TreeTagger took %f seconds" % tt_execution)
    return (nltk_tagged, nltk_execution), (tt_tagged, tt_execution)
pos_tag.py (project: StrepHit, author: Wikidata)
def tag_one(self, text, tagset, **kwargs):
        """ POS-tags the given text. """
        # word_tokenize's second positional argument is a language name, so the
        # tagset must be passed to pos_tag itself rather than to the tokenizer.
        return pos_tag(word_tokenize(text), tagset=tagset)
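
pos_tag also accepts a tagset argument; for example, tagset='universal' coarsens Penn Treebank tags into the universal tagset (assumes the NLTK mapping data is installed):

from nltk import pos_tag, word_tokenize

print(pos_tag(word_tokenize("The birds sang sweetly"), tagset='universal'))
# e.g. [('The', 'DET'), ('birds', 'NOUN'), ('sang', 'VERB'), ('sweetly', 'ADV')]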

