Python examples: source code using the WordNetLemmatizer() class

pytc.py — project: coarse-fine_emotion_classification, author: hehuihui1994
from nltk.stem import WordNetLemmatizer


def word_lemma(doc_unis_list):
    # Lemmatize every unigram of every document. Despite the "stem" naming,
    # this applies WordNet lemmatization (with the default noun POS).
    wnl = WordNetLemmatizer()
    doc_stems_list = []
    for doc_unis in doc_unis_list:
        doc_stems = []
        for uni in doc_unis:
            stem_uni = wnl.lemmatize(uni)
            doc_stems.append(stem_uni)
        doc_stems_list.append(doc_stems)
    return doc_stems_list
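A minimal usage sketch (not part of pytc.py), assuming the WordNet data has been fetched via nltk.download('wordnet'):

docs = [['cats', 'were', 'running'], ['better', 'mice']]
print(word_lemma(docs))
# [['cat', 'were', 'running'], ['better', 'mouse']] -- the default noun POS
# leaves verb forms such as 'running' unchanged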


########## Text Statistic Functions ##########
PreSignature.py — project: PPRE, author: MaoYuwei
# assumed module-level imports for this method: re, nltk, and
# from nltk.stem import WordNetLemmatizer
def POStagging(self):
        # POS-tag each signature line, substitute the two entity placeholders,
        # and keep only lemmatized verbs and common nouns.
        fin = open('../file/entity_signature.txt', 'r')
        fout = open('../file/pos_signature.txt', 'w+')
        lemmatizer = WordNetLemmatizer()
        j = 0  # lines in the current block that kept at least one verb/noun
        num = 0
        while True:
            line = fin.readline()
            if line:
                if '***' in line:
                    #print j, num
                    fout.write(line)
                    pro_num, pro = line.split('.')
                    pro, num = pro.split()
                    pro1, pro2 = pro.split('***')
                    j = 0  # reset the per-block counter
                elif '------' in line:
                    fout.write(line)
                else:
                    # split text into tokens
                    num, line = line.split(':', 1)
                    fout.write(num + ':')
                    text_tokens = nltk.word_tokenize(line)

                    t = 0
                    # tag the sentence, using the default NLTK English tagger
                    # POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
                    sentence_tag = nltk.pos_tag(text_tokens)
                    for i in range(len(sentence_tag)):
                        word = sentence_tag[i][0]
                        tag = sentence_tag[i][1]
                        if word == 'Entity1':
                            fout.write('#' + pro1 + '# ')
                        elif word == 'Entity2':
                            fout.write('#' + pro2 + '# ')
                        else:
                            # keep verbs and nouns, but skip proper nouns (NNP)
                            if (re.match('(V|N)', tag)) and (not re.match('(NNP)', tag)):
                            #if re.match('(V|N)', tag):
                            #if re.match('V', tag):
                                word = lemmatizer.lemmatize(word)
                                t = t + 1
                                fout.write(word + ' ')
                    fout.write('\n')
                    if t > 0:
                        j = j + 1
            else:
                break
        fin.close()
        fout.close()
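One quirk worth noting: the loop above calls lemmatize() with the default noun POS, so inflected verbs pass through unchanged. A minimal sketch (not part of PreSignature.py) that maps the Penn Treebank tag to a WordNet POS first:

import re
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
for word, tag in nltk.pos_tag(nltk.word_tokenize('The cells were dividing rapidly')):
    # same filter as above: verbs and nouns, excluding proper nouns
    if re.match('(V|N)', tag) and not re.match('(NNP)', tag):
        wn_pos = wordnet.VERB if tag.startswith('V') else wordnet.NOUN
        print(lemmatizer.lemmatize(word, pos=wn_pos))
# cells -> cell, were -> be, dividing -> divide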
lemmatizer_3.py — project: adaware-nlp, author: mhw32
import numpy as np
from gensim import models
from nltk.stem import WordNetLemmatizer
# ZERO_EPSILON, prepare_sentence, and util are defined elsewhere in the project.


def gen_dataset(sentences,
                max_words=78,
                train_test_split=True):
    ''' Generate a dataset of (input, output) pairs where the
        input is an embedded vector and the output is the embedded
        vector of its lemmatized form.

        Args
        ----
        sentences : list
                    list of sentences where each sentence is list of tokens
        max_words : integer
                    maximum number of words allowed in sentence
        train_test_split : boolean
                           whether to split data into 2 sets
    '''

    num_sentences = len(sentences)
    model = models.Word2Vec.load_word2vec_format(
        '../storage/pos_tagger/GoogleNews-vectors-negative300.bin',
        binary=True)
    # fall back to a near-zero vector for out-of-vocabulary tokens
    vectorizer = lambda x: model[x] if x in model else np.ones(300)*ZERO_EPSILON
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatizer = lambda x: wordnet_lemmatizer.lemmatize(x)

    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, 300))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)

    param_dict = {}
    param_dict['max_words'] = max_words

    for sent_i in I:
        words = sentences[sent_i]

        if sent_i % 1000 == 0:
            print("{} sentences parsed. {} remaining.".format(
                sent_i, num_sentences - sent_i - 1))

        X[sent_i, :, :], y[sent_i, :, :] = \
            prepare_sentence(words, vectorizer=vectorizer,
                                    lemmatizer=lemmatizer,
                                    max_words=max_words)

        K[sent_i] = len(words)  # keep track of num words in sentence

    if train_test_split:
        (X_train, X_test), (I_train, I_test) = util.split_data(
            X, out_data=I, frac=0.80)
        y_train, y_test = y[I_train], y[I_test]
        K_train, K_test = K[I_train], K[I_test]

        return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
    return (X, y, K), param_dict
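prepare_sentence is a project helper not shown on this page. A plausible minimal version, assuming it returns a pair of (max_words, 300) arrays embedding each token and its lemma (the real adaware-nlp implementation may differ):

import numpy as np

def prepare_sentence(words, vectorizer, lemmatizer, max_words=78):
    # hypothetical sketch; pads/truncates the sentence to max_words rows
    X = np.zeros((max_words, 300))
    y = np.zeros((max_words, 300))
    for i, word in enumerate(words[:max_words]):
        X[i, :] = vectorizer(word)              # embedding of the raw token
        y[i, :] = vectorizer(lemmatizer(word))  # embedding of its lemma
    return X, y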
lemmatizer.py — project: adaware-nlp, author: mhw32
import numpy as np
from gensim import models
from nltk.stem import WordNetLemmatizer
# ZERO_EPSILON, prepare_sentence, and split_data are defined elsewhere in the project.


def gen_dataset(sentences,
                max_words=78,
                train_test_split=True):
    ''' Generate a dataset of (input, output) pairs where the
        input is an embedded vector and output is
        an embedded vector for the lemmatized form.

        Args
        ----
        sentences : list
                    list of sentences where each sentence is list of tokens
        max_words : integer
                    maximum number of words allowed in sentence
        train_test_split : boolean
                           whether to split data into 2 sets
    '''

    num_sentences = len(sentences)
    model = models.Word2Vec.load_word2vec_format(
        '../storage/pos_tagger/GoogleNews-vectors-negative300.bin',
        binary=True)
    # fall back to a near-zero vector for out-of-vocabulary tokens
    vectorizer = lambda x: model[x] if x in model else np.ones(300)*ZERO_EPSILON
    lemmatizer = WordNetLemmatizer().lemmatize

    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, 300))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)

    param_dict = {}
    param_dict['max_words'] = max_words

    for sent_i, words in enumerate(sentences):
        if sent_i % 1000 == 0:
            print("{} sentences parsed. {} remaining.".format(
                sent_i, num_sentences - sent_i - 1))

        X[sent_i, :, :], y[sent_i, :, :] = \
            prepare_sentence(words,
                             vectorizer=vectorizer,
                             lemmatizer=lemmatizer,
                             max_words=max_words)

        K[sent_i] = len(words)  # keep track of num words in sentence

    if train_test_split:
        (X_train, X_test), (I_train, I_test) = split_data(
            X, out_data=I, frac=0.80)
        y_train, y_test = y[I_train], y[I_test]
        K_train, K_test = K[I_train], K[I_test]

        return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
    return (X, y, K), param_dict
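A minimal usage sketch, assuming the GoogleNews vectors exist at the path hard-coded above and that the input sentences are already tokenized (note that newer gensim releases moved this loader to KeyedVectors.load_word2vec_format):

sentences = [['dogs', 'were', 'barking'],
             ['she', 'wrote', 'letters'],
             ['the', 'mice', 'ran']]
(X_tr, X_te), (y_tr, y_te), (K_tr, K_te), params = gen_dataset(sentences)
print(X_tr.shape)   # (num_train_sentences, 78, 300); use a larger corpus in practice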

