def get_list_1(need_tweet_list):
    need_res_set = []
    for i in need_tweet_list:
        for j in i.split():
            if stemmer.stem(j.lower()) not in out_stem_list:
                need_res_set.append(j.lower())
    return list(set(need_res_set))
def get_list_2(need_tweet_list):
    need_res_set = []
    for i in need_tweet_list:
        for j in i.split():
            if stem2.stem(j.lower()) not in lanc_stem_list:
                need_res_set.append(j.lower())
    return list(set(need_res_set))
def get_set_1(need_tweet_list):
    need_res_set = set()
    for i in need_tweet_list:
        for j in i.split():
            if stemmer.stem(j.lower()) not in out_stem_list:
                need_res_set.add(stemmer.stem(j.lower()))
    return need_res_set
def resource_similarity_score_via_exact_word_match_1(need_res_set, offer_tweet_list):
    if len(need_res_set) == 0:
        return 0
    offer_res_set = set()
    for i in offer_tweet_list:
        for j in i.split():
            if j not in out_stem_list:
                offer_res_set.add(stemmer.stem(j.lower()))
    return len(offer_res_set & need_res_set) / len(need_res_set)
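A hedged usage sketch for the four snippets above: they rely on module-level stemmer, stem2, out_stem_list and lanc_stem_list objects that are not shown here, so the setup below is an assumption (NLTK stemmers over stemmed English stopwords), not the original project's code.

# assumed setup, not part of the original source
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.corpus import stopwords

stemmer = PorterStemmer()
stem2 = LancasterStemmer()
out_stem_list = {stemmer.stem(w) for w in stopwords.words('english')}
lanc_stem_list = {stem2.stem(w) for w in stopwords.words('english')}

need = get_set_1(["need drinking water and food urgently"])
offer = ["we can offer bottled water and canned food"]
print(resource_similarity_score_via_exact_word_match_1(need, offer))
# fraction of needed stems that also appear (stemmed) in the offer tweets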
def __eq__(self, other):
    return self.stem == other.stem

def __hash__(self):
    return hash(self.stem)
def __init__(self, stemmer=None):
    '''
    @param stemmer: an object or module with a 'stem' method (defaults to
        NLTK's PorterStemmer)
    @returns: a new L{Stemmer} object
    '''
    if not stemmer:
        from nltk.stem import PorterStemmer
        stemmer = PorterStemmer()
    self.stemmer = stemmer
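A minimal sketch of what the no-argument default above falls back to (NLTK's PorterStemmer; the enclosing class, which the docstring suggests is called Stemmer, is not shown here):

from nltk.stem import PorterStemmer

default = PorterStemmer()
print(default.stem("Cats"))     # -> 'cat' (NLTK's stem() lowercases by default)
print(default.stem("running"))  # -> 'run'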
def rate_tags(self, tags):
    '''
    @param tags: a list of tags to be assigned a rating
    '''
    term_count = collections.Counter(tags)
    for t in tags:
        # rating of a single tag is term frequency * weight
        t.rating = term_count[t] / len(tags) * self.weights.get(t.stem, 1.0)
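A hedged illustration of the rating formula above. The Tag class and the weight table here are minimal stand-ins (assumptions; the original project's Tag carries more fields), reusing the __eq__/__hash__ shown earlier so equal stems share one counter:

import collections

class Tag:
    def __init__(self, stem):
        self.stem = stem
        self.rating = 0.0
    def __eq__(self, other):
        return self.stem == other.stem
    def __hash__(self):
        return hash(self.stem)

tags = [Tag('python'), Tag('python'), Tag('code')]
weights = {'python': 1.5}            # assumed weight table
term_count = collections.Counter(tags)
for t in tags:
    # term frequency * weight, as in rate_tags above
    t.rating = term_count[t] / len(tags) * weights.get(t.stem, 1.0)
print(tags[0].rating)  # 2/3 * 1.5 = 1.0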
def review_to_words(review):
    if isinstance(review, float):
        review = str(review).encode("utf-8")
    letters_only = re.sub(r"\W+", " ", review, flags=re.UNICODE)
    words = letters_only.lower().split()
    # nltk.data.path.append('./nltk_data/')
    # stops = set(nltk.corpus.stopwords.words("portuguese"))
    meaningful_words = words  # [w for w in words if not w in stops]
    # stemmer = RSLPStemmer()
    meaningful_stemmed = meaningful_words  # [stemmer.stem(w) for w in meaningful_words]
    return " ".join(meaningful_stemmed)
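A hedged usage sketch for review_to_words (it assumes re is imported at module level; the stopword and stemming steps are commented out above, so only punctuation removal and lowercasing take effect):

import re

print(review_to_words("Ótimo produto, chegou rápido!!!"))
# -> roughly 'ótimo produto chegou rápido'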
Chapter 05_KNN n Naive Bayes.py (project: Statistics-for-Machine-Learning, author: PacktPublishing)
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
    tokens = [word for sent in nltk.sent_tokenize(text2)
              for word in nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word) >= 3]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    tagged_corpus = pos_tag(tokens)
    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])
    return pre_proc_text
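A hedged usage sketch for preprocessing(). It assumes the module-level imports the snippet relies on (string, nltk, stopwords, PorterStemmer, pos_tag, WordNetLemmatizer) and that the required NLTK data packages (punkt, stopwords, the POS tagger, wordnet) have been downloaded:

import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

print(preprocessing("The cats were running quickly through the gardens!"))
# -> roughly 'cat run quickli garden' (stopwords dropped, stemmed, then lemmatized)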
def clean_terms(terms, stopwords=None, lemmatize=None, stem=None, only_N_J=None):
    if stopwords is not None:
        terms = [t for t in terms if t not in stopwords]
    if only_N_J is not None:  # keep only nouns and adjectives (POS whitelist in the module-level 'tags')
        tagged = nltk.pos_tag(terms)
        terms = [t for t, pos in tagged if pos in tags]
    if lemmatize is not None:
        lem = WordNetLemmatizer()
        terms = [lem.lemmatize(t) for t in terms]
    if stem is not None:
        stem = PorterStemmer()
        terms = [stem.stem(t) for t in terms]
    return terms
def extract_terms_from_file(file_location, stopwords=None, lemmatize=None, stem=None, only_N_J=None):
    with open(file_location, 'r', encoding='iso-8859-1') as doc:
        terms = []
        for line in doc:
            terms.extend(re.compile(r'\w+').findall(line.lower()))
        # terms = re.compile(r'\w+').findall(doc
        #                                    .read()
        #                                    .replace('\n', '')
        #                                    .lower())
    return clean_terms(terms, stopwords, lemmatize, stem, only_N_J)
def extract_terms_from_sentence(sentence, stopwords=None, lemmatize=None, stem=None, only_N_J=None):
    terms = re.compile(r'\w+').findall(sentence.lower())
    return clean_terms(terms, stopwords, lemmatize, stem, only_N_J)
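A hedged usage sketch for the two extractors above (they assume module-level re, nltk, WordNetLemmatizer and PorterStemmer imports, plus a 'tags' POS whitelist when only_N_J is used; the call below leaves only_N_J unset):

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

terms = extract_terms_from_sentence("Dogs were barking loudly in the park",
                                    stopwords=set(stopwords.words('english')),
                                    lemmatize=True, stem=True)
print(terms)  # -> roughly ['dog', 'bark', 'loudli', 'park']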
def addToSentenceScore(question, sentence):
    score = 0
    questionSet = set()
    for item in question.split():
        questionSet.add(morpher.stem(item.replace("?", "")))
    sentenceSet = set()
    for item in sentence.split():
        sentenceSet.add(morpher.stem(item.replace("?", "")))
    jaccard = float(len(questionSet.intersection(sentenceSet))) / float(len(questionSet.union(sentenceSet)))
    common = ' '.join(sentenceSet.intersection(questionSet))
    tagCommon = nltk.pos_tag(nltk.word_tokenize(common))
    if tagCommon:
        for item in tagCommon:
            if 'VB' in item[1]:
                score += 6
            else:
                score += 3
    # Add sentence and score to a hashmap
    sentenceScore[sentence] = score + (jaccard * 10)
    return score
# PARSER TO TOKENIZE, REMOVE STOP WORDS, MORPHOLOGY, ADD TO SET
def parser(line):
    tokLine = nltk.word_tokenize(line)
    keywords = list(set(tokLine) - set(stopwords))
    lineSet = set()
    for item in keywords:
        lineSet.add(morpher.stem(item.replace("?", "")))
    return lineSet
# WORD MATCH
def worMatch(question, sentence):
    score = 0
    questionSet = set()
    for item in question.split():
        questionSet.add(morpher.stem(item.replace("?", "")))
    sentenceSet = set()
    for item in sentence.split():
        sentenceSet.add(morpher.stem(item.replace("?", "")))
    jaccard = float(len(questionSet.intersection(sentenceSet))) / float(len(questionSet.union(sentenceSet)))
    common = ' '.join(sentenceSet & questionSet)
    tagCommon = nltk.pos_tag(nltk.word_tokenize(common))
    if tagCommon:
        for item in tagCommon:
            if 'VB' in item[1]:
                score += 6
            else:
                score += 3
    return score + (jaccard * 10)
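A hedged setup/usage sketch: addToSentenceScore, parser and worMatch above depend on module-level morpher, stopwords and sentenceScore objects that are not shown, and on NLTK's tokenizer and POS tagger data. The bindings below are assumptions, not the original project's code.

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords as nltk_stopwords

morpher = PorterStemmer()                     # assumed stemmer
stopwords = nltk_stopwords.words('english')   # assumed stopword list
sentenceScore = {}                            # assumed sentence -> score map

print(worMatch("Who discovered penicillin?",
               "Penicillin was discovered by Alexander Fleming."))
# higher values mean more stemmed-word overlap; shared verbs are weighted more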
# GET INPUT FILE NAME
def steam_words(self, word):
    ps_obj = PorterStemmer()  # create the Porter stemmer
    steamed_word = ps_obj.stem(word)
    return steamed_word  # return the stemmed word to the main file
# Natural language: displaying sentences.
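A short hedged note on steam_words above: despite the "steam" spelling in its name, it simply applies NLTK's Porter stemmer, e.g.:

from nltk.stem import PorterStemmer

print(PorterStemmer().stem("running"))    # -> 'run'
print(PorterStemmer().stem("happiness"))  # -> 'happi'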
def CleanReVerb(self):
    fin_seed = open('../file/seed_ReVerb.txt', 'r')
    fout_seed = open('../file/seed_ReVerb_clean.txt', 'w+')
    fin_signature = open('../file/signature_ReVerb.txt', 'r')
    fout_signature = open('../file/signature_ReVerb_clean.txt', 'w+')
    while True:
        line = fin_seed.readline()
        if line:
            if '***' in line:
                fout_seed.write(line)
            else:
                mark, line = line.split(':', 1)
                line = self.CleanStopWords(line)  # remove stop words
                # stem each remaining word
                line = line.split()
                word_list = []
                s = nltk.stem.SnowballStemmer('english')
                for w in line:
                    w = s.stem(w)
                    word_list.append(w)
                if len(word_list) > 0:
                    line = ' '.join(word_list)
                    fout_seed.write(mark + ':' + line + '\n')
        else:
            break
    while True:
        line = fin_signature.readline()
        if line:
            if '***' in line:
                fout_signature.write(line)
            else:
                mark, line = line.split(':', 1)
                line = self.CleanStopWords(line)  # remove stop words
                # stem each remaining word
                line = line.split()
                word_list = []
                s = nltk.stem.SnowballStemmer('english')
                for w in line:
                    w = s.stem(w)
                    word_list.append(w)
                if len(word_list) > 0:
                    line = ' '.join(word_list)
                    fout_signature.write(mark + ':' + line + '\n')
        else:
            break
    fin_seed.close()
    fout_seed.close()
    fin_signature.close()
    fout_signature.close()
def __call__(self, tags):
    '''
    @param tags: a list of (preferably stemmed) tags
    @returns: a list of unique (multi)tags sorted by relevance
    '''
    # print tags
    self.rate_tags(tags)
    multitags = self.create_multitags(tags)
    # keep most frequent version of each tag
    clusters = collections.defaultdict(collections.Counter)
    proper = collections.defaultdict(int)
    ratings = collections.defaultdict(float)
    for t in multitags:
        clusters[t][t.string] += 1
        if t.proper:
            proper[t] += 1
            ratings[t] = max(ratings[t], t.rating)
    term_count = collections.Counter(multitags)
    for t, cnt in term_count.iteritems():
        t.string = clusters[t].most_common(1)[0][0]
        proper_freq = proper[t] / cnt
        if proper_freq >= 0.5:
            t.proper = True
            t.rating = ratings[t]
    # purge duplicates, one-character tags and stopwords
    unique_tags = set(t for t in term_count
                      if len(t.string) > 1 and t.rating > 0.0)
    # remove redundant tags
    for t, cnt in term_count.iteritems():
        words = t.stem.split()
        for l in xrange(1, len(words)):
            for i in xrange(len(words) - l + 1):
                s = Tag(' '.join(words[i:i + l]))
                relative_freq = cnt / term_count[s]
                if ((relative_freq == 1.0 and t.proper) or
                        (relative_freq >= 0.5 and t.rating > 0.0)):
                    unique_tags.discard(s)
                else:
                    unique_tags.discard(t)
    # print unique_tags
    return sorted(unique_tags)
def docs_to_networkx(dataset, cats, window_size=2, vocabulary_creation=True):
    ds = './datasets/%s/' % dataset
    Gs = []
    labels = []
    type_ = 2
    vocab_creation = vocabulary_creation
    words = []  # for vocabulary

    for doc in os.listdir(ds):
        if 'train.txt' in doc:
            type_ = 1

    if type_ == 1:
        if os.path.exists("ds/vocab.txt"):
            vocab_creation = False
        with open(ds + '/train.txt', 'r', encoding='iso-8859-1') as doc:
            dc = 1
            for line in doc:
                label = line[0]
                labels.append(label)
                terms = extract_terms_from_sentence(line[1:],
                                                    stopwords=stopwords.words('english'),
                                                    lemmatize=True,
                                                    stem=True,
                                                    only_N_J=True)
                if vocab_creation:
                    words.extend(terms)
                graph = terms_to_graph(terms, window_size)
                G = graph_to_networkx(graph, name=label + '_' + str(dc))
                # G = nx.convert_node_labels_to_integers(G, first_label=1, label_attribute='label')
                nx.set_node_attributes(G, 'label', dict(zip(G.nodes(), G.nodes())))
                Gs.append(G)
                dc += 1
    else:
        if os.path.exists("ds/vocab.txt"):
            vocab_creation = False
        for cat in cats.keys():
            for doc in os.listdir(ds + cat):
                terms = extract_terms_from_file(ds + cat + '/' + doc,
                                                stopwords=stopwords.words('english'),
                                                lemmatize=True,
                                                stem=True,
                                                only_N_J=True)
                if vocab_creation:
                    words.extend(terms)
                graph = terms_to_graph(terms, window_size)
                G = graph_to_networkx(graph, name=cat + doc.split('.')[0])
                # G = nx.convert_node_labels_to_integers(G, first_label=1, label_attribute='label')
                nx.set_node_attributes(G, name='label', values=dict(zip(G.nodes(), G.nodes())))
                Gs.append(G)
                labels.append(cats[cat])

    if vocab_creation:
        vocab = dict(Counter(words))
        create_vocabulary_file(fname, vocab)

    return Gs, labels
# needs fix or discard