def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ',
                                           text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and token not in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception:
                    w = token
            else:
                w = token
            words.append(w)
    return words
# return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
# re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
# not token.isdigit() and not token in stop_words]
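A minimal usage sketch for tiny_tokenize above, assuming the imports the snippet relies on; the input is passed as a UTF-8 byte string because the function calls text.decode():
# Hypothetical usage sketch; these imports are assumed by the snippet above.
import re
import string
from nltk.tokenize import wordpunct_tokenize
from nltk.stem.snowball import EnglishStemmer

raw = b"Stemming reduces words such as running and runs to a common root."
print(tiny_tokenize(raw, stem=True, stop_words=['a', 'and', 'to']))
# punctuation and digits are stripped, stop words are dropped, remaining tokens are stemmed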
Python PorterStemmer() class usage examples (source code)
def select_top_words(word_list, n=10):
    """Filter out cluster term names."""
    import re
    from nltk.stem.porter import PorterStemmer
    from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
    st = PorterStemmer()
    out_st = []
    out = []
    for word in word_list:
        word_st = st.stem(word)
        if (len(word_st) <= 2 or
                re.match(r'\d+', word_st) or
                re.match(r'[^a-zA-Z0-9]', word_st) or
                word in COMMON_FIRST_NAMES or
                word in CUSTOM_STOP_WORDS or
                word in ENGLISH_STOP_WORDS or
                word_st in out_st):  # ignore stemming duplicates
            continue
        out_st.append(word_st)
        out.append(word)
        if len(out) >= n:
            break
    return out
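A hypothetical usage sketch for select_top_words. COMMON_FIRST_NAMES and CUSTOM_STOP_WORDS are module-level collections in the original project, so placeholder values are supplied here; note that on recent scikit-learn versions ENGLISH_STOP_WORDS is imported from sklearn.feature_extraction.text instead.
# Placeholder globals; the original project defines its own lists.
COMMON_FIRST_NAMES = {'john', 'mary'}
CUSTOM_STOP_WORDS = {'etc'}

terms = ['running', 'runs', 'john', 'the', '42', 'clustering', 'clusters']
print(select_top_words(terms, n=5))
# -> ['running', 'clustering']: 'runs' and 'clusters' are dropped as stemming
# duplicates; 'john', 'the' and '42' are filtered out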
def porter(inputpath=None, text=None):
    """
    Apply the Porter stemmer either to a single text string or to every
    comma-separated token in each file under inputpath.
    """
    data = ''
    p = PorterStemmer()
    if inputpath:
        filenames = [os.path.join(inputpath, file) for file in os.listdir(inputpath)]
        pstemmed_list = []
        for file in filenames:
            with open(file, 'r') as f:
                data = f.read()
            if data:
                texts = data.split(',')
                stemmedfile = []
                for text in texts:
                    pstemmed = p.stem(text)
                    stemmedfile.append(pstemmed)
                pstemmed_list.extend(stemmedfile)
        return pstemmed_list
    if text:
        pstemmed = p.stem(text)
        return pstemmed
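A minimal usage sketch for porter(), assuming os and PorterStemmer are imported at module level; the directory path is illustrative.
# Usage sketch (hypothetical paths).
import os
from nltk.stem.porter import PorterStemmer

print(porter(text="connections"))      # e.g. 'connect'
# stems = porter(inputpath="data/")    # stems every comma-separated token in each file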
def stem_split(tokens):
    """ Takes a list of tokens and splits stemmed tokens into
    stem, ending - inserting ending as extra token.
    returns: revised (possibly longer) list of tokens. """
    stemmer = PorterStemmer()
    token_list = list()
    for token in tokens:
        stem = stemmer.stem(token)
        split_list = token.split(stem)
        if token == stem:
            token_list.append(token)
        elif len(split_list) > 1:
            token_list.append(stem)
            token_list.append(split_list[1])
        else:
            token_list.append(split_list[0])
    return token_list
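A small usage sketch showing how stem_split re-inserts stripped endings as separate tokens:
from nltk.stem.porter import PorterStemmer

print(stem_split(['run', 'running', 'cats']))
# -> ['run', 'run', 'ning', 'cat', 's']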
def stem(words, stem_dic, mode="nltk", silent=1):
    if silent == 0:
        print("stem ...")
    if mode == "nltk":
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()
    else:
        print("unknown mode", mode)
        assert 0
    for word in set(words):
        if word not in stem_dic:
            stem_dic[word] = stemmer.stem(word)
    words = [stem_dic[word] for word in words]
    return words
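Usage sketch: stem_dic acts as a cache owned by the caller, so repeated calls reuse previously computed stems.
stem_dic = {}
print(stem(["running", "runs", "running"], stem_dic, silent=0))
# -> ['run', 'run', 'run']; stem_dic now maps each surface form to its stem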
def tiny_tokenize_xml(text, stem=False, stop_words=[]):
    return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
        re.sub('[%s]' % re.escape(string.punctuation), ' ', text.encode(encoding='ascii', errors='ignore'))) if
        not token.isdigit() and token not in stop_words]
def __porter_stemmer(self):
    """Initializes PorterStemmer.

    Returns:
        None. The initialized PorterStemmer is stored on self.stemmer.
    """
    self.stemmer = PorterStemmer()
def tokenize(text):
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text))
    words = [word for word in words if word not in cachedStopWords]
    tokens = list(map(lambda token: PorterStemmer().stem(token), words))
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = list(filter(lambda token: p.match(token) and len(token) >= min_length, tokens))
    return filtered_tokens
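A hypothetical usage sketch; cachedStopWords is a module-level stop-word list in the original code, and the NLTK 'punkt' and 'stopwords' data need to be available.
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

cachedStopWords = stopwords.words('english')
print(tokenize("The striped bats were hanging on their feet."))
# stop words, short tokens and non-alphabetic tokens are dropped; the rest are stemmed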
def __init__(self, full_word):
    self.full_word = full_word
    # TODO: Lemmatization requires downloads
    # wnl = WordNetLemmatizer()
    # lemmas = [wnl.lemmatize(token) for token in tokens]
    self.stem = PorterStemmer().stem(full_word).lower()
def get_list():
    stop_words = set(stopwords.words('english'))
    filename = 'data/new_acronyms.json'
    f = open(filename, 'r')
    data = json.load(f)
    paragraph_list = []
    full_form_list = []
    for k, v in data.items():
        if k == "WDM":
            for poss in v['possibilities']:
                paragraph_list.append(poss['summary'])
                full_form_list.append(poss['full_form'])
    s = "two devices can also function as an add/drop multiplexer (ADM), i.e. simultaneously adding light beams while dropping other light beams and rerouting them to other destinations and devices. Formerly, such filtering of light beams was done with etalons, devices called Fabry–Pérot interferometers using thin-film-coated optical glass. The first WDM technology was conceptualized in the early 1970s and realized in the laboratory in the late 1970s; but these only combined two signals, and many years later were still very expensive.As of 2011, WDM systems can handle 160 signals, which will expand a 10 Gbit/second system with a single fiber optic pair of conductors to more than 1.6 Tbit/second (i.e. 1,600 Gbit/s).Typical WDM systems use single-mode optical fiber (SMF); this is optical fiber for only a single ray of light and having a core diameter of 9 millionths of a meter (9 µm). Other systems with multi-mode fiber cables (MM Fiber; also called premises cables) have core diameters of about 50 µm. Standardization and extensive research have brought down system costs significantly."
    paragraph_list.append(s)
    full_form_list.append("Wavelength context")
    texts = []
    taggeddoc = []
    p_stemmer = PorterStemmer()
    tokeniser = RegexpTokenizer(r'\w+')
    for index, para in enumerate(paragraph_list):
        raw = para.lower()
        tokens = tokeniser.tokenize(raw)
        stopped_tokens = [t for t in tokens if t not in stop_words]
        number_tokens = [x for x in stopped_tokens if x.isalpha()]  # isalpha() must be called, not just referenced
        stemmed_tokens = [p_stemmer.stem(i) for i in number_tokens]
        length_tokens = [i for i in stemmed_tokens if len(i) > 1]
        texts.append(length_tokens)
        td = TaggedDocument(' '.join(stemmed_tokens).split(), [full_form_list[index]])
        taggeddoc.append(td)
    return taggeddoc
def tokenizer_porter(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in text.split() if word not in stop]
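A hypothetical usage sketch; stop is a module-level stop-word list in the original project.
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

stop = stopwords.words('english')
print(tokenizer_porter('runners like running and thus they run'))
# stop words are dropped and the remaining words are Porter-stemmed, e.g. 'runners' -> 'runner'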
# We switch to this stemmer because it has support for Spanish
def __init__(self, ignore_stopwords=False):
    _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
    porter.PorterStemmer.__init__(self)
def tweet_stemming(tweet, token_freqs):
    """
    Stems tweet words and counts diversity

    :param tweet: the tweet to analyze
    :type tweet: str or unicode
    :param token_freqs: counter of word frequency
    :type token_freqs: Counter
    :returns: number of words added to token_freqs
    :rtype: int
    """
    pattern_url = r'((https?:\/\/)|www\.)([\da-z\.-]+)\.([\/\w \.-]*)( |$)'
    regex_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    porter = PorterStemmer()
    counter_tokens = 0
    tweet_url_removed = re.sub(pattern_url, '', tweet, flags=re.MULTILINE)  # remove URLs
    tweet_url_removed_tokenized = word_tokenize(tweet_url_removed)  # tokenize tweet
    tweet_url_removed_tokenized_cleaned_stemming = []  # cleaned of URLs and hashes, then stemmed
    for token in tweet_url_removed_tokenized:
        new_token = regex_punctuation.sub(u'', token)  # remove punctuation and hash signs
        if not new_token == u'':
            new_token_stemming = porter.stem(new_token)
            tweet_url_removed_tokenized_cleaned_stemming.append(new_token_stemming)
            token_freqs[new_token_stemming] += 1
            counter_tokens += 1
    return counter_tokens
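A minimal usage sketch for tweet_stemming, assuming the module-level imports it relies on; the example tweet is illustrative.
import re
import string
from collections import Counter
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer

freqs = Counter()
n = tweet_stemming("Loving the new #NLTK release! https://www.nltk.org", freqs)
print(n, freqs)  # number of kept tokens and per-stem frequencies, with the URL and punctuation removed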
def stem_list(word_list):
    """
    Return a list with each word stemmed.

    :param word_list: word list to be stemmed.
    :return: list
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in word_list]
snowball.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def __init__(self, ignore_stopwords=False):
    _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
    porter.PorterStemmer.__init__(self)
sse_client.py (project: Searchable-Symmetric-Encryption, author: IanVanHoudt)
def __init__(self):
    # TODO: placeholder for password. Will eventually take
    # as an arg of some sort
    self.password = b"password"

    # TODO: need to sort out use of salt. Previously, salt was
    # randomly generated in initKeys, but the resulting pass-
    # words k & kPrime were different on each execution, and
    # decryption was impossible. Hardcoding salt makes decryption
    # possible but may be a bad shortcut
    self.iv = None
    self.salt = "$2b$12$ddTuco8zWXF2.kTqtOZa9O"

    # Two keys, generated/initialized by KDF
    (self.k, self.kPrime) = self.initKeys()

    # Two K's: generated/initialized by PRF
    self.k1 = None
    self.k2 = None

    # client's cipher (AES w/ CBC)
    self.cipher = self.initCipher()

    # Stemming tool (cuts words to their roots/stems)
    self.stemmer = PorterStemmer()
def stem(tokens):
    """ Stem passed text tokens. """
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]
def __init__(self, ignore_stopwords=False):
    _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
    porter.PorterStemmer.__init__(self)
def __init__(self):
    self.ps = PorterStemmer()
def __init__(self):
    self.ps = PorterStemmer()
def getAllReviews(movieList):
    reviews = np.array(map(lambda x: x["reviews"], movieList))
    reviews = np.concatenate(reviews)
    tokenizeReview = []
    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = map(lambda x: PorterStemmer().stem(x), s)
        s = filter(lambda x: x not in stopwords.words('english'), s)
        tokenizeReview.append((s, 'pos' if review["score"] >= 30 else 'neg'))
    return tokenizeReview
def getAllCritics(movieList):
    reviews = np.array(map(lambda x: x["critics"], movieList))
    reviews = np.concatenate(reviews)
    tokenizeReview = []
    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = map(lambda x: PorterStemmer().stem(x), s)
        s = filter(lambda x: x not in stopwords.words('english'), s)
        tokenizeReview.append((s, 'pos' if review["tomatometer"] == "fresh" else 'neg'))
    return tokenizeReview
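getAllReviews and getAllCritics rely on Python 2 semantics, where map and filter return lists; under Python 3 those calls would need list(...) wrappers before np.array and np.concatenate. A hedged, Python 3-oriented sketch of the per-review step (tokenize_review is a hypothetical helper name):
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

stemmer = PorterStemmer()
english_stops = set(stopwords.words('english'))

def tokenize_review(review, threshold=30):
    # lowercase, tokenize on word characters, stem, then drop stop words (same order as above)
    tokens = RegexpTokenizer(r'\w+').tokenize(review['review'].lower())
    stems = [stemmer.stem(t) for t in tokens]
    stems = [t for t in stems if t not in english_stops]
    return (stems, 'pos' if review['score'] >= threshold else 'neg')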
def tokenize(text):
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text))
    words = [word for word in words if word not in cachedStopWords]
    tokens = list(map(lambda token: PorterStemmer().stem(token), words))
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = list(filter(lambda token: p.match(token) and
                                  len(token) >= min_length, tokens))
    return filtered_tokens
def __init__(self, ignore_stopwords=False):
    _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
    porter.PorterStemmer.__init__(self)
def tokenizer_porter(text):
    return [PorterStemmer().stem(word) for word in text.split()]
def __init__(self):
    self.stemmer = PorterStemmer()
graphssl.py (project: graph-based-semi-supervised-learning, author: deerishi)
def __init__(self):
    self.stemmer = PorterStemmer()
def __init__(self, ignore_stopwords=False):
    _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
    porter.PorterStemmer.__init__(self)
def __init__(self, ignore_stopwords=False):
    _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
    porter.PorterStemmer.__init__(self)
def __init__(self, lang="spanish"):
    """
    Initializes the parameters for a specific language
    """
    self.languages = ["spanish", "english", "italian", "german"]
    self.lang = lang
    if self.lang not in self.languages:
        raise LangDependencyError("Language not supported: " + lang)
    self.stopwords = LangDependency.STOPWORDS_CACHE.get(lang, None)
    if self.stopwords is None:
        self.stopwords = self.load_stopwords(os.path.join(PATH, "{0}.stopwords".format(lang)))
        LangDependency.STOPWORDS_CACHE[lang] = self.stopwords
    self.neg_stopwords = LangDependency.NEG_STOPWORDS_CACHE.get(lang, None)
    if self.neg_stopwords is None:
        self.neg_stopwords = self.load_stopwords(os.path.join(PATH, "{0}.neg.stopwords".format(lang)))
        LangDependency.NEG_STOPWORDS_CACHE[lang] = self.neg_stopwords
    if self.lang not in SnowballStemmer.languages:
        raise LangDependencyError("Language not supported for stemming: " + lang)
    if self.lang == "english":
        self.stemmer = PorterStemmer()
    else:
        self.stemmer = SnowballStemmer(self.lang)
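An illustrative sketch of the stemmer selection this constructor performs; make_stemmer is a hypothetical helper and covers only the stemmer choice, not the stop-word loading.
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer

def make_stemmer(lang):
    # English keeps the classic Porter stemmer; other supported languages use Snowball.
    return PorterStemmer() if lang == "english" else SnowballStemmer(lang)

print(make_stemmer("english").stem("connections"))  # e.g. 'connect'
print(make_stemmer("spanish").stem("corriendo"))    # Spanish Snowball stem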