Source code examples for the Python PorterStemmer() class

preprocessing.py (project: KATE, author: hugochan)
def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ', \
            text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and not token in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception as e:
                    w = token
            else:
                w = token
            words.append(w)

    return words

    # return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
    #                     re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
    #                     not token.isdigit() and not token in stop_words]
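
A minimal usage sketch (illustrative, not part of the KATE project; it assumes preprocessing.py has imported re, string, wordpunct_tokenize and EnglishStemmer, and that the input is a byte string because the function calls .decode):

tokens = tiny_tokenize(b"Cats are running, fast!", stem=True, stop_words=['are'])
print(tokens)  # roughly ['cat', 'run', 'fast'], depending on the stemmer version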
base.py (project: FreeDiscovery, author: FreeDiscovery)
def select_top_words(word_list, n=10):
    """ Filter out cluster term names"""
    import re
    from nltk.stem.porter import PorterStemmer
    from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
    st = PorterStemmer()
    out_st = []
    out = []
    for word in word_list:
        word_st = st.stem(word)
        if len(word_st) <= 2 or\
                re.match('\d+', word_st) or \
                re.match('[^a-zA-Z0-9]', word_st) or\
                word in COMMON_FIRST_NAMES or \
                word in CUSTOM_STOP_WORDS or\
                word in ENGLISH_STOP_WORDS or \
                word_st in out_st:  # ignore stemming duplicate
            continue
        out_st.append(word_st)
        out.append(word)
        if len(out) >= n:
            break
    return out
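
An illustrative call (assuming the words below are not in the COMMON_FIRST_NAMES or CUSTOM_STOP_WORDS constants defined elsewhere in base.py):

select_top_words(['networks', 'network', 'the', 'learning'], n=2)
# roughly ['networks', 'learning']: 'network' collapses onto an already-seen stem and 'the' is a stop word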
stemming.py (project: political-ad-classifier, author: BoudhayanBanerjee)
def porter(inputpath=None, text=None):
    """
    Apply the Porter stemmer to every comma-separated chunk of each file in
    inputpath, or to a single text string.
    """
    p = PorterStemmer()
    if inputpath:
        filenames = [os.path.join(inputpath, file) for file in os.listdir(inputpath)]
        pstemmed_list = []
        for file in filenames:
            stemmedfile = []  # reset per file so an empty file does not re-extend stale results
            with open(file, 'r') as f:
                data = f.read()
                if data:
                    texts = data.split(',')
                    for text in texts:
                        pstemmed = p.stem(text)
                        stemmedfile.append(pstemmed)
            pstemmed_list.extend(stemmedfile)
        return pstemmed_list
    if text:
        pstemmed = p.stem(text)
        return pstemmed
utils.py (project: patentdata, author: benhoyle)
def stem_split(tokens):
    """ Takes a list of tokens and splits stemmed tokens into
    stem, ending - inserting ending as extra token.

    returns: revised (possibly longer) list of tokens. """
    stemmer = PorterStemmer()
    token_list = list()
    for token in tokens:
        stem = stemmer.stem(token)
        split_list = token.split(stem)
        if token == stem:
            token_list.append(token)
        elif len(split_list) > 1:
            token_list.append(stem)
            token_list.append(split_list[1])
        else:
            token_list.append(split_list[0])
    return token_list
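
For example (a sketch; exact splits depend on the Porter stemmer output):

stem_split(['jumping', 'jump'])
# roughly ['jump', 'ing', 'jump']: the ending of 'jumping' becomes its own token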
utils.py (project: kaggle-review, author: daxiongshu)
def stem(words,stem_dic,mode="nltk",silent=1):
    if silent==0:
        print("stem ...")
    if mode == "nltk":
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()
    else:
        print("unknown mode",mode)
        assert 0
    for word in set(words):
        if word not in stem_dic:
            stem_dic[word] = stemmer.stem(word)
    words = [stem_dic[word] for word in words]
    return words
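
A hypothetical usage sketch; stem_dic is an ordinary dict used as a memoization cache that can be shared across calls:

cache = {}
print(stem(['running', 'runs', 'running'], cache))  # roughly ['run', 'run', 'run']
print(cache)                                        # maps each surface form to its stem for reuse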
preprocessing.py (project: KATE, author: hugochan)
def tiny_tokenize_xml(text, stem=False, stop_words=[]):
    return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
                        re.sub('[%s]' % re.escape(string.punctuation), ' ', text.encode(encoding='ascii', errors='ignore'))) if
                        not token.isdigit() and not token in stop_words]
stemmer.py (project: chatbot_ner, author: hellohaptik)
def __porter_stemmer(self):
        """Initializes PorterStemmer

        Returns:
            Initializes PorterStemmer
        """
        self.stemmer = PorterStemmer()
reuters_classifier.py (project: ml-projects, author: saopayne)
def tokenize(text):

    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text))
    words = [word for word in words if word not in cachedStopWords]
    tokens = (list(map(lambda token: PorterStemmer().stem(token), words)))
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = list(filter(lambda token: p.match(token) and len(token) >= min_length, tokens))
    return filtered_tokens
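
An illustrative call (assumes reuters_classifier.py has imported re, word_tokenize and PorterStemmer, and that cachedStopWords is the module-level English stop word list):

tokenize("Traders were buying and selling stocks")
# roughly ['trader', 'buy', 'sell', 'stock']: stop words and tokens shorter than 3 characters are dropped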
lang_proc.py (project: Search-Engine, author: SoufianEly)
def __init__(self, full_word):
        self.full_word = full_word
        # TODO: Lemmatization requires downloads
        # wnl = WordNetLemmatizer()
        # lemmas = [wnl.lemmatize(token) for token in tokens]
        self.stem = PorterStemmer().stem(full_word).lower()
main.py (project: AcronymExpansion, author: adityathakker)
def get_list():
    stop_words = set(stopwords.words('english'))

    filename = 'data/new_acronyms.json'
    f = open(filename, 'r')
    data = json.load(f)
    paragraph_list = []
    full_form_list = []
    for k,v in data.items():
        if k=="WDM":
            for poss in v['possibilities']:
                paragraph_list.append(poss['summary'])
                full_form_list.append(poss['full_form'])
    s="two devices can also function as an add/drop multiplexer (ADM), i.e. simultaneously adding light beams while dropping other light beams and rerouting them to other destinations and devices. Formerly, such filtering of light beams was done with etalons, devices called Fabry–Pérot interferometers using thin-film-coated optical glass. The first WDM technology was conceptualized in the early 1970s and realized in the laboratory in the late 1970s; but these only combined two signals, and many years later were still very expensive.As of 2011, WDM systems can handle 160 signals, which will expand a 10 Gbit/second system with a single fiber optic pair of conductors to more than 1.6 Tbit/second (i.e. 1,600 Gbit/s).Typical WDM systems use single-mode optical fiber (SMF); this is optical fiber for only a single ray of light and having a core diameter of 9 millionths of a meter (9 µm). Other systems with multi-mode fiber cables (MM Fiber; also called premises cables) have core diameters of about 50 µm. Standardization and extensive research have brought down system costs significantly."
    paragraph_list.append(s)
    full_form_list.append("Wavelength context")
    texts = []
    taggeddoc = []
    p_stemmer = PorterStemmer()
    tokeniser = RegexpTokenizer(r'\w+')

    for index, para in enumerate(paragraph_list):
        raw = para.lower()

        tokens = tokeniser.tokenize(raw)
        stopped_tokens = [t for t in tokens if not t in stop_words]

        number_tokens = [x for x in stopped_tokens if x.isalpha()]  # isalpha needs the call parentheses to actually filter
        stemmed_tokens = [p_stemmer.stem(i) for i in number_tokens]

        length_tokens = [i for i in stemmed_tokens if len(i) > 1]
        texts.append(length_tokens)
        td = TaggedDocument(' '.join(stemmed_tokens).split(), [full_form_list[index]])

        taggeddoc.append(td)

    return taggeddoc
similar_posts.py (project: hugo_similar_posts, author: elbaulp)
def tokenizer_porter(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in text.split() if word not in stop]

# Switch to this stemmer, which has Spanish support
snowball.py (project: Price-Comparator, author: Thejas-1)
def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)
spammer.py (project: twitter_trolls, author: merqurio)
def tweet_stemming(tweet, token_freqs):

    """
    Stems tweet words and counts diversity

    :param tweet: the tweet to analyze
    :type tweet: str or unicode

    :param token_freqs: counter of words frequency
    :type token_freqs: Counter

    :returns: words added to token_freqs
    :rtype: int
    """

    pattern_url = r'((https?:\/\/)|www\.)([\da-z\.-]+)\.([\/\w \.-]*)( |$)'
    regex_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    porter = PorterStemmer()

    counter_tokens = 0
    tweet_url_removed = re.sub(pattern_url, '', tweet, flags=re.MULTILINE)  # remove URL
    tweet_url_removed_tokenized = word_tokenize(tweet_url_removed)  # tokenize tweet
    tweet_url_removed_tokenized_cleaned_stemming = []  # cleaned of URLs and hashtags, then stemmed

    for token in tweet_url_removed_tokenized:
        new_token = regex_punctuation.sub(u'', token)  # remove punctuation and hash
        if not new_token == u'':
            new_token_stemming = porter.stem(new_token)
            tweet_url_removed_tokenized_cleaned_stemming.append(new_token_stemming)
            token_freqs[new_token_stemming] += 1
            counter_tokens += 1

    return counter_tokens
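
A sketch of how this might be driven (assumes spammer.py has imported re, string, word_tokenize and PorterStemmer; token_freqs is a collections.Counter shared across tweets):

from collections import Counter
freqs = Counter()
added = tweet_stemming("loving the new release! http://example.com", freqs)
# added is the number of stemmed tokens counted; freqs maps each stem to its frequency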
text_utilities.py (project: asx-announce-analysis, author: desiguel)
def stem_list(word_list):
    """
    Return a tokenised text list.
    :param word_list: word list to be stemmed.
    :return: list
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in word_list]
snowball.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)
sse_client.py (project: Searchable-Symmetric-Encryption, author: IanVanHoudt)
def __init__(self):

        # TODO: placeholder for password. Will eventually take
        # as an arg of some sort
        self.password = b"password"

        # TODO: need to sort out use of salt. Previously, salt was
        # randomly generated in initKeys, but the resulting pass-
        # words k & kPrime were different on each execution, and 
        # decryption was impossible. Hardcoding salt makes decryption
        # possible but may be a bad shortcut
        self.iv = None
        self.salt = "$2b$12$ddTuco8zWXF2.kTqtOZa9O"

        # Two keys, generated/Initialized by KDF
        (self.k, self.kPrime) = self.initKeys()

        # Two K's: generated/initialized by PRF
        self.k1 = None
        self.k2 = None

        # client's cipher (AES w/ CBC)
        self.cipher = self.initCipher()

        # Stemming tool (cuts words to their roots/stems)
        self.stemmer = PorterStemmer()
utils.py (project: patentdata, author: benhoyle)
def stem(tokens):
    """ Stem passed text tokens. """
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]
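
For example (illustrative, patent-flavoured input):

stem(['inventions', 'methods', 'claims'])
# roughly ['invent', 'method', 'claim']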
snowball.py (project: neighborhood_mood_aws, author: jarrellmark)
def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)
training_classifier.py (project: Trendster, author: rawanhassunah)
def __init__(self):
        self.ps = PorterStemmer()
classifier.py (project: Trendster, author: rawanhassunah)
def __init__(self):
        self.ps = PorterStemmer()
sentiment.py (project: RottenCrawler, author: kevin940726)
def getAllReviews(movieList):
    reviews = np.array(map(lambda x: x["reviews"], movieList))
    reviews = np.concatenate(reviews)

    tokenizeReview = []

    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = map(lambda x: PorterStemmer().stem(x), s)
        s = filter(lambda x: x not in stopwords.words('english'), s)
        tokenizeReview.append((s, 'pos' if review["score"] >= 30 else 'neg'))

    return tokenizeReview
sentiment.py (project: RottenCrawler, author: kevin940726)
def getAllCritics(movieList):
    reviews = np.array(map(lambda x: x["critics"], movieList))
    reviews = np.concatenate(reviews)

    tokenizeReview = []

    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = map(lambda x: PorterStemmer().stem(x), s)
        s = filter(lambda x: x not in stopwords.words('english'), s)
        tokenizeReview.append((s, 'pos' if review["tomatometer"] == "fresh" else 'neg'))

    return tokenizeReview
reuters.py (project: multilabel-classification, author: jordicolomer)
def tokenize(text):
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text))
    words = [word for word in words if word not in cachedStopWords]
    tokens = (list(map(lambda token: PorterStemmer().stem(token), words)))
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = list(filter(lambda token: p.match(token) and
                                  len(token) >= min_length, tokens))
    return filtered_tokens
snowball.py (project: hate-to-hugs, author: sdoran35)
def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)
chapter_8.py (project: python-machine-learning-book, author: jeremyn)
def tokenizer_porter(text):
    return [PorterStemmer().stem(word) for word in text.split()]
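
For instance (output may vary slightly with the NLTK version):

tokenizer_porter('runners like running and thus they run')
# roughly ['runner', 'like', 'run', 'and', 'thu', 'they', 'run']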
load.py (project: graph-based-semi-supervised-learning, author: deerishi)
def __init__(self): 
        self.stemmer = PorterStemmer()
graphssl.py (project: graph-based-semi-supervised-learning, author: deerishi)
def __init__(self): 
        self.stemmer = PorterStemmer()
snowball.py (project: FancyWord, author: EastonLee)
def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)
snowball.py (project: beepboop, author: nicolehe)
def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)
lang_dependency.py (project: b4msa, author: INGEOTEC)
def __init__(self, lang="spanish"):
        """
        Initializes the parameters for a specific language
        """
        self.languages = ["spanish", "english", "italian", "german"]
        self.lang = lang

        if self.lang not in self.languages:
            raise LangDependencyError("Language not supported: " + lang)

        self.stopwords = LangDependency.STOPWORDS_CACHE.get(lang, None)
        if self.stopwords is None:
            self.stopwords = self.load_stopwords(os.path.join(PATH, "{0}.stopwords".format(lang)))
            LangDependency.STOPWORDS_CACHE[lang] = self.stopwords

        self.neg_stopwords = LangDependency.NEG_STOPWORDS_CACHE.get(lang, None)
        if self.neg_stopwords is None:
            self.neg_stopwords = self.load_stopwords(os.path.join(PATH, "{0}.neg.stopwords".format(lang)))
            LangDependency.NEG_STOPWORDS_CACHE[lang] = self.neg_stopwords

        if self.lang not in SnowballStemmer.languages:
            raise LangDependencyError("Language not supported for stemming: " + lang)
        if self.lang == "english":
            self.stemmer = PorterStemmer()
        else:
            self.stemmer = SnowballStemmer(self.lang)

