Python tokenize() usage examples from open-source projects

corpus.py (project: minke, author: DistrictDataLabs)
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and returns a dictionary with a
        variety of metrics concerning the state of the corpus.
        """
        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()
        started = time.time()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in para:
                counts['sents'] += 1

                for word, tag in sent:
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        n_fileids = len(self._resolve(fileids, categories) or self.fileids())
        n_topics  = len(self.categories(self._resolve(fileids, categories)))

        # Return data structure with information
        return {
            'files':  n_fileids,
            'topics': n_topics,
            'paras':  counts['paras'],
            'sents':  counts['sents'],
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
            'ppdoc':  float(counts['paras']) / float(n_fileids),
            'sppar':  float(counts['sents']) / float(counts['paras']),
            'secs':   time.time() - started,
        }
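A minimal usage sketch for the method above, assuming `reader` is an instance of the minke corpus reader class that defines describe() (the variable name is hypothetical, purely for illustration):

stats = reader.describe()
print("{files} files, {words} words, lexical diversity {lexdiv:.2f}".format(**stats))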
lang_proc.py (project: Search-Engine, author: SoufianEly)
def stem_and_tokenize_text(text):
    sents = sent_tokenize(text)
    tokens = list(itertools.chain(*[TreebankWordTokenizer().tokenize(sent) for sent in sents]))
    terms = [Term(token) for token in tokens]
    return filter(lambda term: not term.is_punctuation(), terms)
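A self-contained sketch of just the tokenization chain used above, leaving out the project's Term wrapper (requires the NLTK 'punkt' data). Note that in Python 3 the filter() call in the return statement yields a lazy iterator rather than a list.

import itertools
from nltk.tokenize import sent_tokenize, TreebankWordTokenizer

text = "U.S. stocks rose. Prices hit $23.00!"
sents = sent_tokenize(text)
tokens = list(itertools.chain(*[TreebankWordTokenizer().tokenize(s) for s in sents]))
print(tokens)  # one flat list of word tokens across both sentences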
utils.py (project: arowf, author: priyankamandikal)
def get_words(text=''):
    words = []
    words = TOKENIZER.tokenize(text)
    filtered_words = []
    for word in words:
        if word in SPECIAL_CHARS or word == " ":
            pass
        else:
            new_word = word.replace(",","").replace(".","")
            new_word = new_word.replace("!","").replace("?","")
            filtered_words.append(new_word)
    return filtered_words
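The excerpt relies on module-level TOKENIZER and SPECIAL_CHARS objects that are not shown; one plausible setup is sketched below (the concrete choices are assumptions, not the project's actual definitions):

from nltk.tokenize import WhitespaceTokenizer

TOKENIZER = WhitespaceTokenizer()      # hypothetical: any NLTK tokenizer would do
SPECIAL_CHARS = ['.', ',', '!', '?']   # hypothetical list of tokens to drop

print(get_words("Hello, world! How are you today?"))
# ['Hello', 'world', 'How', 'are', 'you', 'today']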
utils.py (project: arowf, author: priyankamandikal)
def get_sentences(text=''):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenizer.tokenize(text)
    return sentences
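Example call; loading the pickled Punkt model requires the 'punkt' resource (nltk.download('punkt')):

print(get_sentences("Hello world. How are you? Fine, thanks."))
# ['Hello world.', 'How are you?', 'Fine, thanks.']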
rte_classify.py (project: Price-Comparator, author: Thejas-1)
def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                              'have', 'are', 'were', 'and', 'very', '.', ','])

        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                             'denied'])
        # Try to tokenize so that abbreviations like "U.S." and monetary
        # amounts like "$23.00" are kept as single tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

        #Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            # NOTE: the boolean parameter `lemmatize` shadows the module-level
            # lemmatize() helper here, so this branch fails as written; renaming
            # the parameter (e.g. use_lemmatize) avoids the clash.
            self.text_words = set(lemmatize(token) for token in self.text_tokens)
            self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words
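A hedged usage sketch: the enclosing class in NLTK is RTEFeatureExtractor, and the RTE pairs come from the NLTK RTE corpus (nltk.download('rte')):

from nltk.corpus import rte

pair = rte.pairs(['rte1_dev.xml'])[0]
extractor = RTEFeatureExtractor(pair)
print(extractor._overlap)    # word types shared by text and hypothesis
print(extractor._hyp_extra)  # word types that appear only in the hypothesis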
chunked.py (project: Price-Comparator, author: Thejas-1)
def read_block(self, stream):
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                sent = self._str2chunktree(sent_str, source_tagset=self._source_tagset,
                                           target_tagset=self._target_tagset)

                # If requested, throw away the tags.
                if not self._tagged:
                    sent = self._untag(sent)

                # If requested, throw away the chunks.
                if not self._chunked:
                    sent = sent.leaves()

                # Add the sentence to `para`.
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)

            # Add the paragraph to `block`.
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        # Return the block
        return block
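For reference, the reader's _str2chunktree typically resolves to nltk.chunk.tagstr2tree, which parses a bracketed, tagged sentence string into a chunk tree; a small hedged illustration:

from nltk.chunk import tagstr2tree

sent = tagstr2tree("[ The/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ]")
print(sent)           # Tree with NP chunks preserved
print(sent.leaves())  # the flat (word, tag) list used when self._chunked is False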
util.py (project: Price-Comparator, author: Thejas-1)
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered as neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    x = list(range(len(tokenized_sent))) # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1) # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1) # negative
        else:
            y.append(0) # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    elif pos_words == neg_words:
        print('Neutral')

    if plot:
        _show_plot(x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive'])
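Example call; requires the opinion lexicon data (nltk.download('opinion_lexicon')):

demo_liu_hu_lexicon("This movie was simply great and enjoyable", plot=False)
# prints: Positive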
quiz.py (project: polyglot-quiz, author: erkanay)
def tokenize(self, _text):
        if self.regexp:
            return RegexpTokenizer(self.regexp).tokenize(_text)
        return word_tokenize(_text, language=self.language)
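The fallback branch uses NLTK's word_tokenize with an explicit language argument; for example, with the pre-trained German Punkt model (requires the 'punkt' data):

from nltk import word_tokenize

print(word_tokenize("Das ist ein Beispiel.", language='german'))
# ['Das', 'ist', 'ein', 'Beispiel', '.']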
quiz.py (project: polyglot-quiz, author: erkanay)
def get_words(self, _text):
        return self.tokenize(_text)
tweet_analyzer.py (project: tweet_analyzer, author: atandy)
def tweet_tokenize(self, tweet):
        #http://www.nltk.org/api/nltk.tokenize.html
        tknzr = TweetTokenizer()
        tokens = tknzr.tokenize(tweet)
        return tokens
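Example: the default TweetTokenizer keeps Twitter handles, hashtags and emoticons as single tokens:

from nltk.tokenize import TweetTokenizer

print(TweetTokenizer().tokenize("@nltk_org this is #awesome :-)"))
# ['@nltk_org', 'this', 'is', '#awesome', ':-)']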
data.py (project: dong_iccv_2017, author: woozzu)
def split_sentence_into_words(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(sentence.lower())
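Example call, assuming RegexpTokenizer is imported from nltk.tokenize at module level as in the original file; the \w+ pattern drops punctuation and splits on it:

print(split_sentence_into_words("Hello, World! It's 2017."))
# ['hello', 'world', 'it', 's', '2017']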
test_Doc2Vec.py (project: itunes, author: kaminem64)
def tokenize(text):
  # lowers = text.lower()
  # no_punctuation = lowers.translate(None, string.punctuation)
  time0 = time.time()
  # tokens = [word[0] for word in TextBlob(unicode(TextBlob(text).correct())).tags if word[1] in ['NN', 'NNS', 'NNP', 'JJ', 'VB'] ]
  # stems = stem_tokens(tokens, stemmer)

  stems = re.findall('[a-z]+', text)
  # stems = [word[0] for word in nltk.pos_tag(tokens) if word[1] in ['NN', 'NNS', 'NNP', 'JJ', 'VB'] ]

  print('%s seconds' % (time.time()-time0))
  print(stems)
  return stems
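Example call; note that the regex only matches lowercase letter runs, so callers are expected to lower-case the text first (the commented-out lowers line hints at that):

print(tokenize("great app but crashes twice a day"))
# prints the elapsed seconds, then ['great', 'app', 'but', 'crashes', 'twice', 'a', 'day']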
lda.py (project: itunes, author: kaminem64)
def tokenize(text):
  # lowers = text.lower()
  # no_punctuation = lowers.translate(None, string.punctuation)
  time0 = time.time()
  # tokens = [word[0] for word in TextBlob(unicode(TextBlob(text).correct())).tags if word[1] in ['NN', 'NNS', 'NNP', 'JJ', 'VB'] ]
  # stems = stem_tokens(tokens, stemmer)

  stems = re.findall('[a-z]+', text)
  # stems = [word[0] for word in nltk.pos_tag(tokens) if word[1] in ['NN', 'NNS', 'NNP', 'JJ', 'VB'] ]

  print('%s seconds' % (time.time()-time0))
  print(stems)
  return stems
spammer.py (project: twitter_trolls, author: merqurio)
def tweet_stemming(tweet, token_freqs):

    """
    Stems tweets words and counts diversty

    :param tweet: the tweet to analyze
    :type tweet: str or unicode

    :param token_freqs: counter of words frequency
    :type token_freqs: Counter

    :returns: words added to token_freqs
    :rtype: int
    """

    pattern_url = r'((https?:\/\/)|www\.)([\da-z\.-]+)\.([\/\w \.-]*)( |$)'
    regex_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    porter = PorterStemmer()

    counter_tokens = 0
    tweet_url_removed = re.sub(pattern_url, '', tweet, flags=re.MULTILINE)  # remove URL
    tweet_url_removed_tokenized = word_tokenize(tweet_url_removed)  # tokenize tweet
    tweet_url_removed_tokenized_cleaned_stemming = []  # cleaned of URLs and hashs, and stemming

    for token in tweet_url_removed_tokenized:
        new_token = regex_punctuation.sub(u'', token)  # remove punctuation and hash
        if not new_token == u'':
            new_token_stemming = porter.stem(new_token)
            tweet_url_removed_tokenized_cleaned_stemming.append(new_token_stemming)
            token_freqs[new_token_stemming] += 1
            counter_tokens += 1

    return counter_tokens
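A hedged usage sketch; it assumes re, string, word_tokenize and PorterStemmer are imported at module level as in the original file, and requires the 'punkt' data:

from collections import Counter

freqs = Counter()
added = tweet_stemming("Loving the new release! https://t.co/abc123 #python", freqs)
print(added, freqs.most_common(3))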
model_fitting.py (project: Twitter_Geolocation, author: shawn-terryah)
def tokenize(tweet):
    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)
    return tknzr.tokenize(tweet)

# Read cleaned training tweets file into pandas and randomize it
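Example: with strip_handles, reduce_len and preserve_case=False, mentions are removed, elongated words are shortened to three repeated characters, and tokens are lower-cased (assumes TweetTokenizer is imported from nltk.tokenize at module level):

print(tokenize("@some_user This is waaaaayyyy too cool #NLP"))
# roughly: ['this', 'is', 'waaayyy', 'too', 'cool', '#nlp']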
readability_utils.py (project: django-icekit, author: ic-labs)
def get_words(text=''):
    words = []
    words = TOKENIZER.tokenize(text)
    filtered_words = []
    for word in words:
        if word in SPECIAL_CHARS or word == " ":
            pass
        else:
            new_word = word.replace(",","").replace(".","")
            new_word = new_word.replace("!","").replace("?","")
            filtered_words.append(new_word)
    return filtered_words
rte_classify.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                              'have', 'are', 'were', 'and', 'very', '.', ','])

        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                             'denied'])
        # Try to tokenize so that abbreviations like "U.S." and monetary
        # amounts like "$23.00" are kept as single tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

        #Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            # NOTE: the boolean parameter `lemmatize` shadows the module-level
            # lemmatize() helper here, so this branch fails as written; renaming
            # the parameter (e.g. use_lemmatize) avoids the clash.
            self.text_words = set(lemmatize(token) for token in self.text_tokens)
            self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words
chunked.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def read_block(self, stream):
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                sent = self._str2chunktree(sent_str, source_tagset=self._source_tagset,
                                           target_tagset=self._target_tagset)

                # If requested, throw away the tags.
                if not self._tagged:
                    sent = self._untag(sent)

                # If requested, throw away the chunks.
                if not self._chunked:
                    sent = sent.leaves()

                # Add the sentence to `para`.
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)

            # Add the paragraph to `block`.
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        # Return the block
        return block
util.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered as neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    x = list(range(len(tokenized_sent))) # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1) # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1) # negative
        else:
            y.append(0) # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    elif pos_words == neg_words:
        print('Neutral')

    if plot:
        _show_plot(x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive'])

