Python TweetTokenizer() class: example source code
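For reference, a minimal sketch of basic TweetTokenizer usage before the project examples; the constructor options and the sample sentence follow NLTK's documented behavior:

from nltk.tokenize import TweetTokenizer

tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
print(tknzr.tokenize("@remy: This is waaaaayyyy too much for you!!!!!!"))
# expected output (per NLTK's docstring example):
# [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']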

utils.py (project: LinguisticAnalysis, author: DucAnhPhi)
def remove_emoticons(text):
    # build regexp with imported emoticon list
    smileys = '|'.join(map(re.escape, emoticons))
    emoticonsPattern = re.compile('({})'.format(smileys), flags=re.IGNORECASE)
    removed = re.sub(emoticonsPattern, '', text)
    # remove unnecessary white spaces utilizing the TweetTokenizer
    removed = tokenize(removed)
    return " ".join(sum(removed, []))
test_tokenize.py (project: kind2anki, author: prz3m)
def test_tweet_tokenizer(self):
        """
        Test TweetTokenizer using words with special and accented characters.
        """

        tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
        s9 = "@myke: Let's test these words: resumé España München français"
        tokens = tokenizer.tokenize(s9)
        expected = [':', "Let's", 'test', 'these', 'words', ':', 'resumé',
                    'España', 'München', 'français']
        self.assertEqual(tokens, expected)
test_tokenize.py (project: but_sentiment, author: MixedEmotions)
def test_tweet_tokenizer(self):
        """
        Test TweetTokenizer using words with special and accented characters.
        """

        tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
        s9 = "@myke: Let's test these words: resumé España München français"
        tokens = tokenizer.tokenize(s9)
        expected = [':', "Let's", 'test', 'these', 'words', ':', 'resumé',
                    'España', 'München', 'français']
        self.assertEqual(tokens, expected)
twitter_extractor.py (project: tidyextractors, author: networks-lab)
def _get_user_tweets(self, screen_name):

        # TODO: Implement tweet limit

        # Twitter only allows access to a user's most recent ~3,200 tweets with this method

        # initialize a list to hold all the tweepy Tweets
        alltweets = []

        # make initial request for most recent tweets (200 is the maximum allowed count)
        new_tweets = self._api.user_timeline(screen_name = screen_name,count=200)

        # save most recent tweets
        alltweets.extend(new_tweets)

        # save the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

        # keep grabbing tweets until there are no tweets left to grab
        while len(new_tweets) > 0:

            # all subsequent requests use the max_id param to prevent duplicates
            new_tweets = self._api.user_timeline(screen_name = screen_name,count=200,max_id=oldest)

            # save most recent tweets
            alltweets.extend(new_tweets)

            # update the id of the oldest tweet less one
            oldest = alltweets[-1].id - 1

        # transform the tweepy tweets into a dict keyed by tweet id, holding creation time and text
        outtweets = {tweet.id_str: {'created':tweet.created_at,'text':tweet.text} for tweet in alltweets}

        # Twitter-aware tokenizer
        tknzr = TweetTokenizer()

        # Extend data with linguistic processing
        for tweet_id in outtweets:

            # Get tweet data from dictionary
            tweet = outtweets[tweet_id]

            # Tokenize the tweet text (TweetTokenizer() preserves case by default)
            tweet_tokens = tknzr.tokenize(tweet['text'])

            # Parts-of-speech tags for tokenized text
            tweet_pos = nltk.pos_tag(tweet_tokens)

            # Is the tweet a retweet?
            tweet['retweet'] = tweet_pos[0][0] == 'RT'

            # If retweeted, who was the original author?

            if tweet['retweet'] is True:
                tweet['rt_author'] = tweet_pos[1][0]
            else:
                tweet['rt_author'] = ''

        return outtweets

# TODO: Might have encoding issues. See: https://stackoverflow.com/questions/6539881/python-converting-from-iso-8859-1-latin1-to-utf-8
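The method above relies on self._api being an authenticated tweepy client. A minimal sketch of how that client is typically constructed (the credential strings are placeholders, not taken from the project):

import tweepy

auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")   # placeholder credentials
auth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
api = tweepy.API(auth, wait_on_rate_limit=True)                 # the extractor stores this as self._api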
text_preprocessing.py (project: itunes, author: kaminem64)
def k_tokenizer(text):
    # crude non-English filtering: strip non-ASCII characters and hyphens
    # (we should use a better way to remove non-English words; note that this
    #  str.encode(...).replace(...) chain assumes Python 2 strings)
    text = text.encode('ascii', errors='ignore').replace('-', '')

    tokenizer = TweetTokenizer(preserve_case=False)
    tokens = tokenizer.tokenize(text)

    # stopset = set(stopwords.words('english'))
    # tokens = [word for word in tokens if not word in stopset]

    """ Synonyms using wordnet """

    mwe_tokenizer = MWETokenizer([('ios', '9'),])
    mwe_tokens = mwe_tokenizer.tokenize(tokens)

    """ We might want to tokenize by sentence and then tag each sentence and aggregate the results """

    """ train -> train_NN train_V"""
    tagged = nltk.pos_tag(mwe_tokens)

    def get_wordnet_pos(treebank_tag):

        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN  # fall back to noun for unmapped tags; lemmatization then usually leaves the word unchanged

    wordnet_lemmatizer = WordNetLemmatizer()
    final_doc=[]
    for token, tag in tagged:
        word = tag + '_' + wordnet_lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        final_doc.append(word)

    # porter = PorterStemmer()
    # final_doc=[]
    # for token in mwe_tokens:
    #     final_doc.append(porter.stem(token))

    return final_doc
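One way this tokenizer might be plugged into a downstream pipeline, sketched here with scikit-learn's CountVectorizer; the vectorizer usage is an assumption for illustration, not taken from the project:

from sklearn.feature_extraction.text import CountVectorizer

# bag-of-words features built on top of k_tokenizer; lowercasing is already done
# inside k_tokenizer via TweetTokenizer(preserve_case=False)
vectorizer = CountVectorizer(tokenizer=k_tokenizer, lowercase=False)
X = vectorizer.fit_transform(["The app keeps crashing on ios 9", "Great update, love it"])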
data_handler.py (project: SarcasmDetection, author: AniSkywalker)
def parsedata(lines, word_list, split_word_list, emoji_dict, normalize_text=False, split_hashtag=False,
              ignore_profiles=False,
              lowercase=False, replace_emoji=True):
    data = []
    for i, line in enumerate(lines):
        if (i % 100 == 0):
            print(str(i) + '...', end='', flush=True)

        try:
            # convert the line to lowercase
            if (lowercase):
                line = line.lower()

            # split the line into tab-separated fields
            token = line.split('\t')

            # label
            label = int(token[1].strip())

            # tweet text
            target_text = TweetTokenizer().tokenize(token[2].strip())

            # filter text
            target_text = filter_text(target_text, word_list, split_word_list, emoji_dict, normalize_text,
                                      split_hashtag,
                                      ignore_profiles, replace_emoji=replace_emoji)

            # awc dimensions
            dimensions = []
            if (len(token) > 3 and token[3].strip() != 'NA'):
                dimensions = [dimension.split('@@')[1] for dimension in token[3].strip().split('|')]

            # context tweet
            context = []
            if (len(token) > 4):
                if (token[4] != 'NA'):
                    context = TweetTokenizer().tokenize(token[4].strip())
                    # filter the context with the same settings as the target text
                    context = filter_text(context, word_list, split_word_list, emoji_dict, normalize_text,
                                          split_hashtag, ignore_profiles, replace_emoji=replace_emoji)

            # author
            author = 'NA'
            if (len(token) > 5):
                author = token[5]

            if (len(target_text) != 0):
                # print((label, target_text, dimensions, context, author))
                data.append((label, target_text, dimensions, context, author))
        except:
            raise
    print('')
    return data
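Judging from the indexing above, each input line is tab-separated: an id, the integer label, the tweet text, then optional @@-tagged dimensions, an optional context tweet, and the author. A hypothetical example line (field values invented for illustration):

# id \t label \t tweet text \t dimensions \t context \t author
line = "123\t1\tOh great, another Monday #sarcasm\tNA\tNA\tsome_user"
# data = parsedata([line], word_list, split_word_list, emoji_dict, lowercase=True)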
data_helpers.py (project: DeepLearning-On-Tweets, author: ydj0604)
def load_data_and_labels_semeval():
    # load the entire semeval dataset
    old_dataset = list(open("./input/2013-dev"))
    old_dataset.extend(list(open("./input/2013-devtest")))
    old_dataset.extend(list(open("./input/2013-train")))
    old_dataset.extend(list(open("./input/2014-devtest")))

    new_dataset = list(open("./input/2016-train"))
    new_dataset.extend(list(open("./input/2016-dev")))
    new_dataset.extend(list(open("./input/2016-devtest")))

    # filter out invalid tweets from new dataset
    new_dataset = [entry for entry in new_dataset if entry.split('\t')[2] != 'Not Available\n']

    # generate x from old
    tk = TweetTokenizer(reduce_len=True)  # Twitter-aware tokenizer; reduce_len collapses runs of repeated characters (e.g. 'soooo' -> 'sooo')
    x_text = [entry.split('\t')[3] for entry in old_dataset]
    x_text = [clean_str(tweet) for tweet in x_text]
    x_text = [tk.tokenize(tweet) for tweet in x_text]

    # generate x from new
    x_text_new = [entry.split('\t')[2] for entry in new_dataset]
    x_text_new = [clean_str(tweet) for tweet in x_text_new]
    x_text_new = [tk.tokenize(tweet) for tweet in x_text_new]

    # concat x and x_new
    x_text.extend(x_text_new)

    # generate y from old
    y = [entry.split('\t')[2] for entry in old_dataset]
    for idx, label in enumerate(y):
        if label == 'positive':
            y[idx] = [1, 0, 0]
        elif label == 'neutral':
            y[idx] = [0, 1, 0]
        elif label == 'negative':
            y[idx] = [0, 0, 1]
        else:
            print 'wrong label in semeval: ' + label

    # generate y from new
    y_new = [entry.split('\t')[1] for entry in new_dataset]
    for idx, label in enumerate(y_new):
        if label == 'positive':
            y_new[idx] = [1, 0, 0]
        elif label == 'neutral':
            y_new[idx] = [0, 1, 0]
        elif label == 'negative':
            y_new[idx] = [0, 0, 1]
        else:
            print 'wrong label in semeval: ' + label

    # concat y and y_new
    y.extend(y_new)

    return [x_text, y]
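For reference, the column layout this loader assumes (inferred from the split('\t') indices): the 2013/2014 files keep the label in column 2 and the text in column 3, while the 2016 files keep the label in column 1 and the text in column 2. Hypothetical example lines:

old_line = "id1\tid2\tpositive\tI love this phone!"   # 2013/2014 format: label at index 2, text at index 3
new_line = "id1\tnegative\tWorst update ever."        # 2016 format: label at index 1, text at index 2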
parse_tweets.py (project: deep-hashtagprediction, author: jderiu)
def store_file(f_in, f_out, alphabet_words,alphabet_hashtags, dummy_word_idx, hashtag_fname=None):
    #stores the tweets in batches so it fits in memory
    tknzr = TweetTokenizer(reduce_len=True)
    counter = 0
    batch_counter = 0
    output = open(f_out,'wb')
    output_hashtag = open(hashtag_fname, 'wb')
    batch_size = 500000
    tweet_batch = []
    hashtag_batch=[]
    with gzip.open(f_in,'r') as f:
        for tweet in f:
            tweet,hashtags = preprocess_tweet(tweet)
            if len(hashtags) == 1:
                ht = hashtags[0]
                alphabet_hashtags.add(ht)
                ht_idx = alphabet_hashtags.get(ht,UNKNOWN_HASHTAG_IDX)

                tweet = tweet.replace(ht,'')
                tweet_tok = tknzr.tokenize(tweet.decode('utf-8'))
                tweet_batch.append(tweet_tok)
                hashtag_batch.append(ht_idx)

                batch_counter += 1

                for token in tweet_tok:
                    alphabet_words.add(token)

                if batch_counter >= batch_size:
                    tweet_idx = convert2indices(tweet_batch, alphabet_words, dummy_word_idx)
                    np.save(output,tweet_idx)
                    np.save(output_hashtag,hashtag_batch)
                    print 'Saved tweets:',tweet_idx.shape
                    tweet_batch = []
                    hashtag_batch=[]
                    batch_counter = 0
                counter += 1
                if (counter%1000000) == 0:
                    print "Elements processed:",counter

    tweet_idx = convert2indices(tweet_batch, alphabet_words, dummy_word_idx)
    np.save(output,tweet_idx)
    np.save(output_hashtag,hashtag_batch)
    print len(alphabet_hashtags)
    print len(alphabet_words)
    print 'Saved tweets:',tweet_idx.shape
    return counter
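Because np.save is called repeatedly on the same open file handle, the batch arrays sit back to back in the output file; a hypothetical reader pulls them out with repeated np.load calls on one handle:

import numpy as np

def load_batches(path):
    # read back the arrays written by the repeated np.save calls above
    batches = []
    with open(path, 'rb') as f:
        while True:
            try:
                batches.append(np.load(f))
            except (EOFError, ValueError, IOError):   # no more arrays left in the file
                break
    return batches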
paper.py (project: sharead, author: strin)
def inverse_indexing_once():
    kv_paperwords = lambda filehash: KeyValueStore('paperwords:' + filehash)
    scopes = KeyValueStore.scopes('paper:*')
    from nltk.tokenize import TweetTokenizer
    tokenizer = TweetTokenizer()
    def make_dict(text, weight=1., prefix_weight=0.):
        if not text:
            return {}
        words = tokenizer.tokenize(text.lower().strip())
        result = {}
        for word in words:
            # index every proper prefix of the word with a small weight, to support prefix search
            for i in range(1, len(word)):
                prefix = word[:i]
                if prefix not in result:
                    result[prefix] = 0.
                result[prefix] += prefix_weight
            if word not in result:
                result[word] = 0.
            result[word] += weight
        return result

    def merge_dict(dict1, dict2):
        new_dict = {}
        for word in set(dict1.keys()).union(dict2.keys()):
            weight1 = dict1.get(word, 0.)
            weight2 = dict2.get(word, 0.)
            new_dict[word] = weight1 + weight2
        return new_dict

    for scope in scopes:
        filehash = scope[len('paper:'):]
        meta = KeyValueStore(scope_name=scope)
        title = meta['title']
        abstract = meta.get('abstract', default='')

        dict_title = make_dict(title, weight=6., prefix_weight=0.06)
        dict_abstract = make_dict(abstract, weight=2., prefix_weight=0.02)
        final_dict = merge_dict(dict_title, dict_abstract)

        authors = meta['authors']
        if authors:
            for author in authors:
                dict_author = make_dict(author['first_name'] + ' ' + author['last_name'])
                final_dict = merge_dict(dict_author, final_dict)

        kv_paperwords(filehash).update(final_dict)

