def remove_emoticons(text):
    # build regexp with imported emoticon list
    smileys = '|'.join(map(re.escape, emoticons))
    emoticonsPattern = re.compile('({})'.format(smileys), flags=re.IGNORECASE)
    removed = re.sub(emoticonsPattern, '', text)
    # remove unnecessary white spaces using the TweetTokenizer
    removed = tokenize(removed)
    return " ".join(sum(removed, []))
Example source code for Python's TweetTokenizer() class
def test_tweet_tokenizer(self):
    """
    Test TweetTokenizer using words with special and accented characters.
    """
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    s9 = "@myke: Let's test these words: resumé España München français"
    tokens = tokenizer.tokenize(s9)
    expected = [':', "Let's", 'test', 'these', 'words', ':', 'resumé',
                'España', 'München', 'français']
    self.assertEqual(tokens, expected)
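As a quick reference, this is what the two constructor flags do on their own: strip_handles removes @-mentions and reduce_len caps repeated characters at three (the input/output pair is the standard example from the NLTK documentation).

from nltk.tokenize import TweetTokenizer

tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
print(tknzr.tokenize('@remy: This is waaaaayyyy too much for you!!!!!!'))
# [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']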
def _get_user_tweets(self, screen_name):
    # TODO: Implement tweet limit
    # Twitter only allows access to a user's most recent 3240 tweets with this method
    # initialize a list to hold all the tweepy Tweets
    alltweets = []
    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = self._api.user_timeline(screen_name=screen_name, count=200)
    # save most recent tweets
    alltweets.extend(new_tweets)
    # save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1
    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = self._api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)
        # save most recent tweets
        alltweets.extend(new_tweets)
        # update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1
    # transform the tweepy tweets into a dict keyed by tweet id that will populate the csv
    outtweets = {tweet.id_str: {'created': tweet.created_at, 'text': tweet.text} for tweet in alltweets}
    # Twitter-aware tokenizer
    tknzr = TweetTokenizer()
    # Extend data with linguistic processing
    for tweet_id in outtweets:
        # Get tweet data from dictionary
        tweet = outtweets[tweet_id]
        # Tokenize the tweet text
        tweet_tokens = tknzr.tokenize(tweet['text'])
        # Parts-of-speech tags for the tokenized text
        tweet_pos = nltk.pos_tag(tweet_tokens)
        # Is the tweet a retweet?
        tweet['retweet'] = tweet_pos[0][0] == 'RT'
        # If retweeted, who was the original author?
        if tweet['retweet'] is True:
            tweet['rt_author'] = tweet_pos[1][0]
        else:
            tweet['rt_author'] = ''
    return outtweets
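The retweet logic above only inspects the first two tokens; a standalone sketch of that step (nltk.pos_tag assumes NLTK's default perceptron tagger data has been downloaded, and the handle is made up):

import nltk
from nltk.tokenize import TweetTokenizer

tknzr = TweetTokenizer()
tokens = tknzr.tokenize("RT @some_user: TweetTokenizer keeps @mentions and #hashtags intact")
tags = nltk.pos_tag(tokens)            # needs the averaged_perceptron_tagger data
is_retweet = tags[0][0] == 'RT'        # retweets start with the literal token 'RT'
rt_author = tags[1][0] if is_retweet else ''
print(is_retweet, rt_author)           # True @some_user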
import nltk
from nltk.tokenize import TweetTokenizer, MWETokenizer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# TODO: Might have encoding issues. See: https://stackoverflow.com/questions/6539881/python-converting-from-iso-8859-1-latin1-to-utf-8
def k_tokenizer(text):
    # We should use a better way to remove non-English words; for now drop anything non-ASCII.
    text = text.encode('ascii', errors='ignore').decode('ascii').replace('-', '')
    tokenizer = TweetTokenizer(preserve_case=False)
    tokens = tokenizer.tokenize(text)
    # stopset = set(stopwords.words('english'))
    # tokens = [word for word in tokens if not word in stopset]

    # Synonyms using wordnet
    # Merge known multi-word expressions, e.g. ('ios', '9') -> 'ios_9'
    mwe_tokenizer = MWETokenizer([('ios', '9'), ])
    mwe_tokens = mwe_tokenizer.tokenize(tokens)

    # We might want to tokenize by sentence, then tag each sentence and aggregate the results
    # train -> train_NN train_V
    tagged = nltk.pos_tag(mwe_tokens)

    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN  # we preserve the original form of any unknown word

    wordnet_lemmatizer = WordNetLemmatizer()
    final_doc = []
    for token, tag in tagged:
        word = tag + '_' + wordnet_lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        final_doc.append(word)

    # porter = PorterStemmer()
    # final_doc = []
    # for token in mwe_tokens:
    #     final_doc.append(porter.stem(token))
    return final_doc
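The two less familiar pieces in k_tokenizer are the multi-word-expression merge and the Treebank-to-WordNet tag mapping that feeds the lemmatizer; a small isolated illustration, assuming the WordNet corpus and the POS tagger data are installed:

from nltk.tokenize import TweetTokenizer, MWETokenizer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

tokens = TweetTokenizer(preserve_case=False).tokenize("Upgrading to iOS 9 keeps breaking things")
print(tokens)                                          # ['upgrading', 'to', 'ios', '9', 'keeps', 'breaking', 'things']
print(MWETokenizer([('ios', '9')]).tokenize(tokens))   # ['upgrading', 'to', 'ios_9', 'keeps', 'breaking', 'things']

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('breaking', wordnet.VERB))  # break
print(lemmatizer.lemmatize('things', wordnet.NOUN))    # thing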
def parsedata(lines, word_list, split_word_list, emoji_dict, normalize_text=False, split_hashtag=False,
              ignore_profiles=False,
              lowercase=False, replace_emoji=True):
    data = []
    for i, line in enumerate(lines):
        if i % 100 == 0:
            print(str(i) + '...', end='', flush=True)
        try:
            # convert the line to lowercase
            if lowercase:
                line = line.lower()
            # split the line into tab-separated fields
            token = line.split('\t')
            # label
            label = int(token[1].strip())
            # tweet text
            target_text = TweetTokenizer().tokenize(token[2].strip())
            # filter text
            target_text = filter_text(target_text, word_list, split_word_list, emoji_dict, normalize_text,
                                      split_hashtag,
                                      ignore_profiles, replace_emoji=replace_emoji)
            # awc dimensions
            dimensions = []
            if len(token) > 3 and token[3].strip() != 'NA':
                dimensions = [dimension.split('@@')[1] for dimension in token[3].strip().split('|')]
            # context tweet
            context = []
            if len(token) > 4:
                if token[4] != 'NA':
                    context = TweetTokenizer().tokenize(token[4].strip())
                    # filter the context with the same arguments as the target text
                    context = filter_text(context, word_list, split_word_list, emoji_dict, normalize_text,
                                          split_hashtag, ignore_profiles)
            # author
            author = 'NA'
            if len(token) > 5:
                author = token[5]
            if len(target_text) != 0:
                # print((label, target_text, dimensions, context, author))
                data.append((label, target_text, dimensions, context, author))
        except:
            raise
    print('')
    return data
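parsedata reads its fields purely by tab position, so (inferring the layout from the indexing above) an input line looks roughly like the hypothetical sample below; filter_text and the real column semantics live elsewhere in that project.

# token[0]: id, token[1]: label, token[2]: tweet, token[3]: dimensions, token[4]: context, token[5]: author
sample_line = '\t'.join([
    '123456',                        # tweet id (not used by parsedata)
    '1',                             # integer label
    'Loving this #sarcasm thing',    # tweet text, run through TweetTokenizer
    'sentiment@@positive',           # '|'-separated dimensions; the part after '@@' is kept
    'NA',                            # optional context tweet
    'some_user',                     # optional author
])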
def load_data_and_labels_semeval():
    # load the entire semeval dataset
    old_dataset = list(open("./input/2013-dev"))
    old_dataset.extend(list(open("./input/2013-devtest")))
    old_dataset.extend(list(open("./input/2013-train")))
    old_dataset.extend(list(open("./input/2014-devtest")))

    new_dataset = list(open("./input/2016-train"))
    new_dataset.extend(list(open("./input/2016-dev")))
    new_dataset.extend(list(open("./input/2016-devtest")))

    # filter out invalid tweets from new dataset
    new_dataset = [entry for entry in new_dataset if entry.split('\t')[2] != 'Not Available\n']

    # generate x from old
    tk = TweetTokenizer(reduce_len=True)  # Twitter-aware tokenizer; also collapses elongated words
    x_text = [entry.split('\t')[3] for entry in old_dataset]
    x_text = [clean_str(tweet) for tweet in x_text]
    x_text = [tk.tokenize(tweet) for tweet in x_text]

    # generate x from new
    x_text_new = [entry.split('\t')[2] for entry in new_dataset]
    x_text_new = [clean_str(tweet) for tweet in x_text_new]
    x_text_new = [tk.tokenize(tweet) for tweet in x_text_new]

    # concat x and x_new
    x_text.extend(x_text_new)

    # generate y from old
    y = [entry.split('\t')[2] for entry in old_dataset]
    for idx, label in enumerate(y):
        if label == 'positive':
            y[idx] = [1, 0, 0]
        elif label == 'neutral':
            y[idx] = [0, 1, 0]
        elif label == 'negative':
            y[idx] = [0, 0, 1]
        else:
            print('wrong label in semeval: ' + label)

    # generate y from new
    y_new = [entry.split('\t')[1] for entry in new_dataset]
    for idx, label in enumerate(y_new):
        if label == 'positive':
            y_new[idx] = [1, 0, 0]
        elif label == 'neutral':
            y_new[idx] = [0, 1, 0]
        elif label == 'negative':
            y_new[idx] = [0, 0, 1]
        else:
            print('wrong label in semeval: ' + label)

    # concat y and y_new
    y.extend(y_new)
    return [x_text, y]
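The two if/elif chains are just a label-to-one-hot mapping; an equivalent sketch with a dict lookup (not the repository's code, only a tidier way to express the same table):

LABEL_TO_ONE_HOT = {
    'positive': [1, 0, 0],
    'neutral':  [0, 1, 0],
    'negative': [0, 0, 1],
}

def to_one_hot(label):
    try:
        return LABEL_TO_ONE_HOT[label]
    except KeyError:
        print('wrong label in semeval: ' + label)
        return label   # mirror the original behaviour of leaving unknown labels untouched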
def store_file(f_in, f_out, alphabet_words, alphabet_hashtags, dummy_word_idx, hashtag_fname=None):
    # stores the tweets in batches so it fits in memory
    tknzr = TweetTokenizer(reduce_len=True)
    counter = 0
    batch_counter = 0
    output = open(f_out, 'wb')
    output_hashtag = open(hashtag_fname, 'wb')
    batch_size = 500000
    tweet_batch = []
    hashtag_batch = []
    with gzip.open(f_in, 'r') as f:
        for tweet in f:
            tweet, hashtags = preprocess_tweet(tweet)
            if len(hashtags) == 1:
                ht = hashtags[0]
                alphabet_hashtags.add(ht)
                ht_idx = alphabet_hashtags.get(ht, UNKNOWN_HASHTAG_IDX)

                tweet = tweet.replace(ht, '')
                tweet_tok = tknzr.tokenize(tweet.decode('utf-8'))
                tweet_batch.append(tweet_tok)
                hashtag_batch.append(ht_idx)

                batch_counter += 1
                for token in tweet_tok:
                    alphabet_words.add(token)

                if batch_counter >= batch_size:
                    tweet_idx = convert2indices(tweet_batch, alphabet_words, dummy_word_idx)
                    np.save(output, tweet_idx)
                    np.save(output_hashtag, hashtag_batch)
                    print('Saved tweets:', tweet_idx.shape)
                    tweet_batch = []
                    hashtag_batch = []
                    batch_counter = 0
            counter += 1
            if counter % 1000000 == 0:
                print("Elements processed:", counter)

    # flush the final, partially filled batch
    tweet_idx = convert2indices(tweet_batch, alphabet_words, dummy_word_idx)
    np.save(output, tweet_idx)
    np.save(output_hashtag, hashtag_batch)
    print(len(alphabet_hashtags))
    print(len(alphabet_words))
    print('Saved tweets:', tweet_idx.shape)
    return counter
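store_file calls np.save repeatedly on one open handle, so reading the batches back means calling np.load on a single handle until the file runs out; a sketch under that assumption (the path name is hypothetical):

import numpy as np

def load_batches(path):
    batches = []
    with open(path, 'rb') as f:
        while True:
            try:
                batches.append(np.load(f))
            except (EOFError, ValueError):   # no more arrays in the file
                break
    return batches

# tweet_batches = load_batches('tweets_batched.npy')   # whatever was passed as f_out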
def inverse_indexing_once():
    kv_paperwords = lambda filehash: KeyValueStore('paperwords:' + filehash)
    scopes = KeyValueStore.scopes('paper:*')

    from nltk.tokenize import TweetTokenizer
    tokenizer = TweetTokenizer()

    def make_dict(text, weight=1., prefix_weight=0.):
        if not text:
            return {}
        words = tokenizer.tokenize(text.lower().strip())
        result = {}
        for word in words:
            for i in range(1, len(word)):
                prefix = word[:i]
                if prefix not in result:
                    result[prefix] = 0.
                result[prefix] += prefix_weight
            if word not in result:
                result[word] = 0.
            result[word] += weight
        return result

    def merge_dict(dict1, dict2):
        new_dict = {}
        for word in set(dict1.keys()).union(dict2.keys()):
            weight1 = dict1.get(word, 0.)
            weight2 = dict2.get(word, 0.)
            new_dict[word] = weight1 + weight2
        return new_dict

    for scope in scopes:
        filehash = scope[len('paper:'):]
        meta = KeyValueStore(scope_name=scope)
        title = meta['title']
        abstract = meta.get('abstract', default='')
        dict_title = make_dict(title, weight=6., prefix_weight=0.06)
        dict_abstract = make_dict(abstract, weight=2., prefix_weight=0.02)
        final_dict = merge_dict(dict_title, dict_abstract)
        authors = meta['authors']
        if authors:
            for author in authors:
                dict_author = make_dict(author['first_name'] + ' ' + author['last_name'])
                final_dict = merge_dict(dict_author, final_dict)
        kv_paperwords(filehash).update(final_dict)
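The core of make_dict is its weighting scheme: each full word gets the full weight while every proper prefix gets the much smaller prefix_weight, so prefix queries still score. A self-contained sketch with the project-specific KeyValueStore parts stripped out:

from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer()

def make_dict(text, weight=1., prefix_weight=0.):
    result = {}
    for word in tokenizer.tokenize(text.lower().strip()):
        for i in range(1, len(word)):
            result[word[:i]] = result.get(word[:i], 0.) + prefix_weight
        result[word] = result.get(word, 0.) + weight
    return result

print(make_dict('Deep Learning', weight=6., prefix_weight=0.06))
# {'d': 0.06, 'de': 0.06, 'dee': 0.06, 'deep': 6.0, 'l': 0.06, ..., 'learning': 6.0}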