def tokenize_text(sample_text):
    """Clean, tokenize and (optionally) stopword-filter a raw text sample."""
    global sequence_lengths
    processed_text = []
    if cfg.remove_punctuation:
        cleaned = sample_text.lower().translate(t_table)
    else:
        cleaned = sample_text
    if cfg.use_casual_tokenizer:
        tokens = tknzr.tokenize(cleaned)
    else:
        tokens = nltk.word_tokenize(cleaned, language='english')
    if cfg.remove_stopwords:
        stop_set = set(stopwords.words('english'))  # build the set once, not per token
        tokens = [w for w in tokens if w not in stop_set]
    sequence_lengths.append(len(tokens))
    processed_text.extend(tokens)
    return processed_text
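The function above reads several module-level objects that are not shown in this excerpt. A minimal sketch of how they might be wired up, assuming a simple namespace-style config; the names cfg, t_table, tknzr and sequence_lengths come from the code, but the concrete values are illustrative assumptions:

# Hypothetical setup for the globals tokenize_text expects.
import string
from types import SimpleNamespace

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

cfg = SimpleNamespace(remove_punctuation=True,
                      use_casual_tokenizer=True,
                      remove_stopwords=False)
t_table = str.maketrans('', '', string.punctuation)  # translation table that drops punctuation
tknzr = TweetTokenizer()                             # the "casual" tokenizer
sequence_lengths = []                                # filled in as texts are tokenized

print(tokenize_text("Hello there, world! This is a quick test."))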
Python tokenize() usage examples (source code)
def demo_sent_subjectivity(text):
    """
    Classify a single sentence as subjective or objective using a stored
    SentimentAnalyzer.

    :param text: a sentence whose subjectivity has to be classified.
    """
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import regexp
    word_tokenizer = regexp.WhitespaceTokenizer()
    try:
        sentim_analyzer = load('sa_subjectivity.pickle')
    except LookupError:
        print('Cannot find the sentiment analyzer you want to load.')
        print('Training a new one using NaiveBayesClassifier.')
        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    # Tokenize and convert to lower case
    tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
    print(sentim_analyzer.classify(tokenized_text))
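Assuming the surrounding module provides load (i.e. nltk.data.load) and demo_subjectivity, as in NLTK's sentiment utilities, a call looks like this; the first run may train a classifier and needs the subjectivity corpus:

import nltk
nltk.download('subjectivity')  # needed only if a new analyzer has to be trained

demo_sent_subjectivity("The plot was gripping and the acting felt sincere.")
# prints the predicted label, typically 'subj' or 'obj'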
def get_sentences(text=''):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenizer.tokenize(text)
    return sentences
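get_sentences loads the Punkt model on every call, so the model must be available locally; a short usage example:

import nltk
nltk.download('punkt')  # one-time download of the Punkt sentence model

for sent in get_sentences("Dr. Smith went to Washington. He arrived on Monday."):
    print(sent)
# -> Dr. Smith went to Washington.
# -> He arrived on Monday.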
def get_input_sequence(sentence):
    """
    Prepare chatbot's input by tokenizing the sentence and adding the necessary punctuation marks.

    Input:  "So what's up, buddy"
    Output: ["so", "what", "'", "s", "up", ",", "buddy", ".", "$$$"]
    """
    if not sentence:
        return [START_TOKEN, EOS_SYMBOL]
    # Add a dot to the end of the sentence in case there is no punctuation mark.
    if sentence[-1] not in _PUNKT_MARKS:
        sentence += '.'
    sequence = [START_TOKEN] + tokenize(sentence) + [EOS_SYMBOL]
    return sequence
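The constants START_TOKEN, EOS_SYMBOL, _PUNKT_MARKS and the tokenize helper belong to the surrounding chatbot module; the stand-ins below are assumptions chosen only to make the function callable:

# Assumed stand-ins for illustration; the real project defines its own symbols
# and tokenizer (the docstring suggests a lowercasing tokenizer and '$$$').
from nltk.tokenize import word_tokenize as tokenize

START_TOKEN = '<s>'    # assumed start-of-sequence marker
EOS_SYMBOL = '$$$'     # assumed end-of-sequence marker
_PUNKT_MARKS = {'.', '!', '?'}

print(get_input_sequence("So what's up, buddy"))
# e.g. ['<s>', 'So', 'what', "'s", 'up', ',', 'buddy', '.', '$$$']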
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using the Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1)   # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)   # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    else:
        print('Neutral')

    if plot:
        _show_plot(x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive'])
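A quick invocation (the opinion lexicon has to be downloaded once; passing plot=True additionally relies on the module's _show_plot helper and matplotlib):

import nltk
nltk.download('opinion_lexicon')  # Liu & Hu positive/negative word lists

demo_liu_hu_lexicon("The food was great but the service was terribly slow")
# prints 'Positive', 'Negative' or 'Neutral' depending on the word counts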
def handle_multiple_sentences(infile, outfile):
    # Python 2 code: note the str.decode() call and the "print >>" redirection below.
    titles = []
    f = open(infile, "r")
    f2 = codecs.open(outfile, "w+", "utf-8")
    for line in f:
        line = line.decode("utf-8")
        sentences = sent_detector.tokenize(line.strip())
        for i in range(len(sentences)):
            # Capitalize the first word of every sentence; for all but the first
            # sentence, also mark the end of the previous one with " ::::".
            sentences[i] = sentences[i].replace(sentences[i].split()[0], sentences[i].split()[0].title())
            if i > 0:
                sentences[i - 1] = sentences[i - 1].replace(sentences[i - 1].split()[-1][-1], " ::::")
        titles.append(" ".join(sentences))
    title_set = set(titles)
    for l in title_set:
        print >> f2, l
Source: preprocessed_data.py, project diversity_based_attention (author: PrekshaNema25)
def preprocess(s, max_tokens):
    # s = unicode(s, ignore="errors")
    s = s.lower()
    s = re.sub(r'[^\x00-\x7F]+', ' ', s)  # drop non-ASCII characters
    s = re.sub("<s>", "", s)
    s = re.sub("<eos>", "", s)
    s = remove_punctuation(s)
    s = re.sub(r'\d', '#', s)             # mask digits
    s = re.sub(r'\n', ' ', s)
    s = re.sub(',', ' ', s)
    tokens = WhitespaceTokenizer().tokenize(s)
    # s = replace_the_unfrequent(tokens)
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
    s = " ".join(tokens)
    return s, len(tokens)
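preprocess expects a remove_punctuation helper and a WhitespaceTokenizer import from the enclosing module; a hedged stand-in plus a sample call:

# Assumed imports and helper for illustration only; the project ships its own
# remove_punctuation.
import re
import string
from nltk.tokenize import WhitespaceTokenizer

def remove_punctuation(text):
    # Plausible stand-in: strip ASCII punctuation characters.
    return text.translate(str.maketrans('', '', string.punctuation))

cleaned, n_tokens = preprocess("<s> The 3 quick brown foxes, obviously! <eos>", max_tokens=50)
print(cleaned, n_tokens)  # e.g. "the # quick brown foxes obviously" 6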
def analysis(reviews_collection_text):
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        raw_data = f.read()
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        comments = f.readlines()
    data = raw_data.replace('\n', ' ')
    data_lower = data.lower()
    tokens_with_punc = word_tokenize(data_lower)
    tokens = RegexpTokenizer(r'\w+').tokenize(data_lower)
    print("--- Most frequent tokens ---\n",
          FreqDist(tokens_with_punc).most_common(15))
    print("--- Tokens without punctuation ---\n",
          FreqDist(tokens).most_common(15))
    stop = set(stopwords.words('english'))
    words = [word for word in tokens if word not in stop]
    print("--- Most frequent words ---\n", FreqDist(words).most_common(15))
    tagged = pos_tag(words)
    nouns = [word for word, pos in tagged if pos == 'NN']
    print("--- Most frequent nouns ---\n", FreqDist(nouns).most_common(15))
    adjts = [word for word, pos in tagged if pos == 'JJ']
    print("--- Most frequent adjective ---\n", FreqDist(adjts).most_common(15))
    tokns = [RegexpTokenizer(r'\w+').tokenize(comment) for comment in comments]
    lxdst = [lexical_density(token) for token in tokns if len(token) > 0]
    avgld = sum(lxdst) / len(comments)
    print("--- Average lexical density ---\n", avgld)
def parts_of_speechtag(self, sentences=""):
    from nltk.corpus import state_union                # already-stored data to train on
    from nltk.tokenize import PunktSentenceTokenizer   # the trainable Punkt sentence tokenizer
    training_text = state_union.raw("2005-GWBUSH.txt")  # training set from the State of the Union corpus
    sample_text = sentences
    # Train the Punkt tokenizer on the training text (unsupervised learning).
    custom_sentence_tokenized = PunktSentenceTokenizer(train_text=training_text)
    # Tokenize the sample text with the freshly trained model.
    tokenization_unsupervised = custom_sentence_tokenized.tokenize(str(sample_text))
    # print(tokenization_unsupervised)        # just for debugging purposes
    # print(type(tokenization_unsupervised))  # checking the type of the sentences
    # Hand the tokenized sentences over to the POS-tagging step.
    self.processing_POS_tokenization(tokenization_unsupervised=tokenization_unsupervised)
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
    tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)
    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes, i.e. no downloads.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)
    # http://www.nltk.org/book/ch05.html
def normalize(self, text):
    return [self.stemmer.stem(token)
            for token in self.tokenizer.tokenize(text.lower())
            if token not in self.stop_words]

######### defining a default normalizer ##########
def normalize(self, text):
    return [token for token in self.tokenizer.tokenize(text.lower())
            if token not in self.stop_words]
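Both normalize variants assume an object that carries tokenizer, stemmer and stop_words attributes; a minimal wrapper class showing that wiring (the class name and attribute choices are assumptions, and the stopwords corpus must be downloaded):

# Hypothetical wrapper illustrating the attributes the normalize methods expect.
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TreebankWordTokenizer

class Normalizer:
    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()
        self.stemmer = SnowballStemmer('english')
        self.stop_words = set(stopwords.words('english'))

    def normalize(self, text):
        # Stemming variant, as in the first definition above.
        return [self.stemmer.stem(token)
                for token in self.tokenizer.tokenize(text.lower())
                if token not in self.stop_words]

print(Normalizer().normalize("The cats are chasing the mice."))
# e.g. ['cat', 'chase', 'mice', '.']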
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
def get_tweet_tags(tweet):
    """ Break up a tweet into individual word parts """
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    # replace handles with real names
    for n, tok in enumerate(tokens):
        if tok.startswith('@'):
            handle = tok.strip("@")
            if handle in user.students:
                # If we have a database entry for the mentioned user, we can
                # easily substitute a full name.
                usr = user.NPUser(handle)
                tokens[n] = usr.fullname
            else:
                # If there is no database entry, we use the user's alias. While
                # this is the full name in many cases, it is often not reliable.
                usr = api.get_user(handle)
                tokens[n] = usr.name
    tagged = nltk.pos_tag(tokens)
    # In nltk, if a teacher's name is written with a period after an
    # abbreviated prefix, it is awkwardly broken up into 3 tags.
    for n, tag in enumerate(tagged):
        # If there is the weird period after the prefix,
        if tag[1] == '.':
            # and it is in fact splitting up a person's name,
            if tagged[n - 1][1] == 'NNP' and tagged[n + 1][1] == 'NNP':
                if tagged[n - 1][0] in ['Mr', 'Ms', 'Mrs', 'Mx']:
                    # combine it into the actual name,
                    tagged[n - 1] = ('{}. {}'.format(tagged[n - 1][0],
                                                     tagged[n + 1][0]), 'NNP')
                    # and then remove the extra tags.
                    del tagged[n + 1]
                    del tagged[n]
    return tagged
def tokenize(data):
    sent_tokenize = nltk.tokenize.sent_tokenize
    tokenizer = nltk.tokenize.RegexpTokenizer(u"[\s\.,-?!'\"??\d·•—()׫»%\[\]|?*]+", gaps=True)
    word_tokenize = tokenizer.tokenize
    for text, blockname, textname in data:
        sentences = sent_tokenize(text.strip())
        for sentence in sentences:
            words = word_tokenize(sentence)
            for word in words:
                if len(word) > 1:
                    yield (word, sentence, blockname, textname)
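The generator consumes (text, blockname, textname) triples and yields one tuple per token longer than one character (sentence splitting relies on NLTK's punkt model); a small driving example:

data = [
    ("First sentence here. And then a second one follows.", "block-1", "doc-a"),
]
for word, sentence, blockname, textname in tokenize(data):
    print(word, '|', blockname, textname)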
def tokenize(self, text):
    """
    Tokenize text into a list of Token objects.

    :param text: text to be tokenized (might contain several sentences)
    :type text: str
    :return: list of Token objects
    :rtype: list(Token)
    """
    tokens = []
    if self.tokenizer_type == "SpaceTokenizer":
        operator = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
        for counter, span in enumerate(operator.span_tokenize(text)):
            new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
            tokens.append(new_token)
    elif self.tokenizer_type == "NLTKWhiteSpaceTokenizer":
        operator = WhitespaceTokenizer()
        for counter, span in enumerate(operator.span_tokenize(text)):
            new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
            tokens.append(new_token)
    elif self.tokenizer_type == "PTBTokenizer":
        ptb_tokens = word_tokenize(text)
        counter = 0
        for token, span in self._penn_treebank_tokens_with_spans(text, ptb_tokens):
            new_token = Token(counter, token, span[0], span[1])
            counter += 1
            tokens.append(new_token)
    return tokens
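The Token objects built here are constructed with an index, the surface form and its character span; a minimal stand-in consistent with those constructor calls (the real class in the project likely carries more fields):

# Assumed minimal Token definition: Token(index, text, start_offset, end_offset).
class Token:
    def __init__(self, index, text, start, end):
        self.index = index
        self.text = text
        self.start = start
        self.end = end

    def __repr__(self):
        return "Token(%d, %r, %d, %d)" % (self.index, self.text, self.start, self.end)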
def __tokenizeWords(sentence):
    return nltk.tokenize.word_tokenize(sentence)

## tests ########################################################################################
def __init__(self, itemId, questionType, answerType, question, answer, V, WordIDMap):
    self.itemId = itemId
    self.questionType = questionType
    self.answerType = answerType
    self.question = question
    self.answer = answer
    self.Question = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(question)
                     if stemmer.stem(word) in WordIDMap]
    self.Answer = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(answer)
                   if stemmer.stem(word) in WordIDMap]
    self.qFeature = {}
    self.aFeature = {}
    self.create_QAFeature()
def __init__(self, itemId, Review, V, WordIDMap, ReviewObj):
    self.itemId = itemId
    self.sent = Review
    self.rObj = ReviewObj
    self.Sent = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(Review)
                 if stemmer.stem(word) in WordIDMap]
    self.sFeature = {}
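Both constructors lean on module-level stemmer and tokenizer objects plus a word-to-id map built elsewhere; an assumed setup for illustration:

# Assumed module-level objects used by the constructors above.
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+')
# WordIDMap maps stemmed words to integer ids, e.g. built from a training vocabulary:
# WordIDMap = {stemmer.stem(w): i for i, w in enumerate(vocabulary)}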
def sents(self, fileids=None, categories=None):
    """
    Uses the built-in sentence tokenizer to extract sentences from the
    paragraphs. Note that this method uses BeautifulSoup to parse HTML.
    """
    for paragraph in self.paras(fileids, categories):
        for sentence in self._sent_tokenizer.tokenize(paragraph):
            yield sentence

def words(self, fileids=None, categories=None):
    """
    Uses the built-in word tokenizer to extract tokens from sentences.
    Note that this method uses BeautifulSoup to parse HTML content.
    """
    for sentence in self.sents(fileids, categories):
        for token in self._word_tokenizer.tokenize(sentence):
            yield token
def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and returns a dictionary with a
    variety of metrics concerning the state of the corpus.
    """
    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()
    started = time.time()

    # Perform a single pass over paragraphs: tokenize and count.
    for para in self.paras(fileids, categories):
        counts['paras'] += 1
        for sent in self._sent_tokenizer.tokenize(para):
            counts['sents'] += 1
            for word in self._word_tokenizer.tokenize(sent):
                counts['words'] += 1
                tokens[word] += 1

    # Compute the number of files and categories in the corpus.
    n_fileids = len(self._resolve(fileids, categories) or self.fileids())
    n_topics = len(self.categories(self._resolve(fileids, categories)))

    # Return a data structure with the information.
    return {
        'files': n_fileids,
        'topics': n_topics,
        'paras': counts['paras'],
        'sents': counts['sents'],
        'words': counts['words'],
        'vocab': len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
        'ppdoc': float(counts['paras']) / float(n_fileids),
        'sppar': float(counts['sents']) / float(counts['paras']),
        'secs': time.time() - started,
    }