def tweet_connotation(tweet):
    """Decide whether a tweet is generally positive or negative."""
    analyzer = SentimentIntensityAnalyzer()
    # break the tweet up into sentences and analyze each separately
    twtcontent = sent_tokenize(tweet)
    overall = {'compound': 0, 'neg': 0, 'neu': 0, 'pos': 0}
    for s in twtcontent:
        scores = analyzer.polarity_scores(s)
        # tally up each sentence's scores
        for k in scores:
            overall[k] += scores[k]
    # average it all together for the tweet as a whole
    for k in overall:
        overall[k] = round(overall[k] / len(twtcontent), 3)
    return overall
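# A minimal usage sketch for the function above, assuming NLTK's bundled VADER
# analyzer and punkt tokenizer (the original may use the standalone vaderSentiment
# package instead); the sample tweet text is made up for illustration.
from nltk.tokenize import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# requires: nltk.download('punkt') and nltk.download('vader_lexicon')

scores = tweet_connotation("I love this phone. The battery life is terrible.")
print(scores)  # dict of averaged 'compound', 'neg', 'neu', 'pos' scores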
def tokenize_into_opinion_units(text):
    output = []
    for sent in sent_tokenize(text):
        for output_str in sent.split(' but '):
            output.append(output_str)
    return output

# Take positive.csv and negative.csv and mix them into positiveandnegative.csv,
# with each opinion unit tagged with its booking.com sentiment.
# This is the data I tagged with Mechanical Turk (see the sketch below).
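# A rough sketch (not from the original source) of the mixing step described in the
# comments above: read positive.csv and negative.csv, split each review into opinion
# units with tokenize_into_opinion_units(), tag each unit with its sentiment label,
# and write positiveandnegative.csv. The column layout is an assumption.
import csv

def mix_positive_and_negative(pos_path='positive.csv', neg_path='negative.csv',
                              out_path='positiveandnegative.csv'):
    with open(out_path, 'w', newline='') as out_f:
        writer = csv.writer(out_f)
        for path, label in [(pos_path, 'positive'), (neg_path, 'negative')]:
            with open(path, newline='') as in_f:
                for row in csv.reader(in_f):
                    # assumed: the review text sits in the first column
                    for unit in tokenize_into_opinion_units(row[0]):
                        writer.writerow([unit, label])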
def ask_confirmation(self, best_matching_action):
    alternative_formulations = sent_tokenize(self.trigger_dict[best_matching_action])
    alternative_formulation = choice(alternative_formulations)
    self.speak("Excuse me, I didn't understand your request very well. Do you want me to " + alternative_formulation)
    answer = self.active_listen()
    if "no" in answer:
        self.speak("Please reformulate your request.")
        return 0
    if "yes" in answer:
        self.speak("Very good")
        return 1
def stem_and_tokenize_text(text):
    sents = sent_tokenize(text)
    tokens = list(itertools.chain(*[TreebankWordTokenizer().tokenize(sent) for sent in sents]))
    terms = [Term(token) for token in tokens]
    return filter(lambda term: not term.is_punctuation(), terms)
def convert_text2bin1(docs, writer):
    global counter
    for i, fi in enumerate(docs):
        with open(os.path.join(curdir, "input", "cnn", "stories", fi), 'r', encoding="UTF-8") as f:
            wholetext = f.read().lower()
        wholetext = re.sub(r'[^\x00-\x7F]+', '', wholetext)
        wholetext = re.sub(r"(\s?[\']\s+|\s+[\']\s?)", " ' ", wholetext)
        wholetext = re.sub(r'(\s?[\"]\s+|\s+[\"]\s?)', ' " ', wholetext)
        wholetext = re.sub(r"(\'[s]\s+)", " 's ", wholetext)
        wholetext = wholetext.replace(".", " . ")
        wholetext = wholetext.replace(",", " , ")
        wholetext = wholetext.replace('-', ' - ')
        wholetext = wholetext.replace('?', ' ? ')
        wholetext = wholetext.replace('(', '( ')
        wholetext = wholetext.replace(')', ' )')
        data = wholetext.split("@highlight")
        news = data[0]
        highlights = data[1].replace('\n\n', '')
        news = (" ".join(news.split('\n\n'))).strip()
        sentences = sent_tokenize(news)
        news = '<d> <p> ' + ' '.join(['<s> ' + sentence + ' </s>' for sentence in sentences]) + ' </p> </d>'
        highlights = '<d> <p> <s> ' + highlights + ' </s> </p> </d>'
        words = (news + " " + highlights).split()
        counter.update(words)
        tf_example = example_pb2.Example()
        tf_example.features.feature['article'].bytes_list.value.extend([(' '.join(news.split())).encode('utf-8')])
        tf_example.features.feature['abstract'].bytes_list.value.extend([(' '.join(highlights.split())).encode('utf-8')])
        tf_example_str = tf_example.SerializeToString()
        str_len = len(tf_example_str)
        writer.write(struct.pack('q', str_len))
        writer.write(struct.pack('%ds' % str_len, tf_example_str))
        if i % 3000 == 0:
            print(int((float(i) / len(docs)) * 100), "%")
    print((float(len(docs)) / len(docs)) * 100, "%...." "converted\n\n")
def convert_text2bin2(docs, writer):
    global counter
    for i, fi in enumerate(docs):
        with open(os.path.join(curdir, "input", "dailymail", "stories", fi), 'r', encoding="UTF-8") as f:
            wholetext = f.read().lower()
        wholetext = re.sub(r'[^\x00-\x7F]+', '', wholetext)
        wholetext = re.sub(r"(\s?[\']\s+|\s+[\']\s?)", " ' ", wholetext)
        wholetext = re.sub(r'(\s?[\"]\s+|\s+[\"]\s?)', ' " ', wholetext)
        wholetext = re.sub(r"(\'[s]\s+)", " 's ", wholetext)
        wholetext = wholetext.replace(".", " . ")
        wholetext = wholetext.replace(",", " , ")
        wholetext = wholetext.replace('-', ' - ')
        wholetext = wholetext.replace('?', ' ? ')
        wholetext = wholetext.replace('(', '( ')
        wholetext = wholetext.replace(')', ' )')
        data = wholetext.split("@highlight")
        news = data[0]
        try:
            # strip the "updated:" dateline if one is present
            news = news.split("updated:")[1]
            news = news[news.find('20') + 4:]
        except IndexError:
            pass
        news = (" ".join(news.split('\n'))).strip()
        highlights = data[1].replace('\n\n', '')
        news = (" ".join(news.split('\n\n'))).strip()
        sentences = sent_tokenize(news)
        news = '<d> <p> ' + ' '.join(['<s> ' + sentence + ' </s>' for sentence in sentences]) + ' </p> </d>'
        highlights = '<d> <p> <s> ' + highlights + ' </s> </p> </d>'
        words = (news + " " + highlights).split()
        counter.update(words)
        tf_example = example_pb2.Example()
        tf_example.features.feature['article'].bytes_list.value.extend([(' '.join(news.split())).encode('utf-8')])
        tf_example.features.feature['abstract'].bytes_list.value.extend([(' '.join(highlights.split())).encode('utf-8')])
        tf_example_str = tf_example.SerializeToString()
        str_len = len(tf_example_str)
        writer.write(struct.pack('q', str_len))
        writer.write(struct.pack('%ds' % str_len, tf_example_str))
        if i % 3000 == 0:
            print(int((float(i) / len(docs)) * 100), "%")
    print((float(len(docs)) / len(docs)) * 100, "%...." "converted\n\n")
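# A minimal usage sketch for the two converters above (assumptions, not from the
# original source): both expect module-level `curdir` and `counter` globals, the
# TensorFlow Example proto module, and a binary file handle opened for writing.
import collections, os, re, struct
from nltk.tokenize import sent_tokenize
from tensorflow.core.example import example_pb2  # assumed import path for the Example proto

curdir = os.getcwd()
counter = collections.Counter()

cnn_stories = os.listdir(os.path.join(curdir, "input", "cnn", "stories"))
with open("train.bin", "wb") as writer:  # output path is illustrative
    convert_text2bin1(cnn_stories, writer)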
def text_cleaner(data):
    paragraphs_ = ""
    try:
        keep_endings = ['.', '?']
        removals_ = open(join(settings.BASE_DIR, "aggregator", 'data', 'stop_sentences.txt'), 'r')
        removals = [r.replace('\n', '') for r in removals_]
        if data is not None:
            text = data.split('\n')
            paragraphs = []
            for p in text:
                if len(p) > settings.MINIMUM_PARAGRAPH:
                    paragraphs.append(p)
            for p in paragraphs:
                sentence_tokens = sent_tokenize(p)
                paragraph = ""
                for sentence in sentence_tokens:
                    if sentence[-1] in keep_endings:
                        if len(sentence) > settings.MINIMUM_SENTENCE:
                            # should remove most of the code:
                            if sentence[0].isupper():
                                if not any(to_remove in sentence for to_remove in removals):
                                    # eliminate some bad ending strings:
                                    if not sentence.endswith(('e.g.', 'i.e.')):
                                        paragraph += "{0} ".format(sentence)
                paragraphs_ += "<p>{0}</p>".format(paragraph)
    except Exception as e:
        print(colored.red("At text_cleaner {}".format(e)))
    return paragraphs_
def write_paragraph_lines(paragraph_lines):
    paragraph_str = ' '.join(paragraph_lines)
    for sent in sent_tokenize(paragraph_str):
        if lowercase:
            sent = sent.lower()
        output_file.write(' '.join(word_tokenize(sent)) + '\n')
def extract_target_context(self, paragraph, isolate_target_sentence):
    if isolate_target_sentence:
        for sent in sent_tokenize(paragraph):
            words, position = self.extract_context(sent)
            if words is not None:
                break
    else:
        words, position = self.extract_context(paragraph)
    return words, position
def doc_to_ids(self, doc, training=True):
    l = []
    words = dict()
    doc_sents = sent_tokenize(doc)
    for sentence in doc_sents:
        miniArray = []
        for term in sentence.split():
            id = self.term_to_id(term, training)
            if id is not None:
                miniArray.append(id)
                if id not in words:
                    words[id] = 1
                    # counts how many documents a word appears in; if it appears in
                    # only a few, it can be removed from the vocabulary with cut_low_freq()
                    self.docfreq[id] += 1
        l.append(np.array(miniArray, dtype=np.int32))
    return l
def text2sentences(text):
    '''Tokenize text into sentence tokens.'''
    content = '\n'.join([open(f).read() for f in text])
    sentences = []
    try:
        sentences = sent_tokenize(content)
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(s.strip()) for s in sentences]
def make_phrases(self, start=1, end=None):
    if not end: end = start + 1
    for chain_len in range(start, end):  # +1 because of the way range works
        self.phrases[chain_len] = []
        for f in self.everything['input']:
            for line in sent_tokenize(self.everything['input'][f]):
                words = word_tokenize(line)
                for chain in self._make_chains(words, chain_len):
                    try:
                        # print "ERROR.0:", chain
                        chain = chain[:-1]  # drop the last item, as it is the "value" for the markov step
                        chain = [c for c in chain if c is not None]  # quick clean, as None breaks join
                    except:
                        print "ERROR.1:", chain
                        # sys.exit(-1)
                    # print chain_len, " => ", chain
                    try:
                        self.phrases[chain_len].append(" ".join(chain))
                    except:
                        print "ERROR.2:", chain
                        sys.exit(-1)
    return Counter(self.phrases[chain_len])
def buildGraph(text):
    vertices = []
    sentences = sent_tokenize(text, language='english')
    for sentence_raw in sentences:
        sentence_processed = sub("[^a-zA-Z ]+", '', sentence_raw).lower()
        words = word_tokenize(sentence_processed, language='english')
        vertices.append(vertex(sentence_raw, sentence_processed, words))
    for v1 in vertices:
        for v2 in vertices:
            if v1.order != v2.order:
                v1.scores.append(overlap(v1.words, v2.words))
        v1.averageScores()
    return vertices
def updateSentiment(dbLoc, tableName):
    sid = SentimentIntensityAnalyzer()
    conn = sqlite3.connect(dbLoc)
    cursor = conn.execute("SELECT * from %s" % tableName)
    # Go through every tweet in the table
    for row in cursor:
        text = cleanTweet(row[TWEET_INDEX])
        #blob = TextBlob(text)
        sent = 0.0
        count = 0
        sentList = tokenize.sent_tokenize(text)
        # Go through each sentence in the tweet
        for sentence in sentList:
            count += 1
            ss = sid.polarity_scores(sentence)
            sent += ss['compound']  # Tally up the overall sentiment
        if count != 0:
            sent = float(sent / count)
        # Write the averaged sentiment back to the DB
        conn.execute("UPDATE " + tableName + " set SENTIMENT = ? where ID = ?",
                     (sent, row[ID_INDEX]))
    conn.commit()
    conn.close()
def getSentiment(tweet):
    sid = SentimentIntensityAnalyzer()
    tweet = cleanTweet(tweet)
    sent = 0.0
    count = 0
    sentList = tokenize.sent_tokenize(tweet)
    # Go through each sentence in the tweet
    for sentence in sentList:
        count += 1
        ss = sid.polarity_scores(sentence)
        sent += ss['compound']  # Tally up the overall sentiment
    if count != 0:
        sent = float(sent / count)
    return sent
def _preprocess(self, text):
    """Return a list of lists. Each list is a preprocessed sentence of
    text in bag-of-words format."""
    stemmer = PorterStemmer()
    self._sents = sent_tokenize(text)
    # tokenize sentences
    word_sents = [word_tokenize(sent.lower()) for sent in self._sents]
    # remove stop-words and stem words
    word_sents = [[stemmer.stem(word) for word in sent if
                   word not in self._stopwords] for sent in word_sents]
    return word_sents
def text_to_sentences(self, text, tokenizer, remove_stopwords=False):
    # Split a review into parsed sentences. Returns a list of
    # sentences, where each sentence is a list of words.
    print "text_to_sentence"
    text = text.decode("utf8")
    from nltk.tokenize import sent_tokenize, wordpunct_tokenize
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    #raw_sentences = tokenizer.tokenize(text.strip())
    raw_sentences = sent_tokenize(text.strip())
    print "finish tokenize sentence", len(raw_sentences)
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, tokenize it into a list of words
            #sentences.append(text_to_wordlist(raw_sentence, remove_stopwords))
            print raw_sentence
            sentences.append(wordpunct_tokenize(raw_sentence))
            print wordpunct_tokenize(raw_sentence)
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
def create_example(text):
    raw_sentences = sent_tokenize(text)
    sentences = [word_tokenize(s) for s in raw_sentences]
    speakers = [["" for _ in sentence] for sentence in sentences]
    return {
        "doc_key": "nw",
        "clusters": [],
        "sentences": sentences,
        "speakers": speakers,
    }
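# A short usage sketch (illustrative, not from the original source): serialize the
# dict produced by create_example() as a single JSON line, the kind of jsonlines
# record commonly fed to coreference-resolution pipelines. The input text and
# output path are made up.
import json
from nltk.tokenize import sent_tokenize, word_tokenize

example = create_example("Barack Obama was born in Hawaii. He was elected in 2008.")
with open("example.jsonlines", "w") as f:
    f.write(json.dumps(example) + "\n")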
def getSentences(paragraph):
    """
    Extracts sentences from a paragraph.
    :param paragraph: (str) paragraph text
    :returns: list of sentences, and a dict mapping sentence index to sentence
    """
    indexed = {}
    sentenceList = tokenize.sent_tokenize(paragraph)
    for i, s in enumerate(sentenceList):
        indexed[i] = s
    return sentenceList, indexed