def tokenize(text):
    # text = NB.remove_punctuation(text)
    # normalize to lowercase ASCII; handle both byte strings and unicode (Python 2)
    try:
        text = text.decode('utf-8').encode('ascii', 'replace').strip().lower()
    except (UnicodeError, AttributeError):
        text = text.encode('ascii', 'replace').strip().lower()
    # split punctuation into separate tokens, but keep in-word apostrophes so "don't" stays whole
    word = [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]
    biword = list(nltk.bigrams(word))
    triword = list(nltk.trigrams(word))
    # word = [w for w in word if w not in stopwords.words('english')]
    return word  # or return triword for trigram features
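The snippet above assumes module-level `re`, `nltk`, and a Porter stemmer bound to `porter`. A self-contained sketch of the same stem-then-ngram pipeline (the setup lines are assumptions, and the Python 2 byte/unicode normalization is skipped):

import re
import nltk
from nltk.stem import PorterStemmer

porter = PorterStemmer()

text = "the ice-caps are melting, don't you think?"
word = [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]
print(list(nltk.bigrams(word)))    # adjacent stemmed-token pairs
print(list(nltk.trigrams(word)))   # adjacent stemmed-token triples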
Python bigrams() usage examples
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    features = {}
    # compute the document's bigrams once instead of once per lookup
    document_bigrams = set(nltk.bigrams(document))
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in document_bigrams
    return features
#////////////////////////////////////////////////////////////
#{ Helper Functions
#////////////////////////////////////////////////////////////
def createbigramvocabulary(reviewfile, vocabfile):
    createvocabulary(reviewfile, vocabfile)
    finput = open(reviewfile, "r")
    foutput = open(vocabfile, "a")
    all_bigrams = []
    for line in finput:
        # pad each review with start ('*') and end ('$') markers;
        # line[1:] skips the leading character (presumably the +/- sentiment label)
        tokenized_line = ['*']
        tokenized_line.extend(word_tokenize(line[1:]))
        tokenized_line.append('$')
        all_bigrams.extend(bigrams(tokenized_line))
    c = Counter(all_bigrams)
    for b in c:
        # keep bigrams seen at least 3 times, excluding the label tokens
        if b[0] != "+" and b[0] != "-" and c[b] >= 3:
            foutput.write(b[0] + " " + b[1] + "\n")
    finput.close()
    foutput.close()
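The padding-and-counting step in isolation, as a runnable sketch (assuming NLTK's punkt tokenizer data is installed; the sample line is invented):

from collections import Counter
from nltk import bigrams, word_tokenize

line = "+ a great and moving film"
padded = ['*'] + word_tokenize(line[1:]) + ['$']   # '*' and '$' mark sentence start/end
print(Counter(bigrams(padded)).most_common(3))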
def posTrigramsScore(trigrams, category, pos_tags_trigrams, labels):
    # keep the POS-tag trigrams belonging to the given category
    trigrams_category = subList(pos_tags_trigrams, labels, category)
    # initialize dictionary
    d = {}
    # calculate the score of every trigram
    for trigram in trigrams:
        d[trigram] = score(trigram, category, trigrams_category, pos_tags_trigrams)
    return d
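`subList` is not included in this snippet; presumably it keeps the items whose label matches the given category. A hypothetical sketch:

def subList(items, labels, category):
    # keep only the items whose corresponding label equals category
    return [item for item, label in zip(items, labels) if label == category]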
def __init__(self, lexicon):
    # initialize two dictionaries (unigrams and bigrams)
    self.d_unigrams = {}
    self.d_bigrams = {}
    # select which lexicon to load
    if lexicon == 0:
        self.loadHashtagLexicon1()
    elif lexicon == 1:
        self.loadHashtagLexicon2()
    elif lexicon == 2:
        self.loadMaxDiffTwitterLexicon()
    elif lexicon == 3:
        self.loadSentiment140Lexicon1()
    elif lexicon == 4:
        self.loadSentiment140Lexicon2()
    elif lexicon == 5:
        self.loadEmotionLexicon()
    else:
        print("Lexicon unavailable, please load another one")
def loadUnigrams(self, path, reverse=False):
    f = open(path)
    for line in f.readlines():
        # strip the trailing newline so dictionary keys stay clean
        fields = line.decode('utf8').rstrip('\n').split("\t")
        if reverse:
            # file format is score<TAB>term
            self.d_unigrams[fields[1]] = float(fields[0])
        else:
            # file format is term<TAB>score
            self.d_unigrams[fields[0]] = float(fields[1])
    f.close()
#load bigrams lexicon
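`loadBigrams` itself is missing from this snippet. By analogy with `loadUnigrams`, it presumably parses a tab-separated "word1 word2<TAB>score" file into tuple keys, which is what `score` below looks up. A sketch, not the original code:

def loadBigrams(self, path):
    f = open(path)
    for line in f.readlines():
        fields = line.decode('utf8').rstrip('\n').split("\t")
        words = fields[0].split()
        # tuple keys so lookups match the tuples produced by nltk.bigrams
        self.d_bigrams[(words[0], words[1])] = float(fields[1])
    f.close()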
def score(self, tokens):
    total = 0.0
    # score for unigrams
    for token in tokens:
        total += self.d_unigrams.get(token, 0.0)
    # score for bigrams, if a bigram lexicon is loaded
    if len(self.d_bigrams) > 0:
        # unique bigrams of the message
        for bigram in set(bigrams(tokens)):
            total += self.d_bigrams.get(bigram, 0.0)
    return total
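Assuming the enclosing class is the NRCLexicon referenced by the loaders, typical usage would be (hypothetical; paths resolve relative to NRCLexicon.directory):

lex = NRCLexicon(2)                     # MaxDiff Twitter lexicon
print(lex.score(['good', 'morning']))   # sum of per-token (and bigram) scores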
def getTrigramsSet(pos_trigrams):
    # collect the distinct trigrams across all items
    s = set()
    for x in pos_trigrams:
        for trigram in x:
            s.add(trigram)
    return list(s)
#calculate bigrams of every item of the list l
def getBigrams(l):
    b = []
    for x in l:
        b.append(list(bigrams(x)))
    return b
#calculate trigrams of every item of the list l
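The trigram counterpart announced by the comment above is not included in this snippet; by symmetry with getBigrams (and assuming `from nltk import trigrams`), it would presumably be:

def getTrigrams(l):
    t = []
    for x in l:
        t.append(list(trigrams(x)))
    return t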
def posBigramsScore(bigrams, category, pos_tags_bigrams, labels):
    # keep the POS-tag bigrams belonging to the given category
    bigrams_category = subList(pos_tags_bigrams, labels, category)
    # initialize dictionary
    d = {}
    # calculate the score of every bigram
    for bigram in bigrams:
        d[bigram] = score(bigram, category, bigrams_category, pos_tags_bigrams)
    return d
def loadHashtagLexicon2(self):
    folder = "NRC-Hashtag-Sentiment-Lexicon-v0.1/"
    file1 = "unigrams-pmilexicon.txt"
    file2 = "bigrams-pmilexicon.txt"
    # clear previous dictionaries
    self.clearDictionaries()
    # load unigrams
    self.loadUnigrams(NRCLexicon.directory + folder + file1)
    # load bigrams
    self.loadBigrams(NRCLexicon.directory + folder + file2)
# MaxDiff-Twitter-Lexicon
def loadMaxDiffTwitterLexicon(self):
    folder = "MaxDiff-Twitter-Lexicon/"
    file1 = "Maxdiff-Twitter-Lexicon_-1to1.txt"
    # clear previous dictionaries
    self.clearDictionaries()
    # load unigrams - reverse=True because this file stores score before term
    self.loadUnigrams(NRCLexicon.directory + folder + file1, True)
    # this lexicon has no bigrams, so d_bigrams stays empty
# Sentiment140AffLexNegLex
def loadSentiment140Lexicon1(self):
    folder = "Sentiment140AffLexNegLex/"
    file1 = "S140-AFFLEX-NEGLEX-unigrams.txt"
    file2 = "S140-AFFLEX-NEGLEX-bigrams.txt"
    # clear previous dictionaries
    self.clearDictionaries()
    # load unigrams
    self.loadUnigrams(NRCLexicon.directory + folder + file1)
    # load bigrams
    self.loadBigrams(NRCLexicon.directory + folder + file2)
# Sentiment140-Lexicon-v0.1
def loadSentiment140Lexicon2(self):
    folder = "Sentiment140-Lexicon-v0.1/"
    file1 = "unigrams-pmilexicon.txt"
    file2 = "bigrams-pmilexicon.txt"
    # clear previous dictionaries
    self.clearDictionaries()
    # load unigrams
    self.loadUnigrams(NRCLexicon.directory + folder + file1)
    # load bigrams
    self.loadBigrams(NRCLexicon.directory + folder + file2)
def words2bigrams(sep, tokens):
    '''Tokenize words into bigrams. A bigram is a pair of adjacent word tokens.
    Punctuation is treated as a separate token.'''
    content = read_tokens(tokens)
    bigrams = []
    try:
        bigrams = list(nltk.bigrams(content))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    for bigram in bigrams:
        output(sep.join(bigram))
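The click decorators were stripped from the snippet above. A self-contained sketch of how such a command could be wired up (the command name and option are assumptions, not from the source):

import click
import nltk

@click.command()
@click.option('--sep', default=' ', help='String used to join the two words of a bigram.')
@click.argument('tokens', type=click.File('r'))
def words2bigrams_cli(sep, tokens):
    # one token per line in, one sep-joined bigram per line out
    content = tokens.read().split()
    for bigram in nltk.bigrams(content):
        click.echo(sep.join(bigram))

if __name__ == '__main__':
    words2bigrams_cli()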
def bigram_predict(testSet, PP, PN, positive_probabilities, negative_probabilities, unseen_pos_prob, unseen_neg_prob):
    predicted_class = []
    for review in testSet:
        # start from the log priors of each class
        negative_probab = math.log10(PN)
        positive_probab = math.log10(PP)
        # pad the review with start ('*') and end ('$') markers
        review_words = ['*']
        review_words.extend(word_tokenize(review))
        review_words.append('$')
        review_bigrams = bigrams(review_words)
        for bigram in review_bigrams:
            w = bigram[0] + " " + bigram[1]
            if w in negative_probabilities and w in positive_probabilities:
                # seen bigram: use its class-conditional probabilities
                negative_probab += math.log10(negative_probabilities[w])
                positive_probab += math.log10(positive_probabilities[w])
            elif bigram[0] in negative_probabilities and bigram[0] in positive_probabilities:
                # back off to the first word's unigram probabilities
                negative_probab += math.log10(negative_probabilities[bigram[0]])
                positive_probab += math.log10(positive_probabilities[bigram[0]])
            else:
                # completely unseen: fall back to the unseen-token probabilities
                negative_probab += math.log10(unseen_neg_prob)
                positive_probab += math.log10(unseen_pos_prob)
        if negative_probab > positive_probab:
            result = '-'
        else:
            result = '+'
        predicted_class.append(result)
    return predicted_class
def get_valid_bigram_words(self, words):
    _words = []
    digit_re = re.compile(r"\d+")
    for w1, w2 in nltk.bigrams(words):
        # both words must meet the minimum length
        if len(w1) < self.min_len or len(w2) < self.min_len:
            continue
        # optionally drop bigrams containing stopwords
        if self.exclude_stopwords and (w1 in config.STOP_WORDS or w2 in config.STOP_WORDS):
            continue
        # optionally drop bigrams containing digits
        if self.skip_digit and (digit_re.search(w1) or digit_re.search(w2)):
            continue
        _words.append(w1 + " " + w2)
    return _words
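The method depends on instance attributes and a `config` module not shown here. A standalone equivalent for experimentation (stopword set is a stand-in, not the project's list):

import re
import nltk

STOP_WORDS = {'the', 'a', 'of'}      # stand-in for config.STOP_WORDS
DIGIT_RE = re.compile(r"\d+")

def valid_bigram_words(words, min_len=2, exclude_stopwords=True, skip_digit=True):
    out = []
    for w1, w2 in nltk.bigrams(words):
        if len(w1) < min_len or len(w2) < min_len:
            continue
        if exclude_stopwords and (w1 in STOP_WORDS or w2 in STOP_WORDS):
            continue
        if skip_digit and (DIGIT_RE.search(w1) or DIGIT_RE.search(w2)):
            continue
        out.append(w1 + " " + w2)
    return out

print(valid_bigram_words(['the', 'ice', 'caps', 'melted', 'in', '2016']))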
def Markov_generate_unigram(seed):
    # `data` is a module-level list of tokens
    seed = ''.join(seed)
    next_word_list = []
    for i, token in enumerate(data):
        # collect the word following each occurrence of the seed;
        # guard against a match on the last token
        if seed == token and i + 1 < len(data):
            next_word_list.append(data[i + 1])
    if len(next_word_list) == 0:
        return nltk.bigrams(["you", "are"])
    cfdist = nltk.FreqDist(next_word_list)
    next_word = cfdist.max()
    return nltk.bigrams([seed, next_word])
def Markov_generate_bigrams(tuples):
    # collect the word that follows each occurrence of the bigram in data
    next_words = []
    for i, bigram in enumerate(nltk.bigrams(data)):
        # i + 2 indexes the word after this bigram; guard the end of data
        if bigram == tuples and i + 2 < len(data):
            next_words.append(data[i + 2])
    return next_words
def calc_cfd(doc):
    # Calculate the conditional frequency distribution of bigrams.
    # Mecab is presumably KoNLPy's Korean POS tagger (from konlpy.tag import Mecab).
    words = [w for w, t in Mecab().pos(doc)]
    bigrams = nltk.bigrams(words)
    return nltk.ConditionalFreqDist(bigrams)
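A ConditionalFreqDist keyed on the first word of each bigram supports simple next-word lookup. For example (English tokens used so it runs without Mecab):

import nltk

words = "the cat sat on the mat and the cat slept".split()
cfd = nltk.ConditionalFreqDist(nltk.bigrams(words))
print(cfd['the'].max())   # -> 'cat', the most frequent successor of 'the'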
def converse(raw_sentence):
    words_in_sent = raw_sentence.split()
    if len(words_in_sent) > 1:
        seed_bigrams = nltk.bigrams(words_in_sent)
    else:
        seed_bigrams = Markov_generate_unigram(words_in_sent)
    text_len = 20
    generated_lines = []
    for tuples in seed_bigrams:
        line = []
        line.append(tuples[0].title() + " ")
        line.append(tuples[1] + " ")
        for i in range(text_len):
            next_words = Markov_generate_bigrams(tuples)
            if not next_words:
                break
            # pick the most frequent continuation and slide the bigram window
            cfdist = nltk.FreqDist(next_words)
            next_word = cfdist.max()
            line.append(next_word + " ")
            tuples = (tuples[1], next_word)
        generated_lines.append(line)
    longest_line = ''
    for line in generated_lines:
        stri = ''.join(line)
        # truncate at the first sentence-ending punctuation, if any
        for truncate_char in ".?!":
            if truncate_char in stri:
                stri = stri[:stri.index(truncate_char)]
                break
        if len(stri) > len(longest_line):
            longest_line = stri.strip() + "."
    return longest_line
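The three Markov functions above share a module-level `data` token list. A minimal end-to-end sketch (the corpus here is invented, and real usage would load a much larger one):

import nltk

data = nltk.word_tokenize("you are very kind . you are very funny . you are great .")
print(converse("you are"))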