def create_lexicon(fin):
    lexicon = []
    with open(fin, 'r', buffering=100000, encoding='latin-1') as f:
        try:
            counter = 1
            content = ''
            for line in f:
                counter += 1
                # sample every 2500th line so the lexicon is built from a subset of the tweets
                if (counter / 2500.0).is_integer():
                    tweet = line.split(':::')[1]
                    content += ' ' + tweet
                    words = word_tokenize(content)
                    words = [lemmatizer.lemmatize(i) for i in words]
                    lexicon = list(set(lexicon + words))
                    print(counter, len(lexicon))
        except Exception as e:
            print(str(e))

    with open('lexicon.pickle', 'wb') as f:
        pickle.dump(lexicon, f)
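The snippet above relies on module-level names that are not shown here. A minimal setup sketch plus a call, hedged: the imports follow standard NLTK/pickle usage, while the input file name and its "label:::tweet" line format are assumptions inferred from the code.

import pickle
import numpy as np  # used by the companion functions below (convert_to_vec, sample_handling)
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()  # assumed module-level lemmatizer shared by these snippets

# hypothetical input file whose lines look like "<label>:::<tweet text>"
create_lexicon('train_set.csv')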
1.6 Mil.py (project: NLTK_SentimentAnalysis_TensorFlow, author: rachit-mishra)
def convert_to_vec(fin, fout, lexicon_pickle):
    with open(lexicon_pickle, 'rb') as f:
        lexicon = pickle.load(f)
    outfile = open(fout, 'a')
    with open(fin, buffering=20000, encoding='latin-1') as f:
        counter = 0
        for line in f:
            counter += 1
            label = line.split(':::')[0]
            tweet = line.split(':::')[1]
            current_words = word_tokenize(tweet.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            # bag-of-words vector over the lexicon
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
            features = list(features)
            outline = str(features) + '::' + str(label) + '\n'
            outfile.write(outline)
        print(counter)
    outfile.close()  # close the output handle once all lines are written
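A short usage sketch, assuming the setup shown after create_lexicon above; the file names are hypothetical, and 'lexicon.pickle' is the file written by create_lexicon.

convert_to_vec('test_set.csv', 'processed-test-set.csv', 'lexicon.pickle')

# each output line has the form "[0.0, 1.0, ...]::<label>"
with open('processed-test-set.csv') as f:
    print(f.readline().strip())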
TF_own_data_model.py (project: NLTK_SentimentAnalysis_TensorFlow, author: rachit-mishra)
def sample_handling(sample, lexicon, classification):
    featureset = []  # [1 0] pos sentiment, [0 1] negative sentiment
    with open(sample, 'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            #print(features)
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    # like the example discussed earlier
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])
    #print(featureset)
    return featureset
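A sketch of how sample_handling might be combined into a shuffled feature set, assuming the setup sketched after create_lexicon above; hm_lines is an assumed module-level cap on how many lines are read, and the positive/negative sample files are hypothetical.

import random
import pickle

hm_lines = 100000  # assumed module-level limit referenced by sample_handling

with open('lexicon.pickle', 'rb') as f:  # lexicon produced by create_lexicon above
    lexicon = pickle.load(f)

featureset = []
featureset += sample_handling('pos.txt', lexicon, [1, 0])  # hypothetical positive samples
featureset += sample_handling('neg.txt', lexicon, [0, 1])  # hypothetical negative samples
random.shuffle(featureset)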
def runprops_data(self, docs):
    new_docs = []
    for doc_name, doc in docs:
        print('Processing:', doc_name)
        doc_new = []
        doc = self.props_exception(doc_name, doc)
        for index, sent in enumerate(doc):
            doc_new.append(' '.join(word_tokenize(sent)))
            print(index + 1, doc_new[index])
        triples = []
        for i, sent in enumerate(doc_new):
            try:
                tmp_triples = self.props_parser.extract_triples([sent])
                triples.append(tmp_triples)
            except:
                print('Error: failed for line %s' % (sent))
                continue
        parse_sents = create_trees(triples, doc_new)
        sents = []
        new_docs.append((doc_name, parse_sents))
    return new_docs
question_answers1.py (project: NLP_question_answering_system_project, author: Roshrini)
def wordMatch(question, line, storyPOS_dict):
    wordsInAQuestion = word_tokenize(question)
    rootsInAQuestion = set()
    for word in wordsInAQuestion:
        root = lancaster_stemmer.stem(word)
        rootsInAQuestion.add(root)
    # scores are initialised up front so the return below is defined even when
    # the line is not present in storyPOS_dict
    verbmatch_score = 0
    rootmatch_score = 0
    scoreOfALine = {}
    if line in storyPOS_dict:
        for (word, tag) in storyPOS_dict[line]:
            if 'V' in tag:
                verb_root = lancaster_stemmer.stem(word)
                if verb_root in rootsInAQuestion:
                    verbmatch_score = verbmatch_score + 6
            else:
                word_root = lancaster_stemmer.stem(word)
                if word_root in rootsInAQuestion:
                    rootmatch_score = rootmatch_score + 3
        scoreOfALine[line] = rootmatch_score + verbmatch_score
    return rootmatch_score + verbmatch_score
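A minimal sketch of the inputs wordMatch appears to expect, inferred from how the names are used above: lancaster_stemmer is an assumed module-level LancasterStemmer, and storyPOS_dict maps a story line to its POS-tagged tokens. The sentence and question are made up.

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer

lancaster_stemmer = LancasterStemmer()  # assumed module-level stemmer

story_line = 'The dog chased the ball across the yard.'
storyPOS_dict = {story_line: nltk.pos_tag(word_tokenize(story_line))}

print(wordMatch('What did the dog chase?', story_line, storyPOS_dict))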
useful_functions.py (project: scientific-paper-summarisation, author: EdCo95)
def preprocess_sentence(sentence):
    """
    Preprocesses a sentence, turning it all to lowercase and tokenizing it into words.
    :param sentence: the sentence to pre-process.
    :return: the sentence, as a list of words, all in lowercase
    """
    sentence = sentence.lower()
    return word_tokenize(sentence)
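For example (assuming word_tokenize is imported at module level):

print(preprocess_sentence("Neural networks learn useful representations."))
# ['neural', 'networks', 'learn', 'useful', 'representations', '.']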
useful_functions.py (project: scientific-paper-summarisation, author: EdCo95)
def create_paper_dictionaries(filename="", readin=True, paper=None):
    """
    Creates the metadata data structures for a specific paper required to compute the extra features which are
    appended to the sentence vector.
    :param filename: the filename only, not the path, for the paper to create dictionaries for.
    :return: a tuple of the metadata data structures for the paper.
    """
    if readin and filename != "":
        # Read the paper in as a dictionary, keys are sections and values are the section text
        paper = read_in_paper(filename)

    # Extract paper keyphrases
    keyphrases = set(filter(None, " ".join(paper["KEYPHRASES"].lower().split("\n")).split(" ")))

    # Get the paper's vocab
    full_paper = " ".join([val for _, val in paper.iteritems()]).lower()
    paper_words = word_tokenize(full_paper)
    vocab = set(paper_words)

    # Create a bag of words for the paper
    paper_bag_of_words = defaultdict(int)
    for word in paper_words:
        paper_bag_of_words[word] += 1

    # Get the title words
    title_words = set([x.lower() for x in word_tokenize(paper["MAIN-TITLE"]) if x not in STOPWORDS])

    return keyphrases, vocab, paper_bag_of_words, title_words
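A sketch of the module-level names and paper structure this function appears to assume. The section keys other than KEYPHRASES and MAIN-TITLE are guesses, STOPWORDS is an assumed module-level stopword set, and note the snippet itself uses Python 2's dict.iteritems().

from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

STOPWORDS = set(stopwords.words('english'))  # assumed module-level stopword set

paper = {
    'MAIN-TITLE': 'A Study of Sentence Extraction',            # hypothetical title
    'ABSTRACT': 'We study extractive summarisation of papers.',  # hypothetical section
    'KEYPHRASES': 'summarisation\nsentence extraction',
}
# with the above in place (under Python 2): create_paper_dictionaries(readin=False, paper=paper)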
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
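A quick usage sketch, assuming nltk and word_tokenize are imported at module level and the punkt model is installed; note that each returned string keeps a leading space.

import nltk
from nltk.tokenize import word_tokenize

docs = ['First sentence. Second sentence here.', 'Another short document.']
print(preprocess(docs))
# [' First sentence . Second sentence here .', ' Another short document .']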
def get_latitude(self, user_input):
    """
    Returns the latitude extracted from the input.
    """
    from nltk import tokenize
    for token in tokenize.word_tokenize(user_input):
        if 'latitude=' in token:
            return re.sub('latitude=', '', token)
    return ''
def get_longitude(self, user_input):
    """
    Returns the longitude extracted from the input.
    """
    from nltk import tokenize
    for token in tokenize.word_tokenize(user_input):
        if 'longitude=' in token:
            return re.sub('longitude=', '', token)
    return ''
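Both extractors are methods, but self is unused, so they can be exercised directly. The latitude=/longitude= token convention in the input string follows the code above; re is assumed to be imported at module level.

import re

text = 'weather latitude=40.71 longitude=-74.00'   # hypothetical user input
print(get_latitude(None, text))   # '40.71'
print(get_longitude(None, text))  # '-74.00'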
def split_ingr(x):
    wnl = WordNetLemmatizer()
    cleanlist = []
    lst = x.strip('[]').split(',')
    cleanlist = [' '.join(wnl.lemmatize(word.lower())
                          for word in word_tokenize(re.sub('[^a-zA-Z]', ' ', item)))
                 for item in lst]
    return cleanlist
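A usage sketch assuming the module-level imports below; the ingredient string mimics a Python-list-like value as it might be stored in a CSV column.

import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

print(split_ingr("['2 cups chopped Tomatoes', '1 tsp Olive Oil']"))
# roughly: ['cup chopped tomato', 'tsp olive oil']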
#remove low-information words from ingredients, could use more
def add_items(self, sentence_li):
    """Add new items to the tok2emb dictionary from a given text."""
    for sen in sentence_li:
        sent_toks = sent_tokenize(sen)
        word_toks = [word_tokenize(el) for el in sent_toks]
        tokens = [val for sublist in word_toks for val in sublist]
        tokens = [el for el in tokens if el != '']
        for tok in tokens:
            if self.tok2emb.get(tok) is None:
                self.tok2emb[tok] = self.fasttext_model[tok]
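A stand-in sketch for exercising add_items outside its class: the SimpleNamespace and the fake fastText model are hypothetical stand-ins for whatever object the original code attaches tok2emb and fasttext_model to.

from types import SimpleNamespace
from nltk.tokenize import sent_tokenize, word_tokenize

class FakeFastText:
    def __getitem__(self, token):
        return [0.0] * 5  # stand-in for a real fastText word vector

state = SimpleNamespace(tok2emb={}, fasttext_model=FakeFastText())
add_items(state, ['Hello world. How are you?'])
print(sorted(state.tok2emb))
# ['.', '?', 'Hello', 'How', 'are', 'world', 'you']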
def sentiment(request):
    open_file = open("wordfeature5k.pickle", "rb")
    word_features = pickle.load(open_file)
    open_file.close()

    def find_features(document):
        words = word_tokenize(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)
        return features

    open_file = open("naivebayesclassifier.pickle", "rb")
    classifier = pickle.load(open_file)
    open_file.close()

    sentence = request.POST['sentence']
    result = classifier.classify(find_features(sentence))
    if result == "positive":
        return render(request, "home/index.html", {"sentence": sentence, "positive": "positive"})
    elif result == "negative":
        return render(request, "home/index.html", {"sentence": sentence, "negative": "negative"})
def word_seg_en(docs):
    docs = [word_tokenize(sent) for sent in tqdm(docs)]
    # show the progress of word segmentation with tqdm
    '''docs_seg = []
    print('docs size', len(docs))
    for i in tqdm(range(len(docs))):
        docs_seg.append(word_tokenize(docs[i]))'''
    return docs
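For example (tqdm only adds a progress bar around the iteration):

from tqdm import tqdm
from nltk.tokenize import word_tokenize

print(word_seg_en(['Tokenize this sentence.', 'And this one too!']))
# [['Tokenize', 'this', 'sentence', '.'], ['And', 'this', 'one', 'too', '!']]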
def get_word_dict(self, sentences, tokenize=True):
    # create vocab of words
    word_dict = {}
    if tokenize:
        from nltk.tokenize import word_tokenize
    sentences = [s.split() if not tokenize else word_tokenize(s)
                 for s in sentences]
    for sent in sentences:
        for word in sent:
            if word not in word_dict:
                word_dict[word] = ''
    word_dict['<s>'] = ''
    word_dict['</s>'] = ''
    return word_dict
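This method appears to come from an InferSent-style sentence encoder; it does not use self, so it can be sanity-checked directly.

vocab = get_word_dict(None, ['A small test sentence.', 'Another one.'])
print(sorted(vocab))
# ['.', '</s>', '<s>', 'A', 'Another', 'one', 'sentence', 'small', 'test']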
def visualize(self, sent, tokenize=True):
    if tokenize:
        from nltk.tokenize import word_tokenize

    sent = sent.split() if not tokenize else word_tokenize(sent)
    sent = [['<s>'] + [word for word in sent if word in self.word_vec] + ['</s>']]

    if ' '.join(sent[0]) == '<s> </s>':
        import warnings
        warnings.warn('No words in "{0}" have glove vectors. '
                      'Replacing by "<s> </s>"..'.format(sent))

    batch = Variable(self.get_batch(sent), volatile=True)
    if self.use_cuda:
        batch = batch.cuda()
    output = self.enc_lstm(batch)[0]
    output, idxs = torch.max(output, 0)
    # output, idxs = output.squeeze(), idxs.squeeze()
    idxs = idxs.data.cpu().numpy()
    argmaxs = [np.sum((idxs == k)) for k in range(len(sent[0]))]

    # visualize model
    import matplotlib.pyplot as plt
    x = range(len(sent[0]))
    y = [100.0 * n / np.sum(argmaxs) for n in argmaxs]
    plt.xticks(x, sent[0], rotation=45)
    plt.bar(x, y)
    plt.ylabel('%')
    plt.title('Visualisation of words importance')
    plt.show()

    return output, idxs
NewsArticleClass.py (project: Python-Scripts-Repo-on-Data-Science, author: qalhata)
def extractRawFrequencies(self, article):
    # similar to the method above, but returns the raw frequencies (all word counts)
    text = article[0]
    text = article[1]  # note: this overwrites the previous assignment, so only article[1] is counted
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    freq = defaultdict(int)
    for s in word_sent:
        for word in s:
            if word not in self._stopwords:
                freq[word] += 1
    return freq
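A stand-in sketch: the summariser object and its _stopwords set are hypothetical stand-ins for the class this method belongs to, and the article tuple is made up. As written, only article[1] contributes to the counts.

from types import SimpleNamespace
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

summariser = SimpleNamespace(_stopwords=set(stopwords.words('english')))
article = ('A Day at the Park', 'The cat sat on the mat. The cat slept.')
print(dict(extractRawFrequencies(summariser, article)))
# {'cat': 2, 'sat': 1, 'mat': 1, '.': 2, 'slept': 1}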
def split_words(self, sentence: str) -> List[Token]:
    # Import is here because it's slow, and by default unnecessary.
    from nltk.tokenize import word_tokenize
    return [Token(t) for t in word_tokenize(sentence.lower())]
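This snippet appears to come from an AllenNLP-style tokenizer, where Token and List are imported elsewhere in the module. A stand-in sketch using a namedtuple in place of the real Token class; in a real module these definitions would precede the function definition above, since the annotation references them.

from typing import List
from collections import namedtuple
from nltk.tokenize import word_tokenize

Token = namedtuple('Token', ['text'])  # hypothetical stand-in for the project's Token class

print(split_words(None, 'Hello, World!'))  # self is unused here
# [Token(text='hello'), Token(text=','), Token(text='world'), Token(text='!')]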