def profile(self, text):
''' Create FreqDist of trigrams within text '''
    from nltk import word_tokenize, FreqDist, trigrams  # trigrams is used below but was missing from the import
clean_text = self.remove_punctuation(text)
tokens = word_tokenize(clean_text)
fingerprint = FreqDist()
for t in tokens:
token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
for cur_trigram in token_trigrams:
if cur_trigram in fingerprint:
fingerprint[cur_trigram] += 1
else:
fingerprint[cur_trigram] = 1
return fingerprint
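The trigram counting above can be exercised on its own. A minimal standalone sketch of the same fingerprinting idea (needs the NLTK 'punkt' tokenizer data; '<' and '>' stand in for the class's _START_CHAR and _END_CHAR sentinels):
from nltk import word_tokenize, FreqDist, trigrams

def trigram_fingerprint(text, start_char='<', end_char='>'):
    # pad each token with sentinel characters, then count character trigrams
    fingerprint = FreqDist()
    for token in word_tokenize(text):
        for tri in trigrams(start_char + token + end_char):
            fingerprint[''.join(tri)] += 1
    return fingerprint

print(trigram_fingerprint('language identification with character trigrams').most_common(5))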
Example source code for Python word_tokenize()
def train(self, chain_len = None):
""" Trains the markov data structure by creating chains of desired length """
if not chain_len:
chain_len = self.CHAIN_LENGTH
self.CHAIN_LEN = chain_len
self.everything['corpus'] = {}
self.corpus = self.everything['corpus']
for f in self.everything['input']:
for line in sent_tokenize( self.everything['input'][f] ):
words = word_tokenize(line)
for chain in self._make_chains(words):
k = " ".join( chain[:-1] ) # key is everything but last word
v = chain[-1] # value is last word
try:
self.corpus[k].append(v)
                except KeyError:
                    self.corpus[k] = [v]
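The _make_chains helper used above is not shown on this page. A purely hypothetical sketch of what it might do, yielding windows of CHAIN_LEN key words plus the word that follows, so that train() can split off the last word as the value:
def _make_chains(self, words):
    # Hypothetical: slide a (CHAIN_LEN + 1)-word window over the sentence;
    # train() treats all but the last word as the key and the last word as the value.
    for i in range(len(words) - self.CHAIN_LEN):
        yield words[i:i + self.CHAIN_LEN + 1]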
def parse_gender(text):
sentences = [
[word.lower() for word in nltk.word_tokenize(sentence)]
for sentence in nltk.sent_tokenize(text)
]
sents, words = count_gender(sentences)
total = sum(words.values())
for gender, count in words.items():
pcent = (count / total) * 100
nsents = sents[gender]
print(
"{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
)
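parse_gender() relies on a count_gender() helper that is not included here. A hypothetical sketch of what it might look like; the MALE_WORDS and FEMALE_WORDS lexicons below are stand-ins, not the original lists:
from collections import Counter

MALE_WORDS = {'he', 'him', 'his', 'himself'}      # assumed toy lexicon
FEMALE_WORDS = {'she', 'her', 'hers', 'herself'}  # assumed toy lexicon

def genderize(words):
    # classify a tokenized sentence by which lexicon(s) it touches
    male = len(MALE_WORDS.intersection(words))
    female = len(FEMALE_WORDS.intersection(words))
    if male and not female:
        return 'male'
    if female and not male:
        return 'female'
    if male and female:
        return 'both'
    return 'unknown'

def count_gender(sentences):
    # tally sentence and word counts per gender label
    sents, words = Counter(), Counter()
    for sentence in sentences:
        gender = genderize(sentence)
        sents[gender] += 1
        words[gender] += len(sentence)
    return sents, words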
def _convert_obj(self, obj):
pre_sentence = obj['sentence1']
hyp_sentence = obj['sentence2']
if self.lower:
pre_sentence = pre_sentence.lower()
hyp_sentence = hyp_sentence.lower()
pre_words = word_tokenize(pre_sentence)
hyp_words = word_tokenize(hyp_sentence)
pre = [self.word_vocab.word_to_id(w) for w in pre_words]
hyp = [self.word_vocab.word_to_id(w) for w in hyp_words]
pre_length = len(pre)
hyp_length = len(hyp)
label = obj['gold_label']
if len(pre) > self._max_length or len(hyp) > self._max_length:
return None
if label == '-':
return None
label = self.label_vocab.word_to_id(label)
return pre, hyp, pre_length, hyp_length, label
def tokenize_me(file_text):
#firstly let's apply nltk tokenization
tokens = nltk.word_tokenize(file_text)
#let's delete punctuation symbols
tokens = [i for i in tokens if i not in string.punctuation]
#deleting stop_words
tokens = [i for i in tokens if i not in stop_words]
#cleaning words
tokens = [i.replace("«", "").replace("»", "") for i in tokens]
tokens = [stemmer.stem(i) for i in tokens]
return set(tokens)
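tokenize_me() depends on module-level string, nltk, stop_words and stemmer objects that are not shown. One possible setup, assuming a Russian corpus (suggested by the « » quote stripping) and NLTK's Snowball stemmer:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# assumed globals for tokenize_me(); the language choice is a guess
stop_words = set(stopwords.words('russian'))
stemmer = SnowballStemmer('russian')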
def tokenize_sentences(self):
# tokenize the sentences into words and count the word frequencies
# get most common words, build index_to_word and word_to_index vectors
self.tokenized_sentences = [nltk.word_tokenize(sent) for sent in
self.sentences]
word_freq = nltk.FreqDist(itertools.chain(*self.tokenized_sentences))
print("Found %d unique word tokens." % len(word_freq.items()))
vocab = word_freq.most_common(self.vocabulary_size - 1)
self.index_to_word = [x[0] for x in vocab]
self.index_to_word.append(self.unknown_token)
self.word_to_index = dict(
[(w, i) for i, w in enumerate(self.index_to_word)])
print("Using vocabulary size %d." % self.vocabulary_size)
print(
"The least frequent word is '%s' appearing %d times." % (
vocab[-1][0], vocab[-1][1]))
# replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(self.tokenized_sentences):
self.tokenized_sentences[i] = [
w if w in self.word_to_index else self.unknown_token for w in
sent]
def sent2vec(s):
words = str(s).lower()
words = word_tokenize(words)
words = [w for w in words if not w in stop_words]
words = [w for w in words if w.isalpha()]
M = []
for w in words:
        try:
            M.append(model[w])
        except KeyError:
            # skip words the embedding model does not contain
            continue
M = np.array(M)
v = M.sum(axis=0)
return v / np.sqrt((v ** 2).sum())
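sent2vec() likewise assumes module-level model and stop_words globals. A hedged setup sketch, assuming a gensim word2vec-style model (the vector file name is only a placeholder):
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import KeyedVectors

stop_words = set(stopwords.words('english'))
# any word-to-vector mapping works; a pretrained word2vec binary is one common choice
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)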
###############################################################################
# Train
def get_review_sentences():
'''
    Read the Yelp reviews and return them after sentence segmentation
:return:
'''
review_file = io.open(FULL_YELP_REVIEW_PATH, 'r', encoding='utf-8')
count_sentence = 0
sentences = []
for line in review_file:
json_review = json.loads(line.strip())
text = json_review.get("text").replace('\n','').lower()
raw_sentences = sent_tokenize(text)
for raw_sentence in raw_sentences:
if len(raw_sentence.strip()) > 0:
sent_tokens = word_tokenize(raw_sentence)
sentences.append(sent_tokens)
return sentences
def createTrainingList(reviewLst):
sds = SupervisedDataSet(100,1)
for review in reviewLst:
revString = unicode(review[1], errors='ignore')
revSentences = nltk.word_tokenize(revString.strip())
revWords = []
for i in revSentences:
revWords += i.lower().split()
vec = 0
for i in revWords:
            try:
                vec += model[i] / 2
            except KeyError:
                pass
vec=vec/len(revWords)
sds.addSample(vec,review[0])
net = buildNetwork(100, 20, 1, hiddenclass=TanhLayer, outclass=SoftmaxLayer,bias=True)
trainer = BackpropTrainer(net, sds)
print "Error score:",trainer.train()
print trainer.trainUntilConvergence(verbose=True,maxEpochs=100)
def token_func(input_string):
tokens = nltk.word_tokenize(input_string)
long_tokens = []
refined_tokens = []
# lemmatized_tokens = []
stopwordlist = get_stopwordlist("../data/first_stopwordlist.txt")
regex = re.compile('[^1-9a-zA-Z]')
for token in tokens:
token = regex.sub('', token)
if len(token) > 3:
long_tokens.append(token)
lemmatized_tokens = dhh_preprocess_tools.hfst_words(long_tokens,
filter=('VERB',
'NOUN',
'ADJ',
'PROPN'))
for token in lemmatized_tokens:
token = token.lower()
if token not in stopwordlist:
refined_tokens.append(token)
return refined_tokens
AKE.py (project: NLP-Keyword-Extraction-Ensemble-Method, author: Ashwin-Ravi)
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
import itertools, nltk, string
# exclude candidates that are stop words or entirely punctuation
punct = set(string.punctuation)
stop_words = set(nltk.corpus.stopwords.words('english'))
# tokenize, POS-tag, and chunk using regular expressions
chunker = nltk.chunk.regexp.RegexpParser(grammar)
tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
for tagged_sent in tagged_sents))
# join constituent chunk words into a single chunked phrase
    # tuple-unpacking lambdas are Python 2 only, so index into the (word, pos, chunk) triple
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda wpc: wpc[2] != 'O') if key]
return [cand for cand in candidates
if cand not in stop_words and not all(char in punct for char in cand)]
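A quick usage sketch for extract_candidate_chunks (assumes the NLTK punkt, averaged_perceptron_tagger and stopwords data packages are installed; the exact phrases returned depend on the tagger):
sample = ("Keyword extraction picks out noun phrases that summarise a document. "
          "Regular-expression chunking over part-of-speech tags is one simple approach.")
print(extract_candidate_chunks(sample))
# e.g. ['keyword extraction', 'noun phrases', 'document', ...]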
def txt_to_sent(sentences, word_vec, tokenize=True):
    sentences = [['<s>'] + (nltk.word_tokenize(s) if tokenize else s.split()) + ['</s>']
                 for s in sentences]
n_w = np.sum([len(x) for x in sentences])
# filters words without glove vectors
for i in range(len(sentences)):
s_f = [word for word in sentences[i] if word in word_vec]
if not s_f:
import warnings
warnings.warn('No words in "{0}" (idx={1}) have glove vectors. Replacing by "</s>"..'.format(sentences[i], i))
s_f = ['</s>']
sentences[i] = s_f
lengths = np.array([len(s) for s in sentences])
n_wk = np.sum(lengths)
print('Nb words kept : {0}/{1} ({2} %)'.format(n_wk, n_w, round((100.0 * n_wk) / n_w, 2)))
return sentences
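A toy usage sketch for txt_to_sent; in practice word_vec maps tokens to pretrained GloVe vectors, so the random vectors below are only stand-ins:
import numpy as np
import nltk

toy_vocab = ['<s>', '</s>', 'the', 'cat', 'sat', 'on', 'mat', '.']
word_vec = {w: np.random.rand(300) for w in toy_vocab}  # stand-in for GloVe vectors
print(txt_to_sent(['The cat sat on the mat.'], word_vec))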
def __init__(self, text):
self.text = text
self.tokens = nltk.word_tokenize(text)
self.lowercase_tokens = [t.lower() for t in self.tokens]
self.alpha_tokens = [t for t in self.lowercase_tokens if t.isalpha()]
sent-thoughts-parse.py (project: Deep-Learning-with-Keras, author: PacktPublishing)
def maybe_build_vocab(reuters_dir, vocab_file):
vocab = collections.defaultdict(int)
if os.path.exists(vocab_file):
        fvoc = open(vocab_file, "r")  # text mode: lines are split with a str "\t" below
for line in fvoc:
word, idx = line.strip().split("\t")
vocab[word] = int(idx)
fvoc.close()
else:
counter = collections.Counter()
num_docs_read = 0
for doc in stream_reuters_documents(reuters_dir):
if num_docs_read % 100 == 0:
print("building vocab from {:d} docs"
.format(num_docs_read))
topics = doc["topics"]
if len(topics) == 0:
continue
title = doc["title"]
body = doc["body"]
title_body = ". ".join([title, body]).lower()
for sent in nltk.sent_tokenize(title_body):
for word in nltk.word_tokenize(sent):
counter[word] += 1
for i, c in enumerate(counter.most_common(VOCAB_SIZE)):
vocab[c[0]] = i + 1
num_docs_read += 1
print("vocab built from {:d} docs, complete"
.format(num_docs_read))
        fvoc = open(vocab_file, "w")  # text mode to match the str formatting below
for k in vocab.keys():
fvoc.write("{:s}\t{:d}\n".format(k, vocab[k]))
fvoc.close()
return vocab
sent-thoughts-parse.py (project: Deep-Learning-with-Keras, author: PacktPublishing)
def build_numeric_text(vocab, text):
wids = []
for sent in nltk.sent_tokenize(text):
for word in nltk.word_tokenize(sent):
wids.append(vocab[word])
return ",".join([str(x) for x in wids])
##################### main ######################
def get_maxlens(train_data, test_data):
story_maxlen, question_maxlen = 0, 0
for stories, questions, _ in [train_data, test_data]:
for story in stories:
story_len = 0
for sent in story:
swords = nltk.word_tokenize(sent)
story_len += len(swords)
if story_len > story_maxlen:
story_maxlen = story_len
for question in questions:
question_len = len(nltk.word_tokenize(question))
if question_len > question_maxlen:
question_maxlen = question_len
return story_maxlen, question_maxlen
def vectorize(data, word2idx, story_maxlen, question_maxlen):
Xs, Xq, Y = [], [], []
stories, questions, answers = data
for story, question, answer in zip(stories, questions, answers):
xs = [[word2idx[w.lower()] for w in nltk.word_tokenize(s)]
for s in story]
xs = list(itertools.chain.from_iterable(xs))
xq = [word2idx[w.lower()] for w in nltk.word_tokenize(question)]
Xs.append(xs)
Xq.append(xq)
Y.append(word2idx[answer.lower()])
return pad_sequences(Xs, maxlen=story_maxlen),\
pad_sequences(Xq, maxlen=question_maxlen),\
np_utils.to_categorical(Y, num_classes=len(word2idx))
QnARecurAtteLatest2GRU.py (project: recurrent-attention-for-QA-SQUAD-based-on-keras, author: wentaozhu)
def tokenize(sent):
'''Return the tokens of a sentence including punctuation.
>>> tokenize('Bob dropped the apple. Where is the apple?')
['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
'''
return [token.replace("``", '"').replace("''", '"') for token in nltk.word_tokenize(sent)]