def tokenize_and_stem(text):
    """
    First tokenize by sentence, then by word, so that punctuation is caught as its own token.
    Relies on module-level `nltk`, `re`, and `stemmer` (e.g. an NLTK SnowballStemmer).
    """
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # keep only tokens containing letters (drops numeric tokens and raw punctuation),
    # and blank out a few domain-specific words so they are skipped before stemming
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            if token in ('intern', 'student', 'and'):
                token = ''
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens if len(t) > 0]
    return stems
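A minimal usage sketch for the function above, assuming NLTK's punkt data is available and that the module-level `stemmer` it expects is a SnowballStemmer (both are assumptions, not shown in the original source):

import re
import nltk
from nltk.stem.snowball import SnowballStemmer

# nltk.download('punkt')  # tokenizer models, needed once
stemmer = SnowballStemmer('english')   # assumed module-level dependency

print(tokenize_and_stem("An intern and a student ran quickly."))
# roughly ['an', 'a', 'ran', 'quick'] -- 'intern', 'student' and 'and' are dropped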
Python word_tokenize() examples

Source file: readdata.py, from project Natural-Language-Processing-Python-and-NLTK (author: PacktPublishing)
def preprocessing(text):
    # accept bytes input (the original assumed a Python 2 str) as well as str
    if isinstance(text, bytes):
        text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # remove stopwords (note: this runs before lowercasing, so capitalized stopwords survive)
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # remove words shorter than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # lowercase
    tokens = [word.lower() for word in tokens]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
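A quick invocation sketch, assuming the imports the snippet relies on (nltk, stopwords, WordNetLemmatizer) and that the punkt, stopwords, and wordnet corpora are installed:

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')
print(preprocessing("The cats were chasing the mice all afternoon."))
# roughly 'the cat chasing mouse afternoon'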
def ne_tagging(text):
    """Extract contiguous named-entity strings from text."""
    # requires: from nltk import ne_chunk, pos_tag, word_tokenize
    #           from nltk.tree import Tree
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
    # flush a trailing entity (the original version dropped it when the text ended with one)
    if current_chunk:
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)
    return continuous_chunk
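A usage sketch, assuming the NLTK imports noted above and that the punkt, averaged_perceptron_tagger, maxent_ne_chunker, and words resources have been downloaded:

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

print(ne_tagging("Barack Obama visited New York with Angela Merkel."))
# roughly ['Barack Obama', 'New York', 'Angela Merkel']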
def get_sentence_tokens(text):
    '''
    Given a text (review), return the token list of each sentence.
    Assumes module-level `sent_tokenize`, `word_tokenize`, and a `stopwords` collection.
    :param text:
    :return:
    '''
    sentences = sent_tokenize(text)
    sent_tokens = []
    for sentence in sentences:
        sent_token = word_tokenize(sentence)
        # remove stop words and empty tokens
        sent_token = [token for token in sent_token if token.strip() != '' and token not in stopwords]
        sent_tokens.append(sent_token)
    # stemming was also tried, but experiments showed it did not help:
    # if stemming:
    #     stemmer = PorterStemmer()
    #     texts = [[stemmer.stem(token) for token in text] for text in texts]
    return sent_tokens
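A short invocation sketch, assuming NLTK provides the tokenizers and that `stopwords` is a module-level set (an assumption; the original's definition is not shown):

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords as nltk_stopwords

stopwords = set(nltk_stopwords.words('english'))   # assumed module-level name

print(get_sentence_tokens("The battery lasts all day. I love it!"))
# roughly [['The', 'battery', 'lasts', 'day', '.'], ['I', 'love', '!']]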
def createbigramvocabulary(reviewfile, vocabfile):
    # assumes each review line starts with a +/- label character;
    # `createvocabulary`, `word_tokenize`, `bigrams` and `Counter` come from the surrounding module
    createvocabulary(reviewfile, vocabfile)
    finput = open(reviewfile, "r")
    foutput = open(vocabfile, "a")
    all_bigrams = []
    for line in finput:
        tokenized_line = ['*']                      # sentence-start marker
        tokenized_line.extend(word_tokenize(line[1:]))
        tokenized_line.append('$')                  # sentence-end marker
        all_bigrams.extend(bigrams(tokenized_line))
    c = Counter(all_bigrams)
    for b in c:
        # keep bigrams seen at least 3 times whose first token is not a label character
        if b[0] != "+" and b[0] != "-" and c[b] >= 3:
            foutput.write(b[0] + " " + b[1] + "\n")
    finput.close()
    foutput.close()
def word_count(message, word):
    """
    Computes the number of times a word appears in a message (case-insensitive).

    Args:
        message: A Message object.
        word: A string with no spaces.

    Returns:
        An int representing the number of times word (case-insensitive)
        appears among the tokens of message.text (tokenized with nltk.word_tokenize).
    """
    if ' ' in word:
        raise ValueError('word cannot contain spaces')
    lowercase_tokens = [token.lower() for token in nltk.word_tokenize(message.text)]
    return lowercase_tokens.count(word.lower())
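A small usage sketch; the Message type here is a hypothetical stand-in (any object with a .text attribute works):

import collections
import nltk

Message = collections.namedtuple('Message', ['text'])   # hypothetical stand-in

msg = Message(text="Ping? Ping! ping...")
print(word_count(msg, "ping"))   # 3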
def build_vocab(train_data, test_data):
    counter = collections.Counter()
    for stories, questions, answers in [train_data, test_data]:
        for story in stories:
            for sent in story:
                for word in nltk.word_tokenize(sent):
                    counter[word.lower()] += 1
        for question in questions:
            for word in nltk.word_tokenize(question):
                counter[word.lower()] += 1
        for answer in answers:
            for word in nltk.word_tokenize(answer):
                counter[word.lower()] += 1
    # no OOV handling here because the dataset vocabulary is small
    word2idx = {w: (i + 1) for i, (w, _) in enumerate(counter.most_common())}
    word2idx["PAD"] = 0
    idx2word = {v: k for k, v in word2idx.items()}
    return word2idx, idx2word
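A toy invocation, assuming the bAbI-style (stories, questions, answers) tuples the function expects, where each story is a list of sentence strings:

import collections
import nltk

train = ([["John went to the hallway .", "Mary moved to the bathroom ."]],
         ["Where is Mary ?"],
         ["bathroom"])
test = ([["Sandra got the apple ."]],
        ["Where is the apple ?"],
        ["Sandra"])

word2idx, idx2word = build_vocab(train, test)
print(word2idx["PAD"])               # 0
print(idx2word[word2idx["mary"]])    # 'mary' -- everything is lowercased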
Source file: QnARecurAtteLatest2GRU.py, from project recurrent-attention-for-QA-SQUAD-based-on-keras (author: wentaozhu)
def tokenizeVal(sent):
    '''Tokenize a sentence (keeping punctuation) and map each token index to its
    character offset in the original string.
    >>> tokenizeVal('Bob dropped the apple. Where is the apple?')[0]
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    tokenizedSent = [token.replace("``", '"').replace("''", '"') for token in nltk.word_tokenize(sent)]
    tokenIdx2CharIdx = [None] * len(tokenizedSent)
    idx = 0
    token_idx = 0
    # walk the original string, aligning each token to the character position where it starts
    while idx < len(sent) and token_idx < len(tokenizedSent):
        word = tokenizedSent[token_idx]
        if sent[idx:idx + len(word)] == word:
            tokenIdx2CharIdx[token_idx] = idx
            idx += len(word)
            token_idx += 1
        else:
            idx += 1
    return tokenizedSent, tokenIdx2CharIdx
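A brief usage sketch for the helper above (punkt data assumed installed):

import nltk

sent = 'Bob dropped the apple. Where is the apple?'
tokens, tok2char = tokenizeVal(sent)
for tok, start in zip(tokens, tok2char):
    print(start, tok)    # each token with the character offset where it starts in `sent`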
The same tokenizeVal helper appears verbatim in QnARecurAtteLatest3Atten.py, QnARecurAtteLatest2Attenenhance.py, QnARecurAtteLatest2GRU1SATTE.py, QnARecurAtteLatest3Attenenhance.py, and QnARecurAtteLatest.py from the same project.
def tokenize_text(sample_text):
    # relies on module-level names not shown here: `sequence_lengths`, `cfg`,
    # `t_table` (a str.translate table), `tknzr` (e.g. an NLTK casual tokenizer), and `stopwords`
    global sequence_lengths
    processed_text = []
    if cfg.remove_punctuation:
        cleaned = sample_text.lower().translate(t_table)
    else:
        cleaned = sample_text
    if cfg.use_casual_tokenizer:
        tokens = tknzr.tokenize(cleaned)
    else:
        tokens = nltk.word_tokenize(cleaned, language='english')
    if cfg.remove_stopwords:
        tokens = [w for w in tokens if w not in stopwords.words('english')]
    sequence_lengths.append(len(tokens))
    processed_text.extend(tokens)
    return processed_text
def process_imdb(fname, setting):
    labels, sentences = [], []
    filename = setting + ".csv"
    quota = [0, 0]
    # cap the number of examples kept per class
    maxquota = 5000 if setting == 'test' else 15000
    # the original opened the file in binary mode and decoded each field (Python 2 style);
    # opening in text mode with an explicit encoding does the same under Python 3
    with open(os.path.join(fname, filename), 'r', encoding='utf-8', newline='') as f:
        csvreader = csv.reader(f)
        for line in csvreader:
            label = 0 if line[0] == "1" else 1
            quota[label] += 1
            if quota[label] > maxquota:
                continue
            sentence = line[2].replace("\"", "")
            text = nltk.word_tokenize(sentence)
            labels.append(int(label))
            sentences.append(text)
    return sentences, labels
def tokenize(self, sentence):
    """
    Given a string, tokenize it into words (with the conventional notion
    of word).

    Parameters
    ----------
    sentence: str
        The string to tokenize.

    Returns
    -------
    tokenized_sentence: List[str]
        The tokenized representation of the string, as a list of tokens.
    """
    return nltk.word_tokenize(sentence.lower())
def add(self, filename, document):
    """
    Add a document string to the index.
    """
    # You can uncomment the following line to see the words found in each image.
    # print("Words found in %s: %s" % (filename, document))
    for token in [t.lower() for t in nltk.word_tokenize(document)]:
        if token in self.stopwords:
            continue
        if token in ['.', ',', ':', '']:
            continue
        if self.stemmer:
            token = self.stemmer.stem(token)
        # Add the filename to the set associated with the token.
        self.redis_token_client.sadd(token, filename)
    # Store the 'document text' for the filename.
    self.redis_docs_client.set(filename, document)
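A minimal sketch of how this index might be wired up and queried, assuming the redis-py client and NLTK are installed; the class name and the `search` method are hypothetical additions, not part of the original project:

import nltk
import redis
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

class ImageTextIndex:                                   # hypothetical wrapper class
    def __init__(self, host='localhost'):
        self.stopwords = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.redis_token_client = redis.Redis(host=host, db=0)
        self.redis_docs_client = redis.Redis(host=host, db=1)

    def search(self, query):
        """Return filenames whose text contains every (stemmed) query token."""
        tokens = [self.stemmer.stem(t.lower()) for t in nltk.word_tokenize(query)]
        return self.redis_token_client.sinter(tokens) if tokens else set()

ImageTextIndex.add = add        # attach the add() method shown above

# requires a running Redis server
index = ImageTextIndex()
index.add("cat.jpg", "A sleepy cat on a warm windowsill")
print(index.search("sleepy cats"))   # likely {b'cat.jpg'}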
Source file: feature_construction.py, from project Automatic-Question-Generation (author: bwanglzu)
def _identify_pronoun(self, answer):
    """Calculate the percentage of pronouns within the answer.
    - Args:
        answer(str): answer text
    - Returns:
        percentage(float): ratio of pronouns in the answer
    """
    text = nltk.word_tokenize(answer)
    post = nltk.pos_tag(text)
    pronoun_list = ['PRP', 'PRP$', 'WP', 'WP$']
    num_pronouns = 0
    num_terms = len(post)
    percentage = 0
    for k, v in post:
        if v in pronoun_list:
            num_pronouns += 1
    # compute the ratio once, guarding against an empty answer
    if num_terms:
        percentage = float(num_pronouns) / num_terms
    return percentage
def _identify_pronoun2(self, sentence):
    """Collect the pronouns appearing in the question sentence, along with its length.
    - Args:
        sentence(str): question sentence
    - Returns:
        pronoun_in_sentence(list): pronouns in the sentence
        sentence_len(int): length of the current sentence
    """
    text = nltk.word_tokenize(sentence)
    post = nltk.pos_tag(text)
    pronoun_list = ['PRP', 'PRP$', 'WP', 'WP$']
    pronoun_in_sentence = []
    sentence_len = len(post)
    for k, v in post:
        if v in pronoun_list:
            pronoun_in_sentence.append(k)
    return pronoun_in_sentence, sentence_len
def _first_tagger_after_answer_span(self, question):
    """Get the POS tag of the first term after the answer span (the '_____' placeholder).
    - Args:
        question(string): string of the current question
    - Returns:
        tagger(string): POS tag of the first term after the span
    """
    index = 0
    text = nltk.word_tokenize(question)
    post = nltk.pos_tag(text)
    for idx, t in enumerate(post):
        if t[0] == '_____':
            index = idx + 1
            break
    try:
        return post[index][1]
    except IndexError:
        return 'dummy'
def _first_tagger_before_answer_span(self, question):
    """Get the POS tag of the first term before the answer span (the '_____' placeholder).
    - Args:
        question(string): string of the current question
    - Returns:
        tagger(string): POS tag of the first term before the span
    """
    index = 0
    text = nltk.word_tokenize(question)
    post = nltk.pos_tag(text)
    for idx, t in enumerate(post):
        if t[0] == "_____":
            index = idx - 1
            break
    # if the placeholder is the first token there is no preceding term
    # (the original wrapped around to the last tag via post[-1])
    if index < 0:
        return 'dummy'
    try:
        return post[index][1]
    except IndexError:
        return 'dummy'
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("env_data", type=str, help="Generated environment data filename in JSON format")
    args = parser.parse_args()

    print("-- Initialized environment")
    env = SquadEnv(args.env_data)

    context, question = env.reset()
    done = False
    while not done:
        print("Context ids: {}".format(context))
        print("Question ids: {}".format(question))
        print("Context tokens: {}".format(ids2tokens(context, env.id2token)))
        print("Question tokens: {}".format(ids2tokens(question, env.id2token)))

        answer_tokens = tokens2ids(word_tokenize(input("Answer: ")) + ["#eos#"], env.token2id)
        question_reward = 0
        for token in answer_tokens:
            (context, question), reward, done, _ = env.step(token)
            question_reward += reward
        print("You got {} reward".format(question_reward))
def LemNormalize(text):
    # replace non-ASCII characters
    text = text.encode('ascii', 'replace').decode()
    # map punctuation and digits to spaces
    remove_punct_and_digits = dict([(ord(punct), ' ') for punct in string.punctuation + string.digits])
    transformed = text.lower().translate(remove_punct_and_digits)
    # shortword = re.compile(r'\W*\b\w{1,2}\b')
    # transformed = shortword.sub('', transformed)
    # tokenize the transformed string
    tokenized = nltk.word_tokenize(transformed)
    # remove short words (3 characters or fewer)
    tokenized = [w for w in tokenized if len(w) > 3]
    # lemmatize via the LemTokens helper (defined elsewhere in the module)
    lemmas = LemTokens(tokenized)
    return lemmas
def LemNormalizeIt(text):
    # replace non-ASCII characters
    text = text.encode('ascii', 'replace').decode()
    # map punctuation and digits to spaces
    remove_punct_and_digits = dict([(ord(punct), ' ') for punct in string.punctuation + string.digits])
    transformed = text.lower().translate(remove_punct_and_digits)
    # tokenize the transformed string
    tokenized = nltk.word_tokenize(transformed)
    # lemmatize with the morph-it lexicon, keeping only words longer than 3 characters
    morph_it = load_morph_it()
    tokenized = [morph_it.get(w, w) for w in tokenized if len(w) > 3]
    return tokenized
def tag(self, lines):
    '''
    Tokenize and POS-tag the words in the given text.
    :param lines: the text to tag
    :type lines: ``str``
    :return: a list of (word, tag) tuples as produced by nltk.pos_tag
    '''
    try:
        tokenized_words = nltk.word_tokenize(lines)
        return nltk.pos_tag(tokenized_words)
    except LookupError as le:
        print("Run install_words.py first")
        raise le
def _generate_candidate_keywords(self, sentences, max_length=3):
    """Create a list of candidate keyword phrases (at most max_length words each) from a set of sentences.

    Stopwords act as phrase boundaries ('|'), as does punctuation (via the is_punctuation helper)."""
    phrase_list = []
    for sentence in sentences:
        words = map(lambda x: "|" if x in self.stopwords else x,
                    nltk.word_tokenize(sentence.lower()))
        phrase = []
        for word in words:
            if word == "|" or is_punctuation(word):
                if 0 < len(phrase) <= max_length:
                    phrase_list.append(phrase)
                phrase = []
            else:
                phrase.append(word)
    return phrase_list
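A small self-contained sketch of the same RAKE-style candidate extraction, with hypothetical stand-ins for the is_punctuation helper and the instance's stopword list (neither is shown in the original source):

import string
import nltk
from nltk.corpus import stopwords

def is_punctuation(word):                       # hypothetical helper
    return all(ch in string.punctuation for ch in word)

class _Demo:                                    # hypothetical holder for the stopword set
    stopwords = set(stopwords.words('english'))

sentences = nltk.sent_tokenize("Compatibility of systems of linear constraints "
                               "over the set of natural numbers is studied.")
phrases = _generate_candidate_keywords(_Demo(), sentences, max_length=3)
print(phrases)
# roughly [['compatibility'], ['systems'], ['linear', 'constraints'], ['set'], ['natural', 'numbers'], ['studied']]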
def get_tokenizer(name, lowercase):
    if name == 'char':
        if lowercase:
            return lambda s: list(s.strip().lower())
        else:
            return lambda s: list(s.strip())
    elif (name == 'space') or (name == 'bpe'):
        if lowercase:
            return lambda s: s.lower().split()
        else:
            return str.split
    elif name == 'word':
        if lowercase:
            return lambda s: word_tokenize(s.lower())
        else:
            return word_tokenize
    else:
        raise ValueError('Unknown tokenizer: "%s"' % name)
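A quick usage sketch of the factory above (the 'word' branch assumes word_tokenize has been imported from nltk):

from nltk import word_tokenize

char_tok = get_tokenizer('char', lowercase=False)
print(char_tok("Hi!"))                    # ['H', 'i', '!']
word_tok = get_tokenizer('word', lowercase=True)
print(word_tok("Hello, World!"))          # ['hello', ',', 'world', '!']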
def _set_tokenizer(self, tokenizer):
    """
    Set the tokenization method.
    :param tokenizer: tokenization method ("nltk" or "spacy")
    :return: None
    """
    if tokenizer == "nltk":
        self.tokenizer = nltk.word_tokenize
    elif tokenizer == "spacy":
        # note: recent spaCy versions use the full model name, e.g. spacy.load("en_core_web_sm")
        spacy_en = spacy.load("en")

        def spacy_tokenizer(seq):
            return [w.text for w in spacy_en(seq)]

        self.tokenizer = spacy_tokenizer
    else:
        raise ValueError("Invalid tokenizing method %s" % tokenizer)
def map_coocurence(context_size, data):
    # assumes the langdetect package (e.g. from langdetect import detect;
    # from langdetect.lang_detect_exception import LangDetectException);
    # `_context_windows` and `isWord` are helpers defined elsewhere in the module
    coocurrence_list = []
    try:
        if detect(data) == 'en':
            region = nltk.word_tokenize(data)
            for l_context, word, r_context in _context_windows(region, context_size, context_size):
                if isWord(word):
                    # weight each co-occurring context word by the inverse of its distance
                    for i, context_word in enumerate(l_context[::-1]):
                        if isWord(context_word):
                            coocurrence_list.append(((word, context_word), 1 / (i + 1)))
                    for i, context_word in enumerate(r_context):
                        if isWord(context_word):
                            coocurrence_list.append(((word, context_word), 1 / (i + 1)))
    except LangDetectException:
        return coocurrence_list
    return coocurrence_list
def from_sentence(sent):
    # assumes: import nltk; from collections import defaultdict;
    #          from nltk.parse.dependencygraph import DependencyGraph
    tokens = nltk.word_tokenize(sent)
    tagged = nltk.pos_tag(tokens)
    dg = DependencyGraph()
    for (index, (word, tag)) in enumerate(tagged):
        dg.nodes[index + 1] = {
            'word': word,
            'lemma': '_',
            'ctag': tag,
            'tag': tag,
            'feats': '_',
            'rel': '_',
            'deps': defaultdict(),
            'head': '_',
            'address': index + 1,
        }
    dg.connect_graph()
    return dg
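A brief usage sketch, assuming the imports noted above and NLTK's punkt and averaged_perceptron_tagger data:

import nltk
from collections import defaultdict
from nltk.parse.dependencygraph import DependencyGraph

dg = from_sentence("The cat sat on the mat")
print(sorted(dg.nodes))                           # node addresses: 0 (root) plus one per token
print(dg.nodes[2]['word'], dg.nodes[2]['tag'])    # e.g. 'cat' 'NN'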