def tokenize_and_stem(text):
    """
    First tokenize by sentence, then by word, so that punctuation is caught as its own token.
    """
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            # blank out a few domain-specific words so they are dropped before stemming
            if token == 'intern':
                token = ''
            if token == 'student':
                token = ''
            if token == 'and':
                token = ''
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens if len(t) > 0]
    return stems
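A minimal usage sketch (not part of the original snippet): it assumes stemmer is a module-level NLTK SnowballStemmer and that the punkt tokenizer data is installed.

import re
import nltk
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")  # assumed; the snippet itself does not define stemmer

print(tokenize_and_stem("The interns and students were running tests."))
# roughly ['the', 'intern', 'student', 'were', 'run', 'test']; 'and' is blanked out above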
Python sent_tokenize() usage examples
Source: readdata.py (project: Natural-Language-Processing-Python-and-NLTK, author: PacktPublishing)
def preprocessing(text):
    text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # remove words shorter than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # lowercase
    tokens = [word.lower() for word in tokens]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
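A hedged usage sketch: the imports mirror what the function body needs, the byte-string input matches the .decode("utf8") call at the top, and the NLTK punkt, stopwords, and wordnet data are assumed to be installed.

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

raw = "The quick brown foxes were jumping over the lazy dogs.".encode("utf8")
print(preprocessing(raw))
# roughly "the quick brown fox jumping lazy dog"; note the stopword check runs
# before lowercasing, so capitalized stopwords such as "The" slip through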
def get_sentence_tokens(text):
    '''
    Given a text (review), return the token list for each sentence.
    :param text:
    :return:
    '''
    sentences = sent_tokenize(text)
    sent_tokens = []
    for sentence in sentences:
        sent_token = word_tokenize(sentence)
        # remove stop words and empty tokens
        sent_token = [token for token in sent_token if (not token.strip() == '') and (token not in stopwords)]
        sent_tokens.append(sent_token)
    # stemming was also tried, but experiments showed it did not help:
    # if stemming:
    #     stemmer = PorterStemmer()
    #     texts = [[stemmer.stem(token) for token in text] for text in texts]
    return sent_tokens
def parse_gender(text):
    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]
    sents, words = count_gender(sentences)
    total = sum(words.values())
    for gender, count in words.items():
        pcent = (count / total) * 100
        nsents = sents[gender]
        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
        )
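count_gender is not defined in this snippet; below is a minimal sketch of a compatible implementation, assuming it classifies each tokenized sentence by gendered pronouns and tallies sentences and words per class (the word sets are illustrative, not the original lexicons).

from collections import Counter

MALE_WORDS = {"he", "him", "his", "himself"}      # illustrative lexicons only
FEMALE_WORDS = {"she", "her", "hers", "herself"}

def count_gender(sentences):
    # classify each tokenized sentence, then tally sentence and word counts per class
    sents, words = Counter(), Counter()
    for sentence in sentences:
        male = any(w in MALE_WORDS for w in sentence)
        female = any(w in FEMALE_WORDS for w in sentence)
        gender = ("both" if male and female
                  else "male" if male
                  else "female" if female
                  else "unknown")
        sents[gender] += 1
        words[gender] += len(sentence)
    return sents, words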
def ie_preprocess(self, document):
    """Take raw text, expand abbreviations that confuse the sentence splitter,
    split on newlines and bullet markers, and break each piece into sentences."""
    # Pre-processing: expand "e.g." ("exempli gratia") so it is not read as a sentence boundary
    document = document.replace("e.g.", "exempli gratia")
    # Split on newlines and asterisks before sentence tokenization
    split = re.split(r'\n|\*', document)
    # Sentence tokenizer (nltk.sent_tokenize)
    sentences = []
    for sent in split:
        sents = nltk.sent_tokenize(sent)
        length = len(sents)
        if length == 0:
            continue  # the original used a bare `next`, which is a no-op; `continue` is intended
        elif length == 1:
            sentences.append(sents[0])
        else:
            for i in range(length):
                sentences.append(sents[i])
    return sentences
def maybe_build_sentences(text_filename, sent_filename):
    sents = []
    if os.path.exists(sent_filename):
        fsent = open(sent_filename, "rb")
        for line in fsent:
            docid, sent_id, sent = line.strip().split("\t")
            sents.append(sent)
        fsent.close()
    else:
        ftext = open(text_filename, "rb")
        fsent = open(sent_filename, "wb")
        for line in ftext:
            docid, text = line.strip().split("\t")
            sent_id = 1
            for sent in nltk.sent_tokenize(text):
                sents.append(sent)
                fsent.write("{:d}\t{:d}\t{:s}\n"
                            .format(int(docid), sent_id, sent))
                sent_id += 1
        fsent.close()
        ftext.close()
    return sents
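A hedged usage sketch: the filenames are hypothetical, the input file holds one "<docid>\t<text>" record per line, and the binary-mode file handling assumes Python 2 (under Python 3 the files would need to be opened in text mode).

sentences = maybe_build_sentences("docs.tsv", "sentences.tsv")  # hypothetical paths
print(len(sentences), "sentences cached in sentences.tsv")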
def get_review_sentences():
    '''
    Read the Yelp reviews and return them after sentence segmentation.
    :return:
    '''
    review_file = io.open(FULL_YELP_REVIEW_PATH, 'r', encoding='utf-8')
    count_sentence = 0
    sentences = []
    for line in review_file:
        json_review = json.loads(line.strip())
        text = json_review.get("text").replace('\n', '').lower()
        raw_sentences = sent_tokenize(text)
        for raw_sentence in raw_sentences:
            if len(raw_sentence.strip()) > 0:
                sent_tokens = word_tokenize(raw_sentence)
                sentences.append(sent_tokens)
    return sentences
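Token lists in this shape are typically fed to a word-embedding trainer; a hedged sketch using gensim (the Word2Vec step is not part of the original snippet, FULL_YELP_REVIEW_PATH must point at a Yelp reviews JSON-lines file, and the keyword names assume gensim 4.x).

from gensim.models import Word2Vec

sentences = get_review_sentences()   # list of token lists, one per sentence
model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)
model.save("yelp_word2vec.model")    # hypothetical output path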
Source: AKE.py (project: NLP-Keyword-Extraction-Ensemble-Method, author: Ashwin-Ravi)
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    # (the original used a Python 2 tuple-unpacking lambda: lambda (word, pos, chunk): chunk != 'O')
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda wpc: wpc[2] != 'O') if key]
    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
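A hedged usage sketch; it assumes the NLTK punkt, averaged_perceptron_tagger, and stopwords data are installed.

sample = ("Keyword extraction pulls candidate noun phrases out of running text, "
          "for example from scientific abstracts.")
print(extract_candidate_chunks(sample))
# prints lowercased noun-phrase candidates such as 'keyword extraction'
# (the exact chunks depend on the POS tagger's output)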
def print_symptoms_from_page(url='', model='', stanford_jar=''):
    html_reader = HTMLReader(url)
    cleaned_text = html_reader.get_text_from_page()
    symptoms = set()
    # NERTagger comes from older NLTK releases (nltk.tag.stanford); newer releases call it StanfordNERTagger
    st = NERTagger(model, stanford_jar, encoding='utf-8')
    sentences = nltk.sent_tokenize(cleaned_text)
    for sentence in sentences:
        tags = st.tag(nltk.word_tokenize(sentence))
        tag_index = 0
        while tag_index < len(tags):
            if tags[tag_index][1] == 'SYMP':
                symptom = []
                while tag_index < len(tags) and tags[tag_index][1] != 'O':
                    symptom.append(tags[tag_index][0])
                    tag_index += 1
                symptoms.add(' '.join(symptom))
            else:
                tag_index += 1
    print("Found %d symptoms:" % len(symptoms))
    for symptom in symptoms:
        print(symptom)
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    ''' Extract spans matching a specific POS sequence rather than just noun phrases '''
    import itertools, nltk, string
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    # (the original used a Python 2 tuple-unpacking lambda here)
    candidates = [' '.join(word for word, pos, chunk in group)
                  for key, group in itertools.groupby(all_chunks, lambda wpc: wpc[2] != 'O') if key]
    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
def tokenize(self, document):
    # Break the document into sentences
    for sent in sent_tokenize(document):
        # Break the sentence into part-of-speech tagged tokens
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            # Apply preprocessing to the token
            token = token.lower() if self.lower else token
            token = token.strip() if self.strip else token
            token = token.strip('_') if self.strip else token
            token = token.strip('*') if self.strip else token
            # If stopword, ignore token and continue
            # if token in self.stopwords:
            #     continue
            # If punctuation, ignore token and continue
            if all(char in self.punct for char in token):
                continue
            # Lemmatize the token and yield
            lemma = self.lemmatize(token, tag)
            yield lemma
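This generator clearly belongs to a larger normalizer class whose lower, strip, and punct attributes and lemmatize() helper are not shown. A minimal sketch of what that class might look like follows; the class name and the tag-mapping helper are assumptions, not code from the original project.

import string
from nltk import pos_tag, sent_tokenize, wordpunct_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

class TextNormalizer(object):                 # hypothetical class name
    def __init__(self, lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        self.punct = set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def lemmatize(self, token, treebank_tag):
        # map the Penn Treebank tag to a WordNet POS, defaulting to noun
        tag = {'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ}.get(treebank_tag[0], wn.NOUN)
        return self.lemmatizer.lemmatize(token, tag)

    # the tokenize() generator shown above would be a method of this class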
def process(self, fc, context=None):
    text_source = self.config.get('text_source')
    if text_source and text_source in fc:
        text = fc[text_source]
    else:
        return fc
    names = defaultdict(StringCounter)
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label'):
                label = chunk.label()
                name = ' '.join(c[0] for c in chunk.leaves())
                if not isinstance(name, unicode):  # Python 2: coerce the name to unicode
                    name = unicode(name, 'utf-8')
                name = cleanse(name)
                # print(chunk.node, name)
                names[label][name] += 1
    for entity_type, name_counts in names.items():
        fc[entity_type] = name_counts
    return fc
def generate_vocab(filename, min_fre=5, prefix=""):
    vf = open("../data/" + prefix + "vocab_generate.txt", 'w')
    word = {}
    for line in open(filename):  # the original used Python 2's file(filename)
        line = line.strip()
        try:
            sentencesToken = nltk.sent_tokenize(line)
        except:
            continue
        for i in range(len(sentencesToken)):
            tokens = nltk.word_tokenize(sentencesToken[i])
            for token in tokens:
                word.setdefault(token, 0)
                word[token] += 1
    for char, num in sorted(word.items(), key=lambda x: x[1], reverse=True):
        if num < min_fre:
            break
        vf.write(char + " " + str(num) + "\n")
def extract_chunks(text_string, max_words=3, lemmatize=False):
    # Any number of adjectives followed by any number of nouns and (optionally) again
    # any number of adjectives followed by any number of nouns
    grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
    # Build chunks using the grammar regex
    chunker = nltk.RegexpParser(grammar)
    # Get grammatical functions of words: tag(sentence -> words)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text_string))
    # Make chunks from the sentences, using the grammar. Output is in IOB format.
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # Join phrases based on the IOB tags.
    candidates = [' '.join(w[0] for w in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda l: l[2] != 'O') if key]
    # Filter by maximum keyphrase length (the original hard-coded 3 here, ignoring max_words)
    candidates = list(filter(lambda l: len(l.split()) <= max_words, candidates))
    # Filter out phrases consisting of punctuation or stopwords
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    candidates = list(filter(lambda l: l not in stop_words and not all(c in punct for c in l), candidates))
    # lemmatize
    if lemmatize:
        lemmatizer = nltk.stem.WordNetLemmatizer().lemmatize
        candidates = [lemmatizer(x) for x in candidates]
    return candidates
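A hedged usage sketch; it assumes itertools, nltk, and string are imported at module level and the usual NLTK data packages are available.

import itertools, string
import nltk

text = "Deep neural networks have improved machine translation quality."
print(extract_chunks(text, max_words=3))
# prints lowercased noun-phrase chunks of up to three words, e.g. 'deep neural networks'
# (exact output depends on the POS tagger)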
def tokenizer():
    if len(request.vars) != 0:
        user_input = request.vars
        import sys
        reload(sys)                          # Python 2 idiom for resetting the default encoding
        sys.setdefaultencoding('utf-8')
        if user_input.parameter == "sentence":
            our_output = nltk.sent_tokenize(user_input.input, "english")
            print(user_input)
            if request.vars.filename != '' and len(request.vars.filename.value) != "":
                file_input = user_input.filename.value
                file_output = nltk.word_tokenize(file_input, "english")
            print(our_output)
        else:
            our_output = nltk.word_tokenize(user_input.input, "english")
            if request.vars.filename != '' and len(request.vars.filename.value) != None:
                file_input = user_input.filename.value
                file_output = nltk.word_tokenize(file_input, "english")
        user_input.output = our_output
    return locals()
def extract(text, paper=None, logger=logger):
    search_any = functools.partial(re_util.search_any, logger=logger)
    if not text and paper:
        try:
            text, _ = paper.get_text()
        except pdfutil.pdfutil.MalformedPDF as e:
            return None
    filters = [r'data documentation.*?shared']
    for sentence in nltk.sent_tokenize(text):
        match = search_any(filters, sentence)
        if match:
            source_type = "extracted"
            source_detail = "nltk search v1"
            value_text = sentence
            value_result = "Yes"
            return (value_text, value_result, source_type, source_detail)
    # if no match was found:
    source_type = "extracted"
    source_detail = "nltk search v1"
    value_text = "Not Found"
    value_result = "No"
    return (value_text, value_result, source_type, source_detail)
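A hedged usage sketch: re_util.search_any, paper, and pdfutil are project-specific, so the function is exercised with plain text only here.

value_text, value_result, source_type, source_detail = extract(
    "All data documentation and analysis code were shared in a public repository.")
print(value_result)  # "Yes" if some sentence matches the filter regex, otherwise "No"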
def extract(text, paper=None, logger=logger):
    search_any = functools.partial(re_util.search_any, logger=logger)
    if not text and paper:
        try:
            text, _ = paper.get_text()
        except pdfutil.pdfutil.MalformedPDF as e:
            return None
    for sentence in nltk.sent_tokenize(text):
        if search_any([r'data mine.*?source', r'text mine.*?shared'], sentence):
            # yapf: disable
            match = search_any([
                r"data mine.*?(\w*\d[\w\d/-]*)",
                r"text mine.*?(\w*\d[\w\d/-]*)"
            ], sentence)
            # yapf: enable
            source_type = "extracted"
            source_detail = "nltk search v1"
            value_text = sentence
            try:
                value_result = match.group(1).strip()
                return (value_text, value_result, source_type, source_detail)
            except AttributeError:  # no match was found
                return None
    return None
def extract(text, paper=None, logger=logger):
    search_any = functools.partial(re_util.search_any, logger=logger)
    if not text and paper:
        try:
            text, _ = paper.get_text()
        except pdfutil.pdfutil.MalformedPDF as e:
            return None
    filters = [r'analys(is|es)']
    for sentence in nltk.sent_tokenize(text):
        match = search_any(filters, sentence)
        if match and search_any([r'algorithm', r'summary', r'outline', r'statistic', r'table|graph', r'following'], sentence):
            source_type = "extracted"
            source_detail = "nltk search v1"
            value_text = sentence
            value_result = "Yes"
            return (value_text, value_result, source_type, source_detail)
    # if no match was found:
    source_type = "extracted"
    source_detail = "nltk search v1"
    value_text = "Not Found"
    value_result = "No"
    return (value_text, value_result, source_type, source_detail)
def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore")
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples
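A hedged usage sketch: the filename is hypothetical, and the input format (tab-separated story, question, answer, and a 0/1 correctness flag, with '#' comment lines) is inferred from the parsing code. The decode/encode chain assumes Python 2 byte strings.

triples = get_story_question_answer_triples("babi_sqa.tsv")  # hypothetical path
story_words, question_words, answer_words, is_correct = triples[0]
print(len(story_words), question_words, answer_words, is_correct)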
Source: sent-thoughts-parse.py (project: Deep-Learning-with-Keras, author: PacktPublishing)
def maybe_build_vocab(reuters_dir, vocab_file):
    vocab = collections.defaultdict(int)
    if os.path.exists(vocab_file):
        fvoc = open(vocab_file, "rb")
        for line in fvoc:
            word, idx = line.strip().split("\t")
            vocab[word] = int(idx)
        fvoc.close()
    else:
        counter = collections.Counter()
        num_docs_read = 0
        for doc in stream_reuters_documents(reuters_dir):
            if num_docs_read % 100 == 0:
                print("building vocab from {:d} docs"
                      .format(num_docs_read))
            topics = doc["topics"]
            if len(topics) == 0:
                continue
            title = doc["title"]
            body = doc["body"]
            title_body = ". ".join([title, body]).lower()
            for sent in nltk.sent_tokenize(title_body):
                for word in nltk.word_tokenize(sent):
                    counter[word] += 1
            for i, c in enumerate(counter.most_common(VOCAB_SIZE)):
                vocab[c[0]] = i + 1
            num_docs_read += 1
        print("vocab built from {:d} docs, complete"
              .format(num_docs_read))
        fvoc = open(vocab_file, "wb")
        for k in vocab.keys():
            fvoc.write("{:s}\t{:d}\n".format(k, vocab[k]))
        fvoc.close()
    return vocab
Source: sent-thoughts-parse.py (project: Deep-Learning-with-Keras, author: PacktPublishing; same file as above)
def build_numeric_text(vocab, text):
    wids = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            wids.append(vocab[word])
    return ",".join([str(x) for x in wids])
##################### main ######################
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word, so that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word, so that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            if token == 'and':
                token = ''
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens if len(t) > 0]
    return stems
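These tokenize_and_stem variants are typically passed to scikit-learn's TfidfVectorizer as a custom tokenizer for document clustering; a hedged sketch of that pattern (not taken from the original projects).

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(tokenizer=tokenize_and_stem, stop_words='english',
                        ngram_range=(1, 3))
tfidf_matrix = tfidf.fit_transform(["first sample document", "second sample document"])
print(tfidf_matrix.shape)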
def split_sentences(text):
    """
    Returns a list of the sentences in the text that is passed in.
    """
    return sent_tokenize(text)
Source: preprocess_data.py (project: kaggle_redefining_cancer_treatment, author: jorgemf)
def tokenize_documents(documents):
    for document in documents:
        text = document.text
        tokenized_doc = []
        for sent in nltk.sent_tokenize(text):
            tokenized_doc += nltk.word_tokenize(sent)
        document.text = tokenized_doc
def _tokenize_corpus_into_list_of_tokenized_sentences(cls, corpus):
    tokenized_corpus = nltk.sent_tokenize(corpus)
    tokenized_corpus = [cls._clean_sentence(sentence) for sentence in tokenized_corpus]
    return [nltk.word_tokenize(sentence) for sentence in tokenized_corpus]
def extract(self, text, max_length=3, metric='avg', incl_scores=False):
    """Extract keywords and keyphrases from the input text, in descending order of score."""
    sentences = nltk.sent_tokenize(text)
    phrase_list = self._generate_candidate_keywords(sentences, max_length=max_length)
    word_scores = self._calculate_word_scores(phrase_list)
    phrase_scores = self._calculate_phrase_scores(phrase_list, word_scores, metric=metric)
    # Python 2 idioms below: dict.iteritems() and a lazy map(); on Python 3 use .items() and list(map(...))
    sorted_phrase_scores = sorted(phrase_scores.iteritems(), key=operator.itemgetter(1), reverse=True)
    n_phrases = len(sorted_phrase_scores)
    if incl_scores:
        return sorted_phrase_scores[0:int(n_phrases / self.top_fraction)]
    else:
        return map(lambda x: x[0], sorted_phrase_scores[0:int(n_phrases / self.top_fraction)])
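A hedged usage sketch: the class around this method and its helper functions are not shown, so the class name and constructor below are assumptions.

extractor = RakeKeywordExtractor()   # hypothetical class name
top = extractor.extract(
    "Compatibility of systems of linear constraints over the set of natural numbers.",
    incl_scores=True)
for phrase, score in top:
    print(phrase, score)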