def read_folder(self, folder_name, number_of_files_to_read=10000):
"""
Reads all .txt files in a directory, splits them into sentences and returns those sentences as a single list.
Args:
folder_name: the name of the folder to read files from
number_of_files_to_read: optional cap on how many files in the directory to read
Returns:
A list of all sentences from all text files in the folder
"""
count = 0
all_sentences = []
for filename in os.listdir(folder_name):
if filename.endswith(".txt") and count < number_of_files_to_read:
main_text_to_open = folder_name + "/" + filename
main_text = self.open_file_single_string(main_text_to_open)
udata = main_text.decode("utf-8")
main_text = udata.encode("ascii", "ignore")
sentences = sent_tokenize(main_text)
for sentence in sentences:
all_sentences.append(sentence)
count += 1
return all_sentences
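A minimal usage sketch for read_folder; the class name CorpusReader and the ./articles folder below are assumptions, since the snippet only shows the method itself.
# hypothetical usage; CorpusReader and ./articles are assumptions, not part of the source
reader = CorpusReader()
sentences = reader.read_folder("./articles", number_of_files_to_read=5)
print(len(sentences))   # total number of sentences across the first 5 .txt files
print(sentences[:3])    # first three sentences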
Python sent_tokenize() example source code
def create_batch(self, sentence_li):
"""Create a batch for a list of sentences."""
embeddings_batch = []
for sen in sentence_li:
embeddings = []
sent_toks = sent_tokenize(sen)
word_toks = [word_tokenize(el) for el in sent_toks]
tokens = [val for sublist in word_toks for val in sublist]
tokens = [el for el in tokens if el != '']
for tok in tokens:
embeddings.append(self.embdict.tok2emb.get(tok))
if len(tokens) < self.max_sequence_length:
pads = [np.zeros(self.embedding_dim) for _ in range(self.max_sequence_length - len(tokens))]
embeddings = pads + embeddings
else:
embeddings = embeddings[-self.max_sequence_length:]
embeddings = np.asarray(embeddings)
embeddings_batch.append(embeddings)
embeddings_batch = np.asarray(embeddings_batch)
return embeddings_batch
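A usage sketch for create_batch, assuming `encoder` is an instance that exposes embdict, embedding_dim and max_sequence_length as used above.
# hypothetical usage; `encoder` is an assumed instance of the class shown above
batch = encoder.create_batch(["How are you today?", "Fine, thanks."])
# shape is (2, max_sequence_length, embedding_dim) when every token has an embedding;
# unknown tokens map to None via tok2emb.get and would yield an object array instead
print(batch.shape)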
def article_to_pairs(arg):
article, direction = arg
pairs = []
if 'text' not in article:
return []
sents = sent_tokenize(article['text'], language='norwegian')
translations = translate(sents, direction)
for sent, trans in zip(sents, translations):
trans_tokens = tokenize(trans)
tokens = tokenize(sent)
pairs += compare(tokens, trans_tokens)
del article
del sents
del translations
return pairs
NewsArticleClass.py (project: Python-Scripts-Repo-on-Data-Science, author: qalhata)
def extractFeatures(self, article, n, customStopWords=None):
# pass in article as a tuple ( text, title)
text = article[0]
# extract the text
title = article[1]
# extract the title
sentences = sent_tokenize(text)
# split text into sentences
word_sent = [word_tokenize(s.lower()) for s in sentences]
# split sentences into words
self._freq = self._compute_frequencies(word_sent, customStopWords)
# calculate word freq using member func created above
if n < 0:
# how many features (words) to return - a negative number means
# no feature (word) selection, just return all features
return nlargest(len(self._freq),
self._freq, key=self._freq.get)
else:
# here we say if the calling function has asked for a subset
# then return only the 'n' largest features, i.e. the
# most important words ( important == frequent, less stopwords)
return nlargest(n, self._freq, key=self._freq.get)
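A usage sketch for extractFeatures; the class name FrequencySummarizer is assumed here, since only the methods appear in these snippets.
# hypothetical usage; FrequencySummarizer and its constructor are assumptions
fs = FrequencySummarizer()
article = ("The markets rallied today. Tech stocks led the markets higher.", "Markets rally")
print(fs.extractFeatures(article, 3))   # the three most frequent non-stopword terms
print(fs.extractFeatures(article, -1))  # a negative n returns every term, most frequent first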
NewsArticleClass.py (project: Python-Scripts-Repo-on-Data-Science, author: qalhata)
def summarize(self, article, n):
text = article[0]
title = article[1]
sentences = sent_tokenize(text)
word_sent = [word_tokenize(s.lower()) for s in sentences]
self._freq = self._compute_frequencies(word_sent)
ranking = defaultdict(int)
for i, sentence in enumerate(word_sent):
for word in sentence:
if word in self._freq:
ranking[i] += self._freq[word]
sentences_index = nlargest(n, ranking, key=ranking.get)
return [sentences[j] for j in sentences_index]
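A usage sketch for summarize, again assuming the surrounding class is called FrequencySummarizer.
# hypothetical usage; the class name is an assumption
fs = FrequencySummarizer()
article = ("Rates rose sharply. Banks gained on the rate news. The weather stayed mild.", "Rates")
print(fs.summarize(article, 2))   # the two sentences with the highest frequency score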
##############################################################################
# TEST
def mmap_extract(text):
"""
Function-wrapper for metamap binary. Extracts concepts
found in text.
!!!! REMEMBER TO START THE METAMAP TAGGER AND
WordSense DISAMBIGUATION SERVER !!!!
Input:
- text: str,
a piece of text or sentence
Output:
- concepts: list,
list of metamap concepts extracted
"""
# Tokenize into sentences
sents = sent_tokenize(text)
mm = MetaMap.get_instance(settings['load']['path']['metamap'])
concepts, errors = mm.extract_concepts(sents, range(len(sents)),
word_sense_disambiguation=True)
if errors:
print 'Errors with extracting concepts!'
print errors
return concepts
def person_connotation(tweet, name):
"""
Decide whether a person is talked favorably about or not, based on the
tone of the sentences in which their name appears
"""
twtcontent = sent_tokenize(tweet)
overall = {'compound': 0, 'neg': 0, 'neu': 0, 'pos': 0}
mentions = 0
# analyze each sentence talking about `name` person
for s in twtcontent:
tags = get_tweet_tags(s)
# if the name appears in the tagged sentence, get its tone
if (name, 'NNP') in tags:
sentence = util.untag(tags)
scores = tweet_connotation(' '.join(sentence))
# add it up to the overall tweet's tone
for z in scores:
overall[z] += scores[z]
mentions += 1
# averaging all sentences' scores. don't wanna divide by zero now do we
if mentions != 0:
for v in overall:
overall[v] = round(overall[v] / mentions, 3)
return overall
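A usage sketch for person_connotation; get_tweet_tags, tweet_connotation and util.untag are assumed to be available as referenced above.
# hypothetical usage; the helper functions referenced above are assumed to exist
tweet = "Alice gave a brilliant speech today. The crowd cheered for Alice."
print(person_connotation(tweet, "Alice"))
# averaged VADER-style scores over the sentences that mention Alice,
# e.g. {'compound': ..., 'neg': ..., 'neu': ..., 'pos': ...}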
def make_summaries():
terms = Terms.objects.all()
removals = ['DEFINITION', 'BREAKING DOWN', 'What is']
for term in terms:
try:
summary = summarizer(term.text, settings.SUMMARIZER_SENTENCES)
sentence_tokens = sent_tokenize(summary)
text = ''
for sentence in sentence_tokens:
if not any(to_remove in sentence for to_remove in removals):
text += "{0} ".format(sentence.replace(r'\A[\d]\S\s', ''))
term.summary = summarizer(text, settings.SUMMARIZER_SENTENCES)
term.save()
except Exception as e:
print((colored.red("[ERROR] At terms summarizer: {0}".format(e))))
def clean_video(video):
text = []
try:
if len(video.description) > 0:
sentence_tokens = sent_tokenize(video.description)
for sentence in sentence_tokens:
if not ('http' in sentence):
text.append("{0} ".format(sentence))
video.description = "".join("{} ".format(s) for s in text)
video.save()
if settings.SHOW_DEBUG:
print(colored.green("Cleaned video description saved to db: {0}".format(video.title)))
except Exception as e:
print(colored.red("At clean_video {}".format(e)))
def doc_to_ids(self, doc, training=True):
l = []
words = dict()
window = 150
# doc = doc.replace("–", " ")
# doc = sent_tokenize(doc)
for sentence in doc:
miniArray = []
for term in sentence:
id = self.term_to_id(term, training)
if id is not None:
miniArray.append(id)
if not id in words:
words[id] = 1
self.docfreq[id] += 1
if not len(miniArray):
continue
if len(miniArray) > window:
l.extend([np.array(miniArray[i:i+window]) for i in xrange(0, len(miniArray), window)])
else:
l.append(np.array(miniArray))
return l
def summarize(self, text, n):
"""
Return a list of n sentences
which represent the summary of text.
"""
sents = sent_tokenize(text)
assert n <= len(sents)
word_sent = [word_tokenize(s.lower()) for s in sents]
self._freq = self._compute_frequencies(word_sent)
ranking = defaultdict(int)
for i,sent in enumerate(word_sent):
for w in sent:
if w in self._freq:
ranking[i] += self._freq[w]
sents_idx = self._rank(ranking, n)
return [sents[j] for j in sents_idx]
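A usage sketch for this summarize variant, which takes raw text rather than a (text, title) tuple; the class name FrequencySummarizer is assumed.
# hypothetical usage; note the assert requires n <= number of sentences in the text
summ = FrequencySummarizer()
text = "Rates rose. Banks gained on the rate news. The weather stayed mild."
print(summ.summarize(text, 2))   # two top-ranked sentences
# summ.summarize(text, 5) would trip the assert, since the text has only three sentences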
def parse_xml_language_similarity(file_read,file_write):
count = 0
with open(file_read,'r') as f, open(file_write,'w') as out:
for line in f:
count += 1
if count % 1000 == 0: print(count)
if "row Id" in line:
line = line.strip()
root = xml.etree.ElementTree.fromstring(line)
try:
body = remove_tags(root.get('Body'))
title = remove_tags(root.get('Title'))
body_sentences = sent_tokenize(body)
title_sentences = sent_tokenize(title)
for line in body_sentences:
out.write(line+"\n")
for line in title_sentences:
out.write(line+"\n")
except:
continue
def train(self, chain_len = None):
""" Trains the markov data structure by creating chains of desired length """
if not chain_len:
chain_len = self.CHAIN_LENGTH
self.CHAIN_LEN = chain_len
self.everything['corpus'] = {}
self.corpus = self.everything['corpus']
for f in self.everything['input']:
for line in sent_tokenize( self.everything['input'][f] ):
words = word_tokenize(line)
for chain in self._make_chains(words):
k = " ".join( chain[:-1] ) # key is everything but last word
v = chain[-1] # value is last word
try:
self.corpus[k].append(v)
except:
self.corpus[k] = [v]
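A standalone sketch of the key-to-next-word corpus that train builds, assuming _make_chains yields sliding windows of chain_len consecutive words (that helper is not shown here).
# illustration only; the sliding-window behaviour of _make_chains is an assumption
from collections import defaultdict
from nltk import word_tokenize

chain_len = 3
corpus = defaultdict(list)
words = word_tokenize("the cat sat on the mat")
for i in range(len(words) - chain_len + 1):
    chain = words[i:i + chain_len]
    corpus[" ".join(chain[:-1])].append(chain[-1])   # key: all but last word, value: last word
print(dict(corpus))   # {'the cat': ['sat'], 'cat sat': ['on'], 'sat on': ['the'], 'on the': ['mat']}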
def nltk_extract_claims(text):
"""
Attempts to extract claims as a list from a large text string.
Uses nltk sent_tokenize function in tokenize library
param string text: string containing several claims
"""
sent_list = sent_tokenize(text)
# On a test string this returned a list with the claim number
# and then the claim text as separate items
claims_list = []
for i in range(0, len(sent_list), 2):
try:
number = int(sent_list[i].split(".")[0])
except:
number = 0
claims_list.append(
(number, sent_list[i+1])
)
return claims_list
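A usage sketch for nltk_extract_claims; as the comment above notes, the number/text pairing depends on how punkt splits the claim numbering, and an odd-length split would raise an IndexError at sent_list[i+1].
# hypothetical usage; output depends on how punkt tokenizes the numbering
claims_text = ("1. A widget comprising a frame. "
               "2. The widget of claim 1, wherein the frame is metal.")
for number, claim in nltk_extract_claims(claims_text):
    print(number, claim)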
def check_sentence(text):
"""
Check, that only one sentence was provided.
>>> QASystem.check_sentence("Example sentence.")
>>> QASystem.check_sentence("Example sentence. Another example.")
Traceback (most recent call last):
core.MultipleSentences: ['Example sentence.', 'Another example.']
Args:
text (str): provided question/answer.
Returns:
None
Raises:
MultipleSentenceQuestion: in case of more than one sentence inside
of the text string.
"""
sent_tokenize_list = sent_tokenize(text) # nltk tokenize sentence
if len(sent_tokenize_list) > 1:
raise MultipleSentences(sent_tokenize_list)
def read_yelp(file_name='yelp_academic_dataset_review.json'):
f = open(file_name)
f = f.readlines()
f = [json.loads(l.strip()) for l in f]  # json.loads rather than eval for JSON lines (requires the json module)
stars = [i['stars'] for i in f]
text = [i['text'] for i in f]
df = pd.DataFrame()
df['stars'] = stars
df['text'] = text
#compute the number of sentences in each doc
l = list(df.text)
text = [sent_tokenize(i) for i in list(df.text)]
text_len = [len(i) for i in text]
#2225188 in total
#2089287 for length<=20
#1654640 for length<=10
#We decide to only consider length<=7 here
df['length'] = text_len
df['text_split'] = text
return df
def get_sentiment(song):
scores = dict([('pos', 0), ('neu', 0), ('neg', 0), ('compound', 0)])
if not song:
return scores
raw_text = song
raw_text = re.sub("\n", ". ", str(raw_text))
# Using already trained
sid = SentimentIntensityAnalyzer()
sentences = tokenize.sent_tokenize(raw_text)
for sentence in sentences:
ss = sid.polarity_scores(sentence)
for k in sorted(ss):
scores[k] += ss[k]
return scores
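A usage sketch for get_sentiment; it needs nltk's vader_lexicon and punkt data plus the re, tokenize and SentimentIntensityAnalyzer imports used above.
# usage sketch; requires nltk.download('vader_lexicon') and nltk.download('punkt')
lyrics = "I love the sunshine\nBut I hate the rain"
print(get_sentiment(lyrics))
# VADER scores summed across the two sentences, keyed by pos/neu/neg/compound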
def offset_tokenize(text):
tail = text
accum = 0
tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
info_tokens = []
for tok in tokens:
scaped_tok = re.escape(tok)
m = re.search(scaped_tok, tail)
start, end = m.span()
# global offsets
gs = accum + start
ge = accum + end
accum += end
# keep searching in the rest
tail = tail[end:]
info_tokens.append((tok, (gs, ge)))
return info_tokens
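A usage sketch for offset_tokenize; it depends only on nltk and re as used above, and returns each token with its character span in the original string.
# usage sketch, assuming nltk punkt data is available
text = "Dogs bark. Cats meow."
print(offset_tokenize(text))
# [('Dogs', (0, 4)), ('bark', (5, 9)), ('.', (9, 10)),
#  ('Cats', (11, 15)), ('meow', (16, 20)), ('.', (20, 21))]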
def parse_xml_all(self, data_file, doc_type, language='english'):
e = ET.parse(data_file)
cluster_data = {}
root = e.getroot()
for topics in root:
data = []
topic_id = topics.attrib.get('id')
for documents in topics.findall(doc_type):
doc_id = documents.attrib.get('id')
if doc_type == 'document':
title_text = documents.find('title').text
doc_text = documents.find('text').text
text = text_normalization(doc_text)
doc_sents = sent_tokenize(text, language)
data.append([doc_id, doc_sents])
cluster_data[topic_id] = data
return cluster_data
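A usage sketch for parse_xml_all; the XML layout below is inferred from the element and attribute names read above, and `parser` (the enclosing instance) plus text_normalization are assumed to exist.
# hypothetical usage; the XML layout is inferred from the parser code above
sample = """<root>
  <topic id="d01">
    <document id="doc1">
      <title>Example title</title>
      <text>First sentence. Second sentence.</text>
    </document>
  </topic>
</root>"""
with open("sample.xml", "w") as f:
    f.write(sample)
clusters = parser.parse_xml_all("sample.xml", "document")
print(clusters)   # roughly {'d01': [['doc1', ['First sentence.', 'Second sentence.']]]}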
def analysis(self, paragraph):
''' analysis sentiment given paragraph
'''
result = 0
counter = 0
sentences = tokenize.sent_tokenize(paragraph)
for sentence in sentences:
sentiment = self.analyzer.polarity_scores(sentence)['compound']
if sentiment > SentimentAnalyzer.neutral_threshold[0] and \
sentiment < SentimentAnalyzer.neutral_threshold[1]:
continue
counter += 1
result += sentiment
result = result / float(counter) if counter > 0 else 0
return result
def add_items(self, sentence_li):
"""Add new items to the tok2emb dictionary from a given text."""
for sen in sentence_li:
sent_toks = sent_tokenize(sen)
word_toks = [word_tokenize(el) for el in sent_toks]
tokens = [val for sublist in word_toks for val in sublist]
tokens = [el for el in tokens if el != '']
for tok in tokens:
if self.tok2emb.get(tok) is None:
self.tok2emb[tok] = self.fasttext_model[tok]
def get_sentiment_from_paragraph(paragraph):
sentence_list = tokenize.sent_tokenize(paragraph)
paragraphSentiments = 0.0
for sentence in sentence_list:
vs = analyzer.polarity_scores(sentence)
paragraphSentiments += vs["compound"]
return round(paragraphSentiments/len(sentence_list), 4)
def pre_processing(tokenizer, truecaser, info):
# SPLIT THE WHITESPACES
source_file_t = re.split('([\t\n\r\f\v]+)', info['src'])
# SENTENCE TOKENIZE
for i in range(len(source_file_t)):
if i % 2 == 0:
source_file_t[i] = sent_tokenize(source_file_t[i])
# TOKENIZATION
if info['tok']:
for j in range(len(source_file_t)):
if j % 2 == 0:
for i in range(len(source_file_t[j])):
try:
source_file_t[j][i] = str(
tokenizer.tokenize(source_file_t[j][i], return_str=True).encode('utf-8'))
except NameError:
source_file_t[j][i] = str(' '.join(source_file_t[j][i].split('.') + ['.']))
# TRUECASING
if info['tc']:
for j in range(len(source_file_t)):
if j % 2 == 0:
for i in range(len(source_file_t[j])):
source_file_t[j][i] = str((truecasing(truecaser, source_file_t[j][i].split(' ')[0]).decode(
'utf-8') + " " + (' '.join(source_file_t[j][i].split(' ')[1:]).decode('utf-8'))).encode('utf-8'))
print source_file_t[j][i]
# IF NEITHER
if not (info['tc'] or info['tok']):
for j in range(len(source_file_t)):
if j % 2 == 0:
for i in range(len(source_file_t[j])):
try:
source_file_t[j][i] = str(source_file_t[j][i].encode('utf-8'))
except NameError:
source_file_t[j][i] = str(' '.join(source_file_t[j][i].split('.') + ['.']))
return source_file_t
NewsArticleClass.py (project: Python-Scripts-Repo-on-Data-Science, author: qalhata)
def extractRawFrequencies(self, article):
# this method is similar to the one above but returns
# the raw frequencies (all word counts)
text = article[0]
title = article[1]
sentences = sent_tokenize(text)
word_sent = [word_tokenize(s.lower()) for s in sentences]
freq = defaultdict(int)
for s in word_sent:
for word in s:
if word not in self._stopwords:
freq[word] += 1
return freq
def sentence(text):
'''Break the text into sentences'''
return sent_tokenize(text)
def getSentences(self):
self.sentences = sent_tokenize(self.text)
def metamap_wrapper(text):
"""
Function-wrapper for metamap binary. Extracts concepts
found in text.
!!!! REMEMBER TO START THE METAMAP TAGGER AND
WordSense DISAMBIGUATION SERVER !!!!
Input:
- text: str,
a piece of text or sentence
Output:
- a dictionary with key sents and values
a list of the concepts found
"""
# Tokenize into sentences
sents = sent_tokenize(text)
# Load Metamap Instance
mm = MetaMap.get_instance(settings['load']['path']['metamap'])
concepts, errors = mm.extract_concepts(sents, range(len(sents)))
# Keep the sentence ids
ids = np.array([int(concept[0]) for concept in concepts])
sentences = []
for i in xrange(len(sents)):
tmp = {'sent_id': i+1, 'entities': [], 'relations': []}
# Wanted concepts according to sentence
wanted = np.where(ids == i)[0].tolist()
for w_ind in wanted:
w_conc = concepts[w_ind]
if hasattr(w_conc, 'cui'):
tmp_conc = {'label': w_conc.preferred_name, 'cui': w_conc.cui,
'sem_types': w_conc.semtypes, 'score': w_conc.score}
tmp['entities'].append(tmp_conc)
sentences.append(tmp)
if errors:
time_log('Errors with extracting concepts!')
time_log(errors)
return {'sents': sentences, 'sent_text':text}
def reverb_wrapper(text, stop=None):
"""
Function-wrapper for ReVerb binary. Extracts relations
found in text.
Input:
- text: str,
a piece of text or sentence
- stop: list,
list of stopwords to remove from the relations
Output:
- total: list,
list of lists. Each inner list contains one relation in the form
[subject, predicate, object]
"""
total = []
for sent in sent_tokenize(text):
cmd = 'echo "' + sent + '"' "| ./reverb -q | tr '\t' '\n' | cat -n"
reverb_dir = settings['load']['path']['reverb']
result = runProcess(cmd, reverb_dir)
# Extract relations from reverb output
result = result[-3:]
result = [row.split('\t')[1].strip('\n') for row in result]
# Remove common stopwords from relations
if stop:
result = [stopw_removal(res, stop) for res in result]
total.append(result)
# Remove empty relations
total = [t for t in total if t]
return total
def extract_entities(text, json_={}):
"""
Extract entities from a given text using metamap and
generate a json, preserving info regarding the sentence
of each entity that was found. For the time being, we preserve
both concepts and the entities related to them
Input:
- text: str,
a piece of text or sentence
- json_: dic,
sometimes the json to be returned is given to us to be enriched
Defaults to an empty json_
Output:
- json_: dic,
json with fields text, sents, concepts and entities
containing the final results
"""
json_['text'] = text
# Tokenize the text
sents = sent_tokenize(text)
json_['sents'] = [{'sent_id': i, 'sent_text': sent} for i, sent in enumerate(sents)]
json_['concepts'], _ = mmap_extract(text)
json_['entities'] = {}
for i, sent in enumerate(json_['sents']):
ents = metamap_ents(sent)
json_['entities'][sent['sent_id']] = ents
return json_