import sys

def load_data_from_json2(path_to_json, test_split, vocabulary_size):
    '''
    Load training and test data from a JSON file.
    :param path_to_json: path to the JSON file
    :param test_split: fraction of the data reserved for the test set
    :param vocabulary_size: maximum vocabulary size used for word indexing
    :return: (train, test, texts) where train and test are tuples of
        (X, y, sentence lengths, sample weights) and texts holds the raw sentences
    '''
    X = []
    y = []
    len_sent_array = []
    sample_weight = []
    objects = read_json_file(path_to_json)
    print('Data: %d sentences' % len(objects))
    i = 0
    original_sentence_array = []
    compression_sentence_array = []
    word2index_dict, _ = word2index(objects, vocabulary_size)
    for obj in objects:
        original_sentence, compression_sentence = get_originalSent_compressionSent(obj)
        array_sent, sample_w = word2vec(original_sentence, word2index_dict)
        X.append(array_sent)
        sample_weight.append(sample_w)
        y_l, l = label_compress(original_sentence, compression_sentence)
        y.append(y_l)
        len_sent_array.append(l)
        i += 1
        if i % 100 == 0:
            sys.stdout.write('.')
        # Keep the raw text for later inspection.
        original_sentence_array.append(original_sentence)
        compression_sentence_array.append(compression_sentence)
    # Split: the first test_split fraction becomes the test set, the rest is training data.
    split = int(len(X) * test_split)
    train = (X[split:], y[split:], len_sent_array[split:], sample_weight[split:])
    test = (X[:split], y[:split], len_sent_array[:split], sample_weight[:split])
    return train, test, (original_sentence_array, compression_sentence_array)
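# A minimal usage sketch, assuming a hypothetical 'sentences.json' file and that the
# helpers used above (read_json_file, word2index, word2vec, label_compress,
# get_originalSent_compressionSent) are in scope.
if __name__ == '__main__':
    train, test, texts = load_data_from_json2('sentences.json', test_split=0.1,
                                              vocabulary_size=20000)
    X_train, y_train, train_lengths, train_weights = train
    print('train sentences: %d, test sentences: %d' % (len(X_train), len(test[0])))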
def feeds(self):
    """
    Opens and returns the collection of feeds associated with the corpus.
    """
    data = self.open('feeds.json')
    return json.load(data)
def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and returns a dictionary with a
    variety of metrics concerning the state of the corpus.
    """
    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()
    started = time.time()

    # Perform a single pass over paragraphs, tokenize and count.
    for para in self.paras(fileids, categories):
        counts['paras'] += 1

        for sent in self._sent_tokenizer.tokenize(para):
            counts['sents'] += 1

            for word in self._word_tokenizer.tokenize(sent):
                counts['words'] += 1
                tokens[word] += 1

    # Compute the number of files and categories in the corpus.
    n_fileids = len(self._resolve(fileids, categories) or self.fileids())
    n_topics = len(self.categories(self._resolve(fileids, categories)))

    # Return data structure with information.
    return {
        'files': n_fileids,
        'topics': n_topics,
        'paras': counts['paras'],
        'sents': counts['sents'],
        'words': counts['words'],
        'vocab': len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
        'ppdoc': float(counts['paras']) / float(n_fileids),
        'sppar': float(counts['sents']) / float(counts['paras']),
        'secs': time.time() - started,
    }
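# Hedged usage sketch: 'PickledCorpusReader' and the corpus path are assumptions; any
# corpus reader exposing paras(), fileids() and categories() as above would work.
if __name__ == '__main__':
    corpus = PickledCorpusReader('corpus/')   # hypothetical reader and path
    stats = corpus.describe()
    print('%(files)d files, %(words)d words, %(vocab)d distinct tokens' % stats)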
def html(self, fileids=None, categories=None):
    """
    The preprocessed pickles do not contain HTML data.
    """
    raise TypeError(
        "Preprocessed corpus does not contain HTML data."
    )
def prep_data(data):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sents = sent_detector.tokenize(data['content'].strip())
    sent_dict = {str(uuid.uuid4()): {'text': x} for x in sents[:2]}
    data['sents'] = sent_dict
    return data
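# A small illustration of prep_data on a hypothetical article dict: only the first two
# sentences are kept, each keyed by a fresh UUID (nltk and uuid are assumed imported).
if __name__ == '__main__':
    article = {'content': 'First sentence. Second sentence. Third sentence.'}
    prepared = prep_data(article)
    for sent_id, sent in prepared['sents'].items():
        print(sent_id, sent['text'])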
def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print(g)
    print()
def setup_module(module):
    from nose import SkipTest
    import nltk.data
    try:
        nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/')
    except LookupError as e:
        print(e)
        raise SkipTest("The CHILDES corpus is not found. "
                       "It should be manually downloaded and saved/unpacked "
                       "to [NLTK_Data_Dir]/corpora/childes/")
def data(self):
    for name in self.names:
        f = nltk.data.find(name)
        with f.open() as fp:
            file_data = fp.read().decode('utf8')
        yield f, file_data
def test_correct_length(self):
    # Check that the corpus views report the correct lengths:
    for f, file_data in self.data():
        v = StreamBackedCorpusView(f, read_whitespace_block)
        self.assertEqual(len(v), len(file_data.split()))

        v = StreamBackedCorpusView(f, read_line_block)
        self.assertEqual(len(v), len(self.linetok.tokenize(file_data)))
def augment(self, data):
    """
    Add more data to the ``Concept``'s extension set.

    :param data: a new semantic value
    :type data: string or pair of strings
    :rtype: set
    """
    self._extension.add(data)
    self.extension = sorted(list(self._extension))
    return self._extension
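# Hedged example: the Concept constructor arguments below are assumptions about how the
# class is built elsewhere in this module; augment() adds one value at a time and keeps
# the sorted `extension` attribute in sync with the underlying set.
if __name__ == '__main__':
    c = Concept('city', arity=1, extension=set())   # constructor signature assumed
    c.augment('athens')
    c.augment('berlin')
    print(c.extension)   # ['athens', 'berlin']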
def _str2records(filename, rel):
    """
    Read a file into memory and convert each relation clause into a list.
    """
    recs = []
    contents = nltk.data.load("corpora/chat80/%s" % filename, format="text")
    for line in contents.splitlines():
        if line.startswith(rel):
            line = re.sub(rel + r'\(', '', line)
            line = re.sub(r'\)\.$', '', line)
            record = line.split(',')
            recs.append(record)
    return recs
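# Illustrative sketch of the record format, assuming the chat80 corpus is installed and
# that `re` and `nltk.data` are imported: a clause such as "city(athens,greece,1368)."
# in cities.pl would yield ['athens', 'greece', '1368'].
if __name__ == '__main__':
    records = _str2records('cities.pl', 'city')
    print(records[0])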
def process_bundle(rels):
    """
    Given a list of relation metadata bundles, make a corresponding
    dictionary of concepts, indexed by the relation name.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list(dict)
    :return: a dictionary of concepts, indexed by the relation name.
    :rtype: dict(str): Concept
    """
    concepts = {}
    for rel in rels:
        rel_name = rel['rel_name']
        closures = rel['closures']
        schema = rel['schema']
        filename = rel['filename']

        concept_list = clause2concepts(filename, rel_name, schema, closures)
        for c in concept_list:
            label = c.prefLabel
            if label in concepts:
                for data in c.extension:
                    concepts[label].augment(data)
                concepts[label].close()
            else:
                concepts[label] = c
    return concepts
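# Hedged sketch of the metadata bundle expected by process_bundle; the field values below
# mirror the chat80 'city' relation and are assumptions, and clause2concepts (not shown
# here) must be in scope.
if __name__ == '__main__':
    rels = [{'rel_name': 'city',
             'closures': [],
             'schema': ['city', 'country', 'population'],
             'filename': 'cities.pl'}]
    concepts = process_bundle(rels)
    print(sorted(concepts.keys()))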
def val_load(db):
    """
    Load a ``Valuation`` from a persistent database.

    :param db: name of file from which data is read.
        The suffix '.db' should be omitted from the name.
    :type db: str
    """
    dbname = db + ".db"

    if not os.access(dbname, os.R_OK):
        sys.exit("Cannot read file: %s" % dbname)
    else:
        db_in = shelve.open(db)
        from nltk.sem import Valuation
        val = Valuation(db_in)
        # val.read(db_in.items())
        return val
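# Usage sketch: assumes a 'world.db' shelve file was written earlier (for example by a
# matching dump helper, not shown here). Valuation behaves like a dict, so len() works.
if __name__ == '__main__':
    val = val_load('world')   # the '.db' suffix is added inside val_load
    print(len(val))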
#def alpha(str):
    #"""
    #Utility to filter out non-alphabetic constants.

    #:param str: candidate constant
    #:type str: string
    #:rtype: bool
    #"""
    #try:
        #int(str)
        #return False
    #except ValueError:
        ## some unknown values in records are labeled '?'
        #if not str == '?':
            #return True
def tokenize_sentences(text):
    import nltk.data
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    return sent_tokenizer.tokenize(text)
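# Simple usage example for tokenize_sentences; the sample text is arbitrary.
if __name__ == '__main__':
    for sent in tokenize_sentences('Dr. Smith arrived. He was late.'):
        print(sent)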
def readFileOfReviews():
    # Read each review from file.
    global reviewsLst
    with open("data.txt", "rb") as preview:
        reviewsLst = pickle.load(preview)
def load_data():
    """
    Loads and preprocesses data for the MR dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]
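# Hedged usage sketch for load_data; load_data_and_labels, pad_sentences, build_vocab and
# build_input_data are assumed to be the usual helpers of this text-classification setup,
# and x is assumed to be a NumPy array.
if __name__ == '__main__':
    x, y, vocabulary, vocabulary_inv = load_data()
    print('dataset shape:', x.shape, 'vocabulary size:', len(vocabulary))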