Example source code for Python data()

preprocess2.py (project: SCDL, author: lngvietthang)
import sys

def load_data_from_json2(path_to_json, test_split, vocabulary_size):
    '''
    Load data for training and testing from a json file.
    :param path_to_json: path to the json file
    :param test_split: fraction of the data reserved for testing
    :param vocabulary_size: number of words kept in the vocabulary
    :return: (train tuple, test tuple, (original sentences, compressed sentences))
    '''
    X=[]
    y=[]
    len_sent_array=[]
    sample_weight=[]
    objects = read_json_file(path_to_json)
    print('Data: %d sentences' % len(objects))
    i=0
    original_sentence_array=[]
    compression_sentence_array=[]
    word2index_dict, _ = word2index(objects, vocabulary_size)
    for obj in objects:
        original_sentence, compression_sentence = get_originalSent_compressionSent(obj)
        (array_sent, sample_w) = word2vec(original_sentence, word2index_dict)
        X.append(array_sent)
        sample_weight.append(sample_w)
        (y_l,l) = label_compress(original_sentence, compression_sentence)
        y.append(y_l)
        len_sent_array.append(l)
        i+=1
        if i%100==0:
            sys.stdout.write('.')
        #get text array:
        original_sentence_array.append(original_sentence)
        compression_sentence_array.append(compression_sentence)
    split_idx = int(len(X) * test_split)
    train = (X[split_idx:], y[split_idx:],
             len_sent_array[split_idx:], sample_weight[split_idx:])
    test = (X[:split_idx], y[:split_idx],
            len_sent_array[:split_idx], sample_weight[:split_idx])
    return (train, test, (original_sentence_array, compression_sentence_array))
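A minimal usage sketch (the file path, split ratio, and vocabulary size below are assumptions, and read_json_file and the other helpers must come from the surrounding module):

# Hypothetical call: 20% of the sentences go to the test portion.
train, test, texts = load_data_from_json2('data/compression.json', 0.2, 10000)
X_train, y_train, train_lens, train_weights = train
X_test, y_test, test_lens, test_weights = test
original_sents, compressed_sents = texts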
corpus.py (project: minke, author: DistrictDataLabs)
def feeds(self):
        """
        Opens and returns the collection of feeds associated with the corpus.
        """
        data = self.open('feeds.json')
        return json.load(data)
corpus.py (project: minke, author: DistrictDataLabs)
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and returns a dictionary with a
        variety of metrics concerning the state of the corpus.
        """
        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()
        started = time.time()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in self._sent_tokenizer.tokenize(para):
                counts['sents'] += 1

                for word in self._word_tokenizer.tokenize(sent):
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        n_fileids = len(self._resolve(fileids, categories) or self.fileids())
        n_topics  = len(self.categories(self._resolve(fileids, categories)))

        # Return data structure with information
        return {
            'files':  n_fileids,
            'topics': n_topics,
            'paras':  counts['paras'],
            'sents':  counts['sents'],
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
            'ppdoc':  float(counts['paras']) / float(n_fileids),
            'sppar':  float(counts['sents']) / float(counts['paras']),
            'secs':   time.time() - started,
        }
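A sketch of calling this method, assuming `corpus` is an instance of the reader class this snippet comes from (hypothetical):

stats = corpus.describe()
print("%(files)d files, %(words)d words, lexical diversity %(lexdiv).2f" % stats)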
corpus.py (project: minke, author: DistrictDataLabs)
def html(self, fileids=None, categories=None):
        """
        The preprocessed pickles do not contain HTML data.
        """
        raise TypeError(
            "Preprocessed corpus does not contain HTML data."
        )
utils.py (project: EventMiner, author: hltcoe)
def prep_data(data):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sents = sent_detector.tokenize(data['content'].strip())
    sent_dict = {str(uuid.uuid4()): {'text': x} for x in sents[:2]}
    data['sents'] = sent_dict

    return data
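For illustration, a minimal input/output sketch (the article text is made up, and the punkt model must be installed, e.g. via nltk.download('punkt')):

article = {'content': "First sentence. Second sentence. Third sentence."}
result = prep_data(article)
# result['sents'] now maps fresh uuid4 strings to the first two sentences only,
# e.g. {'3f2a...': {'text': 'First sentence.'}, '9b1c...': {'text': 'Second sentence.'}}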
grammar.py (project: Price-Comparator, author: Thejas-1)
def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print(g)
    print()
childes_fixt.py (project: Price-Comparator, author: Thejas-1)
def setup_module(module):
    from nose import SkipTest
    import nltk.data
    try:
        nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/')
    except LookupError as e:
        print(e)
        raise SkipTest("The CHILDES corpus is not found. "
                       "It should be manually downloaded and saved/unpacked "
                       "to [NLTK_Data_Dir]/corpora/childes/")
test_corpus_views.py (project: Price-Comparator, author: Thejas-1)
def data(self):
        for name in self.names:
            f = nltk.data.find(name)
            with f.open() as fp:
                file_data = fp.read().decode('utf8')
            yield f, file_data
test_corpus_views.py (project: Price-Comparator, author: Thejas-1)
def test_correct_length(self):
        # Check that the corpus views report the correct lengths:

        for f, file_data in self.data():
            v = StreamBackedCorpusView(f, read_whitespace_block)
            self.assertEqual(len(v), len(file_data.split()))

            v = StreamBackedCorpusView(f, read_line_block)
            self.assertEqual(len(v), len(self.linetok.tokenize(file_data)))
chat80.py (project: Price-Comparator, author: Thejas-1)
def augment(self, data):
        """
        Add more data to the ``Concept``'s extension set.

        :param data: a new semantic value
        :type data: string or pair of strings
        :rtype: set

        """
        self._extension.add(data)
        self.extension = sorted(list(self._extension))
        return self._extension
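A small usage sketch, assuming `c` is a `Concept` holding country data (the values are hypothetical):

c.augment('greece')              # a single semantic value
c.augment(('athens', 'greece'))  # or a pair of strings for a binary relation
print(c.extension)               # the sorted list view of the underlying set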
chat80.py (project: Price-Comparator, author: Thejas-1)
def _str2records(filename, rel):
    """
    Read a file into memory and convert each relation clause into a list.
    """
    recs = []
    contents = nltk.data.load("corpora/chat80/%s" % filename, format="text")
    for line in contents.splitlines():
        if line.startswith(rel):
            line = re.sub(rel+r'\(', '', line)
            line = re.sub(r'\)\.$', '', line)
            record = line.split(',')
            recs.append(record)
    return recs
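For example, assuming a chat80 data file contains a clause such as city(athens,greece,1368)., the call below would strip the city( prefix and the ). suffix and split the remainder on commas (the file and relation names are assumptions):

recs = _str2records('cities.pl', 'city')
# each record looks like ['athens', 'greece', '1368']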
chat80.py (project: Price-Comparator, author: Thejas-1)
def process_bundle(rels):
    """
    Given a list of relation metadata bundles, make a corresponding
    dictionary of concepts, indexed by the relation name.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list(dict)
    :return: a dictionary of concepts, indexed by the relation name.
    :rtype: dict(str): Concept 
    """
    concepts = {}
    for rel in rels:
        rel_name = rel['rel_name']
        closures = rel['closures']
        schema = rel['schema']
        filename = rel['filename']

        concept_list = clause2concepts(filename, rel_name, schema, closures)
        for c in concept_list:
            label = c.prefLabel
            if (label in concepts):
                for data in c.extension:
                    concepts[label].augment(data)
                concepts[label].close()
            else:
                concepts[label] = c
    return concepts
chat80.py (project: Price-Comparator, author: Thejas-1)
def val_load(db):
    """
    Load a ``Valuation`` from a persistent database.

    :param db: name of file from which data is read.
               The suffix '.db' should be omitted from the name.
    :type db: str
    """
    dbname = db+".db"

    if not os.access(dbname, os.R_OK):
        sys.exit("Cannot read file: %s" % dbname)
    else:
        db_in = shelve.open(db)
        from nltk.sem import Valuation
        val = Valuation(db_in)
#        val.read(db_in.items())
        return val


#def alpha(str):
    #"""
    #Utility to filter out non-alphabetic constants.

    #:param str: candidate constant
    #:type str: string
    #:rtype: bool
    #"""
    #try:
        #int(str)
        #return False
    #except ValueError:
        ## some unknown values in records are labeled '?'
        #if not str == '?':
            #return True
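A hedged usage sketch of val_load, assuming a shelve database world.db was written earlier by a companion dump step (the name is hypothetical):

val = val_load('world')  # pass the name without the '.db' suffix
print(len(val))          # a Valuation behaves like a mapping of symbols to values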
train_text8model.py (project: w2vec-similarity, author: jayantj)
def tokenize_sentences(text):
  import nltk.data
  sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
  return sent_tokenizer.tokenize(text)
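A quick usage example (requires the punkt model, e.g. via nltk.download('punkt'); the sample text is made up):

text = "NLTK ships a pretrained Punkt model. It splits text into sentences."
print(tokenize_sentences(text))
# ['NLTK ships a pretrained Punkt model.', 'It splits text into sentences.']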
Word2Vec.py (project: Humour-Detection, author: srishti-1795)
import pickle

def readFileOfReviews():
    # Read each review from file into the global reviews list
    global reviewsLst
    with open("data.txt", "rb") as preview:
        reviewsLst = pickle.load(preview)
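As a hedged sketch, data.txt could have been produced by a matching writer such as the hypothetical helper below:

def writeFileOfReviews(reviews):
    # Hypothetical counterpart: pickle the review list to data.txt.
    with open("data.txt", "wb") as out:
        pickle.dump(reviews, out)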
data_helpers.py (project: DNN-Sentiment, author: awjuliani)
def load_data():
    """
    Loads and preprocesses data for the MR dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]
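A minimal sketch of unpacking the returned list (the printout is only illustrative):

x, y, vocabulary, vocabulary_inv = load_data()
print(len(x), len(y), len(vocabulary))  # dataset size and vocabulary size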
grammar.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print(g)
    print()
childes_fixt.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def setup_module(module):
    from nose import SkipTest
    import nltk.data
    try:
        nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/')
    except LookupError as e:
        print(e)
        raise SkipTest("The CHILDES corpus is not found. "
                       "It should be manually downloaded and saved/unpacked "
                       "to [NLTK_Data_Dir]/corpora/childes/")
test_corpus_views.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def data(self):
        for name in self.names:
            f = nltk.data.find(name)
            with f.open() as fp:
                file_data = fp.read().decode('utf8')
            yield f, file_data
test_corpus_views.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_correct_length(self):
        # Check that the corpus views report the correct lengths:

        for f, file_data in self.data():
            v = StreamBackedCorpusView(f, read_whitespace_block)
            self.assertEqual(len(v), len(file_data.split()))

            v = StreamBackedCorpusView(f, read_line_block)
            self.assertEqual(len(v), len(self.linetok.tokenize(file_data)))

