def tokenize_text(sample_text):
    """Clean, tokenize and (optionally) stopword-filter a raw text sample."""
    global sequence_lengths
    processed_text = []
    if cfg.remove_punctuation:
        cleaned = sample_text.lower().translate(t_table)
    else:
        cleaned = sample_text
    if cfg.use_casual_tokenizer:
        tokens = tknzr.tokenize(cleaned)
    else:
        tokens = nltk.word_tokenize(cleaned, language='english')
    if cfg.remove_stopwords:
        stop_set = set(stopwords.words('english'))  # build the set once, not per token
        tokens = [w for w in tokens if w not in stop_set]
    sequence_lengths.append(len(tokens))
    processed_text.extend(tokens)
    return processed_text
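The function above reads several module-level objects that are not shown in this excerpt. A minimal sketch of how they might be wired up, assuming a simple namespace-style config; the names cfg, t_table, tknzr and sequence_lengths come from the code, but the concrete values are illustrative assumptions:

# Hypothetical setup for the globals tokenize_text expects.
import string
from types import SimpleNamespace

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

cfg = SimpleNamespace(remove_punctuation=True,
                      use_casual_tokenizer=True,
                      remove_stopwords=False)
t_table = str.maketrans('', '', string.punctuation)  # translation table that drops punctuation
tknzr = TweetTokenizer()                             # the "casual" tokenizer
sequence_lengths = []                                # filled in as texts are tokenized

print(tokenize_text("Hello there, world! This is a quick test."))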
Python tokenize() usage examples (source code)
def demo_sent_subjectivity(text):
    """
    Classify a single sentence as subjective or objective using a stored
    SentimentAnalyzer.

    :param text: a sentence whose subjectivity has to be classified.
    """
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import regexp
    word_tokenizer = regexp.WhitespaceTokenizer()
    try:
        sentim_analyzer = load('sa_subjectivity.pickle')
    except LookupError:
        print('Cannot find the sentiment analyzer you want to load.')
        print('Training a new one using NaiveBayesClassifier.')
        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    # Tokenize and convert to lower case
    tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
    print(sentim_analyzer.classify(tokenized_text))
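Assuming the surrounding module provides load (i.e. nltk.data.load) and demo_subjectivity, as in NLTK's sentiment utilities, a call looks like this; the first run may train a classifier and needs the subjectivity corpus:

import nltk
nltk.download('subjectivity')  # needed only if a new analyzer has to be trained

demo_sent_subjectivity("The plot was gripping and the acting felt sincere.")
# prints the predicted label, typically 'subj' or 'obj'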
def get_sentences(text=''):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenizer.tokenize(text)
    return sentences
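get_sentences loads the Punkt model on every call, so the model must be available locally; a short usage example:

import nltk
nltk.download('punkt')  # one-time download of the Punkt sentence model

for sent in get_sentences("Dr. Smith went to Washington. He arrived on Monday."):
    print(sent)
# -> Dr. Smith went to Washington.
# -> He arrived on Monday.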
def get_input_sequence(sentence):
    """
    Prepare chatbot's input by tokenizing the sentence and adding the necessary punctuation marks.

    Input:  "So what's up, buddy"
    Output: ["so", "what", "'", "s", "up", ",", "buddy", ".", "$$$"]
    """
    if not sentence:
        return [START_TOKEN, EOS_SYMBOL]
    # Add a dot to the end of the sentence in case there is no punctuation mark.
    if sentence[-1] not in _PUNKT_MARKS:
        sentence += '.'
    sequence = [START_TOKEN] + tokenize(sentence) + [EOS_SYMBOL]
    return sequence
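The constants START_TOKEN, EOS_SYMBOL, _PUNKT_MARKS and the tokenize helper belong to the surrounding chatbot module; the stand-ins below are assumptions chosen only to make the function callable:

# Assumed stand-ins for illustration; the real project defines its own symbols
# and tokenizer (the docstring suggests a lowercasing tokenizer and '$$$').
from nltk.tokenize import word_tokenize as tokenize

START_TOKEN = '<s>'    # assumed start-of-sequence marker
EOS_SYMBOL = '$$$'     # assumed end-of-sequence marker
_PUNKT_MARKS = {'.', '!', '?'}

print(get_input_sequence("So what's up, buddy"))
# e.g. ['<s>', 'So', 'what', "'s", 'up', ',', 'buddy', '.', '$$$']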
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using the Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1)   # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)   # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    else:
        print('Neutral')

    if plot:
        _show_plot(x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive'])
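A quick invocation (the opinion lexicon has to be downloaded once; passing plot=True additionally relies on the module's _show_plot helper and matplotlib):

import nltk
nltk.download('opinion_lexicon')  # Liu & Hu positive/negative word lists

demo_liu_hu_lexicon("The food was great but the service was terribly slow")
# prints 'Positive', 'Negative' or 'Neutral' depending on the word counts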
def handle_multiple_sentences(infile, outfile):
    # Python 2 code: note the str.decode() call and the "print >>" redirection below.
    titles = []
    f = open(infile, "r")
    f2 = codecs.open(outfile, "w+", "utf-8")
    for line in f:
        line = line.decode("utf-8")
        sentences = sent_detector.tokenize(line.strip())
        for i in range(len(sentences)):
            # Capitalize the first word of every sentence; for all but the first
            # sentence, also mark the end of the previous one with " ::::".
            sentences[i] = sentences[i].replace(sentences[i].split()[0], sentences[i].split()[0].title())
            if i > 0:
                sentences[i - 1] = sentences[i - 1].replace(sentences[i - 1].split()[-1][-1], " ::::")
        titles.append(" ".join(sentences))
    title_set = set(titles)
    for l in title_set:
        print >> f2, l
Source: preprocessed_data.py, project diversity_based_attention (author: PrekshaNema25)
def preprocess(s, max_tokens):
    # s = unicode(s, ignore="errors")
    s = s.lower()
    s = re.sub(r'[^\x00-\x7F]+', ' ', s)  # drop non-ASCII characters
    s = re.sub("<s>", "", s)
    s = re.sub("<eos>", "", s)
    s = remove_punctuation(s)
    s = re.sub(r'\d', '#', s)             # mask digits
    s = re.sub(r'\n', ' ', s)
    s = re.sub(',', ' ', s)
    tokens = WhitespaceTokenizer().tokenize(s)
    # s = replace_the_unfrequent(tokens)
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
    s = " ".join(tokens)
    return s, len(tokens)
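preprocess expects a remove_punctuation helper and a WhitespaceTokenizer import from the enclosing module; a hedged stand-in plus a sample call:

# Assumed imports and helper for illustration only; the project ships its own
# remove_punctuation.
import re
import string
from nltk.tokenize import WhitespaceTokenizer

def remove_punctuation(text):
    # Plausible stand-in: strip ASCII punctuation characters.
    return text.translate(str.maketrans('', '', string.punctuation))

cleaned, n_tokens = preprocess("<s> The 3 quick brown foxes, obviously! <eos>", max_tokens=50)
print(cleaned, n_tokens)  # e.g. "the # quick brown foxes obviously" 6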
def analysis(reviews_collection_text):
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        raw_data = f.read()
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        comments = f.readlines()
    data = raw_data.replace('\n', ' ')
    data_lower = data.lower()
    tokens_with_punc = word_tokenize(data_lower)
    tokens = RegexpTokenizer(r'\w+').tokenize(data_lower)
    print("--- Most frequent tokens ---\n",
          FreqDist(tokens_with_punc).most_common(15))
    print("--- Tokens without punctuation ---\n",
          FreqDist(tokens).most_common(15))
    stop = set(stopwords.words('english'))
    words = [word for word in tokens if word not in stop]
    print("--- Most frequent words ---\n", FreqDist(words).most_common(15))
    tagged = pos_tag(words)
    nouns = [word for word, pos in tagged if pos == 'NN']
    print("--- Most frequent nouns ---\n", FreqDist(nouns).most_common(15))
    adjts = [word for word, pos in tagged if pos == 'JJ']
    print("--- Most frequent adjective ---\n", FreqDist(adjts).most_common(15))
    tokns = [RegexpTokenizer(r'\w+').tokenize(comment) for comment in comments]
    lxdst = [lexical_density(token) for token in tokns if len(token) > 0]
    avgld = sum(lxdst) / len(comments)
    print("--- Average lexical density ---\n", avgld)
def parts_of_speechtag(self, sentences=""):
    from nltk.corpus import state_union                # already-stored data to train on
    from nltk.tokenize import PunktSentenceTokenizer   # the trainable Punkt sentence tokenizer
    training_text = state_union.raw("2005-GWBUSH.txt")  # training set from the State of the Union corpus
    sample_text = sentences
    # Train the Punkt tokenizer on the training text (unsupervised learning).
    custom_sentence_tokenized = PunktSentenceTokenizer(train_text=training_text)
    # Tokenize the sample text with the freshly trained model.
    tokenization_unsupervised = custom_sentence_tokenized.tokenize(str(sample_text))
    # print(tokenization_unsupervised)        # just for debugging purposes
    # print(type(tokenization_unsupervised))  # checking the type of the sentences
    # Hand the tokenized sentences over to the POS-tagging step.
    self.processing_POS_tokenization(tokenization_unsupervised=tokenization_unsupervised)
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
    tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)
    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes, i.e. no downloads.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)
    # http://www.nltk.org/book/ch05.html
def normalize(self, text):
    return [self.stemmer.stem(token)
            for token in self.tokenizer.tokenize(text.lower())
            if token not in self.stop_words]

######### defining a default normalizer ##########
def normalize(self, text):
    return [token for token in self.tokenizer.tokenize(text.lower())
            if token not in self.stop_words]
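Both normalize variants assume an object that carries tokenizer, stemmer and stop_words attributes; a minimal wrapper class showing that wiring (the class name and attribute choices are assumptions, and the stopwords corpus must be downloaded):

# Hypothetical wrapper illustrating the attributes the normalize methods expect.
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TreebankWordTokenizer

class Normalizer:
    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()
        self.stemmer = SnowballStemmer('english')
        self.stop_words = set(stopwords.words('english'))

    def normalize(self, text):
        # Stemming variant, as in the first definition above.
        return [self.stemmer.stem(token)
                for token in self.tokenizer.tokenize(text.lower())
                if token not in self.stop_words]

print(Normalizer().normalize("The cats are chasing the mice."))
# e.g. ['cat', 'chase', 'mice', '.']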
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
def get_tweet_tags(tweet):
    """ Break up a tweet into individual word parts """
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    # replace handles with real names
    for n, tok in enumerate(tokens):
        if tok.startswith('@'):
            handle = tok.strip("@")
            if handle in user.students:
                # If we have a database entry for the mentioned user, we can
                # easily substitute a full name.
                usr = user.NPUser(handle)
                tokens[n] = usr.fullname
            else:
                # If there is no database entry, we use the user's alias. While
                # this is the full name in many cases, it is often not reliable.
                usr = api.get_user(handle)
                tokens[n] = usr.name
    tagged = nltk.pos_tag(tokens)
    # In nltk, if a teacher's name is written with a period after an
    # abbreviated prefix, it is awkwardly broken up into 3 tags.
    for n, tag in enumerate(tagged):
        # If there is the weird period after the prefix,
        if tag[1] == '.':
            # and it is in fact splitting up a person's name,
            if tagged[n - 1][1] == 'NNP' and tagged[n + 1][1] == 'NNP':
                if tagged[n - 1][0] in ['Mr', 'Ms', 'Mrs', 'Mx']:
                    # combine it into the actual name,
                    tagged[n - 1] = ('{}. {}'.format(tagged[n - 1][0],
                                                     tagged[n + 1][0]), 'NNP')
                    # and then remove the extra tags.
                    del tagged[n + 1]
                    del tagged[n]
    return tagged
def tokenize(data):
    sent_tokenize = nltk.tokenize.sent_tokenize
    tokenizer = nltk.tokenize.RegexpTokenizer(u"[\s\.,-?!'\"??\d·•—()׫»%\[\]|?*]+", gaps=True)
    word_tokenize = tokenizer.tokenize
    for text, blockname, textname in data:
        sentences = sent_tokenize(text.strip())
        for sentence in sentences:
            words = word_tokenize(sentence)
            for word in words:
                if len(word) > 1:
                    yield (word, sentence, blockname, textname)
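The generator consumes (text, blockname, textname) triples and yields one tuple per token longer than one character (sentence splitting relies on NLTK's punkt model); a small driving example:

data = [
    ("First sentence here. And then a second one follows.", "block-1", "doc-a"),
]
for word, sentence, blockname, textname in tokenize(data):
    print(word, '|', blockname, textname)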
def tokenize(self, text):
    """
    Tokenize text into a list of Token objects.

    :param text: text to be tokenized (might contain several sentences)
    :type text: str
    :return: list of Token objects
    :rtype: list(Token)
    """
    tokens = []
    if self.tokenizer_type == "SpaceTokenizer":
        operator = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
        for counter, span in enumerate(operator.span_tokenize(text)):
            new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
            tokens.append(new_token)
    elif self.tokenizer_type == "NLTKWhiteSpaceTokenizer":
        operator = WhitespaceTokenizer()
        for counter, span in enumerate(operator.span_tokenize(text)):
            new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
            tokens.append(new_token)
    elif self.tokenizer_type == "PTBTokenizer":
        ptb_tokens = word_tokenize(text)
        counter = 0
        for token, span in self._penn_treebank_tokens_with_spans(text, ptb_tokens):
            new_token = Token(counter, token, span[0], span[1])
            counter += 1
            tokens.append(new_token)
    return tokens
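The Token objects built here are constructed with an index, the surface form and its character span; a minimal stand-in consistent with those constructor calls (the real class in the project likely carries more fields):

# Assumed minimal Token definition: Token(index, text, start_offset, end_offset).
class Token:
    def __init__(self, index, text, start, end):
        self.index = index
        self.text = text
        self.start = start
        self.end = end

    def __repr__(self):
        return "Token(%d, %r, %d, %d)" % (self.index, self.text, self.start, self.end)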
def __tokenizeWords(sentence):
    return nltk.tokenize.word_tokenize(sentence)

## tests ########################################################################################
def __init__(self, itemId, questionType, answerType, question, answer, V, WordIDMap):
    self.itemId = itemId
    self.questionType = questionType
    self.answerType = answerType
    self.question = question
    self.answer = answer
    self.Question = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(question)
                     if stemmer.stem(word) in WordIDMap]
    self.Answer = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(answer)
                   if stemmer.stem(word) in WordIDMap]
    self.qFeature = {}
    self.aFeature = {}
    self.create_QAFeature()
def __init__(self, itemId, Review, V, WordIDMap, ReviewObj):
    self.itemId = itemId
    self.sent = Review
    self.rObj = ReviewObj
    self.Sent = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(Review)
                 if stemmer.stem(word) in WordIDMap]
    self.sFeature = {}
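Both constructors lean on module-level stemmer and tokenizer objects plus a word-to-id map built elsewhere; an assumed setup for illustration:

# Assumed module-level objects used by the constructors above.
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+')
# WordIDMap maps stemmed words to integer ids, e.g. built from a training vocabulary:
# WordIDMap = {stemmer.stem(w): i for i, w in enumerate(vocabulary)}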
def sents(self, fileids=None, categories=None):
    """
    Uses the built-in sentence tokenizer to extract sentences from the
    paragraphs. Note that this method uses BeautifulSoup to parse HTML.
    """
    for paragraph in self.paras(fileids, categories):
        for sentence in self._sent_tokenizer.tokenize(paragraph):
            yield sentence

def words(self, fileids=None, categories=None):
    """
    Uses the built-in word tokenizer to extract tokens from sentences.
    Note that this method uses BeautifulSoup to parse HTML content.
    """
    for sentence in self.sents(fileids, categories):
        for token in self._word_tokenizer.tokenize(sentence):
            yield token
def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and returns a dictionary with a
    variety of metrics concerning the state of the corpus.
    """
    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()
    started = time.time()

    # Perform a single pass over paragraphs: tokenize and count.
    for para in self.paras(fileids, categories):
        counts['paras'] += 1
        for sent in self._sent_tokenizer.tokenize(para):
            counts['sents'] += 1
            for word in self._word_tokenizer.tokenize(sent):
                counts['words'] += 1
                tokens[word] += 1

    # Compute the number of files and categories in the corpus.
    n_fileids = len(self._resolve(fileids, categories) or self.fileids())
    n_topics = len(self.categories(self._resolve(fileids, categories)))

    # Return a data structure with the information.
    return {
        'files': n_fileids,
        'topics': n_topics,
        'paras': counts['paras'],
        'sents': counts['sents'],
        'words': counts['words'],
        'vocab': len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
        'ppdoc': float(counts['paras']) / float(n_fileids),
        'sppar': float(counts['sents']) / float(counts['paras']),
        'secs': time.time() - started,
    }