Example source code for Python's word_tokenize()

textcat.py (project: Price-Comparator, author: Thejas-1)
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist, trigrams

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint
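A minimal standalone sketch of the same character-trigram fingerprint idea using plain NLTK; the function name and the start/end markers below are illustrative, not taken from the project:

from nltk import FreqDist, trigrams, word_tokenize

def trigram_fingerprint(text, start_char="<", end_char=">"):
    # FreqDist is a Counter subclass, so missing keys default to 0
    fingerprint = FreqDist()
    for token in word_tokenize(text):
        for tri in trigrams(start_char + token + end_char):
            fingerprint[''.join(tri)] += 1
    return fingerprint

# print(trigram_fingerprint("the quick brown fox").most_common(5))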
markov2.py (project: markov_bot, author: 18F)
def train(self, chain_len = None):
        """ Trains the markov data structure by creating chains of desired length """
        if not chain_len:
            chain_len = self.CHAIN_LENGTH

        self.CHAIN_LEN = chain_len

        self.everything['corpus'] = {}
        self.corpus = self.everything['corpus']

        for f in self.everything['input']:
            for line in sent_tokenize( self.everything['input'][f] ):
                words = word_tokenize(line)

                for chain in self._make_chains(words):
                    k = " ".join( chain[:-1] ) # key is everything but last word
                    v = chain[-1] # value is last word

                    try:
                        self.corpus[k].append(v)
                    except KeyError:
                        self.corpus[k] = [v]
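The method relies on a _make_chains helper that is not shown in this excerpt; a plausible sliding-window version, written here purely as an illustration, could look like this:

def _make_chains(self, words):
    # Hypothetical helper: yield windows of CHAIN_LEN consecutive words;
    # the caller keys on all but the last word and stores the last word as the value.
    for i in range(len(words) - self.CHAIN_LEN + 1):
        yield words[i:i + self.CHAIN_LEN]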
gender.py (project: atap, author: foxbook)
def parse_gender(text):

    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]

    sents, words = count_gender(sentences)
    total = sum(words.values())

    for gender, count in words.items():
        pcent = (count / total) * 100
        nsents = sents[gender]

        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
        )
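count_gender is defined elsewhere in the book chapter; a simplified illustrative stand-in (the word sets and classification logic here are assumptions, and the real lists are far longer) might look like this:

from collections import Counter

MALE_WORDS = {"he", "him", "his", "himself"}
FEMALE_WORDS = {"she", "her", "hers", "herself"}

def count_gender(sentences):
    # Tally how many sentences and how many words fall into each gender class.
    sents, words = Counter(), Counter()
    for sentence in sentences:
        tokens = set(sentence)
        if tokens & MALE_WORDS and tokens & FEMALE_WORDS:
            gender = "both"
        elif tokens & MALE_WORDS:
            gender = "male"
        elif tokens & FEMALE_WORDS:
            gender = "female"
        else:
            gender = "unknown"
        sents[gender] += 1
        words[gender] += len(sentence)
    return sents, words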
dataset.py (project: unsupervised-treelstm, author: jihunchoi)
def _convert_obj(self, obj):
        pre_sentence = obj['sentence1']
        hyp_sentence = obj['sentence2']
        if self.lower:
            pre_sentence = pre_sentence.lower()
            hyp_sentence = hyp_sentence.lower()
        pre_words = word_tokenize(pre_sentence)
        hyp_words = word_tokenize(hyp_sentence)
        pre = [self.word_vocab.word_to_id(w) for w in pre_words]
        hyp = [self.word_vocab.word_to_id(w) for w in hyp_words]
        pre_length = len(pre)
        hyp_length = len(hyp)
        label = obj['gold_label']
        if len(pre) > self._max_length or len(hyp) > self._max_length:
            return None
        if label == '-':
            return None
        label = self.label_vocab.word_to_id(label)
        return pre, hyp, pre_length, hyp_length, label
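For context, each obj is one SNLI-style JSON record; a minimal illustrative example (the sentences are invented) would be:

example_obj = {
    "sentence1": "A man is playing a guitar.",    # premise
    "sentence2": "A person plays an instrument.",  # hypothesis
    "gold_label": "entailment",  # '-' marks records without annotator consensus
}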
get_texts.py (project: RealEstateTelegramBot, author: PeterZhizhin)
def tokenize_me(file_text):
    # first, apply NLTK word tokenization
    tokens = nltk.word_tokenize(file_text)

    # drop punctuation tokens
    tokens = [i for i in tokens if i not in string.punctuation]

    # drop stop words
    tokens = [i for i in tokens if i not in stop_words]

    # strip « » quotation marks from words
    tokens = [i.replace("«", "").replace("»", "") for i in tokens]

    tokens = [stemmer.stem(i) for i in tokens]

    return set(tokens)
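The function depends on module-level stop_words and stemmer objects that are not shown; given the « » stripping, a plausible setup (the choice of Russian is an assumption) would be:

import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

stop_words = stopwords.words('russian')  # assumed language
stemmer = SnowballStemmer('russian')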
train.py (project: poetic-inner-join, author: emdaniels)
def tokenize_sentences(self):
        # tokenize the sentences into words and count the word frequencies
        # get most common words, build index_to_word and word_to_index vectors
        self.tokenized_sentences = [nltk.word_tokenize(sent) for sent in
                                    self.sentences]
        word_freq = nltk.FreqDist(itertools.chain(*self.tokenized_sentences))
        print("Found %d unique word tokens." % len(word_freq.items()))

        vocab = word_freq.most_common(self.vocabulary_size - 1)
        self.index_to_word = [x[0] for x in vocab]
        self.index_to_word.append(self.unknown_token)
        self.word_to_index = dict(
            [(w, i) for i, w in enumerate(self.index_to_word)])

        print("Using vocabulary size %d." % self.vocabulary_size)
        print(
            "The least frequent word is '%s' appearing %d times." % (
            vocab[-1][0], vocab[-1][1]))

        # replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(self.tokenized_sentences):
            self.tokenized_sentences[i] = [
                w if w in self.word_to_index else self.unknown_token for w in
                sent]
Features5.py (project: kaggle, author: rbauld)
def sent2vec(s):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if w not in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except KeyError:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())
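sent2vec assumes module-level stop_words and an embedding model with dict-style lookup; a plausible setup (the model choice and file name below are placeholders, not the project's actual values) would be:

import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import KeyedVectors

stop_words = set(stopwords.words('english'))
# Any word-vector model supporting model[word] works; the path is a placeholder.
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)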

###############################################################################
# Train
reader.py (project: YelpDataChallenge, author: fujunswufe)
def get_review_sentences():
    '''
    Read the Yelp reviews and return the sentences after sentence segmentation
    :return:
    '''
    review_file = io.open(FULL_YELP_REVIEW_PATH, 'r', encoding='utf-8')
    count_sentence = 0
    sentences = []

    for line in review_file:
        json_review = json.loads(line.strip())
        text = json_review.get("text").replace('\n','').lower()

        raw_sentences = sent_tokenize(text)
        for raw_sentence in raw_sentences:
            if len(raw_sentence.strip()) > 0:
                sent_tokens = word_tokenize(raw_sentence)
                sentences.append(sent_tokens)
    return sentences
ffn.py (project: Humour-Detection, author: srishti-1795)
def createTrainingList(reviewLst):
    sds = SupervisedDataSet(100,1)
    for review in reviewLst:
        revString = unicode(review[1], errors='ignore')
        revSentences = nltk.word_tokenize(revString.strip())
        revWords = []
        for i in revSentences:
            revWords += i.lower().split()
        vec = 0
        for i in revWords:
            try:
                vec+=model[i]/2
            except KeyError:
                pass
        vec=vec/len(revWords)
        sds.addSample(vec,review[0])
    net = buildNetwork(100, 20, 1, hiddenclass=TanhLayer, outclass=SoftmaxLayer,bias=True)
    trainer = BackpropTrainer(net, sds)
    print "Error score:",trainer.train()
    print trainer.trainUntilConvergence(verbose=True,maxEpochs=100)
ngrams.py (project: newspapers, author: dhh16)
def token_func(input_string):
    tokens = nltk.word_tokenize(input_string)
    long_tokens = []
    refined_tokens = []
    # lemmatized_tokens = []
    stopwordlist = get_stopwordlist("../data/first_stopwordlist.txt")
    regex = re.compile('[^1-9a-zA-Z]')

    for token in tokens:
        token = regex.sub('', token)
        if len(token) > 3:
            long_tokens.append(token)
    lemmatized_tokens = dhh_preprocess_tools.hfst_words(long_tokens,
                                                        filter=('VERB',
                                                                'NOUN',
                                                                'ADJ',
                                                                'PROPN'))

    for token in lemmatized_tokens:
        token = token.lower()
        if token not in stopwordlist:
            refined_tokens.append(token)
    return refined_tokens
AKE.py (project: NLP-Keyword-Extraction-Ensemble-Method, author: Ashwin-Ravi)
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda wpc: wpc[2] != 'O') if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
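A quick hypothetical call (NLTK's punkt and averaged_perceptron_tagger data must be downloaded first); the sample sentence is made up:

sample = "Compatibility of systems of linear constraints over the set of natural numbers is studied."
print(extract_candidate_chunks(sample))  # noun-phrase candidates, e.g. 'linear constraints'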
textcat.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist, trigrams

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint
data.py (project: thesis, author: jonvet)
def txt_to_sent(sentences, word_vec, tokenize=True):

    if tokenize:
        sentences = [['<s>'] + nltk.word_tokenize(s) + ['</s>'] for s in sentences]
    else:
        sentences = [['<s>'] + s.split() + ['</s>'] for s in sentences]
    n_w = np.sum([len(x) for x in sentences])

    # filters words without glove vectors
    for i in range(len(sentences)):
        s_f = [word for word in sentences[i] if word in word_vec]
        if not s_f:
            import warnings
            warnings.warn('No words in "{0}" (idx={1}) have glove vectors. Replacing by "</s>"..'.format(sentences[i], i))
            s_f = ['</s>']
        sentences[i] = s_f

    lengths = np.array([len(s) for s in sentences])
    n_wk = np.sum(lengths)

    print('Nb words kept : {0}/{1} ({2} %)'.format(n_wk, n_w, round((100.0 * n_wk) / n_w, 2)))

    return sentences
language_model.py (project: facebook-message-analysis, author: szheng17)
def __init__(self, text):
        self.text = text
        self.tokens = nltk.word_tokenize(text)
        self.lowercase_tokens = [t.lower() for t in self.tokens]
        self.alpha_tokens = [t for t in self.lowercase_tokens if t.isalpha()]
sent-thoughts-parse.py (project: Deep-Learning-with-Keras, author: PacktPublishing)
def maybe_build_vocab(reuters_dir, vocab_file):
    vocab = collections.defaultdict(int)
    if os.path.exists(vocab_file):
        fvoc = open(vocab_file, "rb")
        for line in fvoc:
            word, idx = line.strip().split("\t")
            vocab[word] = int(idx)
        fvoc.close()
    else:
        counter = collections.Counter()
        num_docs_read = 0
        for doc in stream_reuters_documents(reuters_dir):
            if num_docs_read % 100 == 0:
                print("building vocab from {:d} docs"
                    .format(num_docs_read))
            topics = doc["topics"]
            if len(topics) == 0:
                continue
            title = doc["title"]
            body = doc["body"]
            title_body = ". ".join([title, body]).lower()
            for sent in nltk.sent_tokenize(title_body):
                for word in nltk.word_tokenize(sent):
                    counter[word] += 1
            num_docs_read += 1
        for i, c in enumerate(counter.most_common(VOCAB_SIZE)):
            vocab[c[0]] = i + 1
        print("vocab built from {:d} docs, complete"
            .format(num_docs_read))
        fvoc = open(vocab_file, "wb")
        for k in vocab.keys():
            fvoc.write("{:s}\t{:d}\n".format(k, vocab[k]))
        fvoc.close()
    return vocab
sent-thoughts-parse.py (project: Deep-Learning-with-Keras, author: PacktPublishing)
def build_numeric_text(vocab, text):
    wids = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            wids.append(vocab[word])
    return ",".join([str(x) for x in wids])


##################### main ######################
mem-network.py (project: Deep-Learning-with-Keras, author: PacktPublishing)
def get_maxlens(train_data, test_data):
    story_maxlen, question_maxlen = 0, 0
    for stories, questions, _ in [train_data, test_data]:
        for story in stories:
            story_len = 0
            for sent in story:
                swords = nltk.word_tokenize(sent)
                story_len += len(swords)
            if story_len > story_maxlen:
                story_maxlen = story_len
        for question in questions:
            question_len = len(nltk.word_tokenize(question))
            if question_len > question_maxlen:
                question_maxlen = question_len
    return story_maxlen, question_maxlen
mem-network.py (project: Deep-Learning-with-Keras, author: PacktPublishing)
def vectorize(data, word2idx, story_maxlen, question_maxlen):
    Xs, Xq, Y = [], [], []
    stories, questions, answers = data
    for story, question, answer in zip(stories, questions, answers):
        xs = [[word2idx[w.lower()] for w in nltk.word_tokenize(s)] 
                                   for s in story]
        xs = list(itertools.chain.from_iterable(xs))
        xq = [word2idx[w.lower()] for w in nltk.word_tokenize(question)]
        Xs.append(xs)
        Xq.append(xq)
        Y.append(word2idx[answer.lower()])
    return pad_sequences(Xs, maxlen=story_maxlen),\
           pad_sequences(Xq, maxlen=question_maxlen),\
           np_utils.to_categorical(Y, num_classes=len(word2idx))
QnARecurAtteLatest2GRU.py (project: recurrent-attention-for-QA-SQUAD-based-on-keras, author: wentaozhu)
def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.

    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [token.replace("``", '"').replace("''", '"') for token in nltk.word_tokenize(sent)]
QnARecurAtteLatest3Atten.py (project: recurrent-attention-for-QA-SQUAD-based-on-keras, author: wentaozhu)
def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.

    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [token.replace("``", '"').replace("''", '"') for token in nltk.word_tokenize(sent)]

