Example source code for Python's wordpunct_tokenize()
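
NLTK's wordpunct_tokenize (from nltk.tokenize) splits text with the regular expression \w+|[^\w\s]+, so runs of alphanumeric characters and runs of punctuation become separate tokens. A minimal sketch:

from nltk.tokenize import wordpunct_tokenize

# wordpunct_tokenize wraps WordPunctTokenizer, i.e. the regex \w+|[^\w\s]+
print(wordpunct_tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']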

neuagent.py (project: WebNav, author: nyu-dl)
def vis_att(pages_idx, query, alpha, wiki, vocab, idx):
    rows = [prm.root_page.title()]
    for pageidx in pages_idx[:-1]:
        if pageidx != -1:
            rows.append(wiki.get_article_title(pageidx).decode('utf-8', 'ignore').title())
        else:
            break
            #rows.append('Stop')

    rows = rows[::-1]

    columns = []
    for word in wordpunct_tokenize(query):
        if word.lower() in vocab:
            columns.append(str(word))
    columns = columns[:prm.max_words_query*prm.n_consec]

    alpha = alpha[:len(rows),:len(columns)]
    alpha = alpha[::-1]

    fig,ax=plt.subplots(figsize=(27,10))
    # Advanced color controls
    norm = matplotlib.colors.Normalize(0,1)
    im = ax.pcolor(alpha,cmap=plt.cm.gray,edgecolors='w',norm=norm)
    fig.colorbar(im)
    ax.set_xticks(np.arange(0,len(columns))+0.5)
    ax.set_yticks(np.arange(0,len(rows))+0.5)
    ax.tick_params(axis='x', which='minor', pad=15)
    # Here we position the tick labels for x and y axis
    ax.xaxis.tick_bottom()
    ax.yaxis.tick_left()
    ax.axis('tight') # correcting a pyplot bug that adds extra white columns.
    plt.xticks(rotation=90)
    fig.subplots_adjust(bottom=0.2)
    fig.subplots_adjust(left=0.2)
    # Values against each label
    ax.set_xticklabels(columns,minor=False,fontsize=18)
    ax.set_yticklabels(rows,minor=False,fontsize=18)
    plt.savefig('vis' + str(idx) + '.svg')
    plt.close()
utils.py (project: WebNav, author: nyu-dl)
def BOW2(texts, vocab, dim):
    '''
    Convert a list of texts to the BoW dense representation.
    '''
    out = np.zeros((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)
    for i, text in enumerate(texts):
        bow = BOW(wordpunct_tokenize(text), vocab)
        out[i,:len(bow[0])] = bow[0]
        mask[i,:len(bow[1])] = bow[1]

    return out, mask
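
A usage sketch for BOW2, assuming numpy is imported as np and that the project's BOW helper (not shown in this excerpt) returns a pair of index and count arrays; the stand-in BOW below is only illustrative:

import numpy as np
from collections import Counter
from nltk.tokenize import wordpunct_tokenize

def BOW(words, vocab):
    # Hypothetical stand-in for the project's BOW helper: vocabulary indexes
    # of the in-vocabulary words, paired with their counts.
    counts = Counter(w for w in words if w in vocab)
    idxs = np.array([vocab[w] for w in counts], dtype=np.int32)
    vals = np.array(list(counts.values()), dtype=np.float32)
    return idxs, vals

vocab = {'the': 0, 'cat': 1, 'sat': 2}           # toy vocabulary
out, mask = BOW2(['the cat sat .', 'the cat , the cat'], vocab, dim=3)
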
utils.py (project: WebNav, author: nyu-dl)
def Word2Vec_encode(texts, wemb):

    out = np.zeros((len(texts), prm.dim_emb), dtype=np.float32)
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)
        n = 0.
        for word in words:
            if word in wemb:
                out[i,:] += wemb[word]
                n += 1.
        out[i,:] /= max(1.,n)

    return out
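
Word2Vec_encode averages the embeddings of the tokens that appear in wemb. A self-contained sketch of the same averaging step, with the embedding dimension passed explicitly instead of read from prm.dim_emb (the function name here is my own):

import numpy as np
from nltk.tokenize import wordpunct_tokenize

def average_embedding(text, wemb, dim_emb):
    # Mean of the embeddings of the tokens found in wemb; zeros if none match.
    out = np.zeros(dim_emb, dtype=np.float32)
    n = 0.
    for word in wordpunct_tokenize(text):
        if word in wemb:
            out += wemb[word]
            n += 1.
    return out / max(1., n)

wemb = {'hello': np.ones(3, dtype=np.float32),   # toy 3-dimensional embeddings
        'world': np.zeros(3, dtype=np.float32)}
print(average_embedding('hello, world!', wemb, 3))   # [0.5 0.5 0.5]
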
rake.py (project: rake-nltk, author: csurfer)
def _generate_phrases(self, sentences):
        """Method to generate contender phrases given the sentences of the text
        document.

        :param sentences: List of strings where each string represents a
                          sentence which forms the text.
        :return: Set of string tuples where each tuple is a collection
                 of words forming a contender phrase.
        """
        phrase_list = set()
        # Create contender phrases from sentences.
        for sentence in sentences:
            word_list = [word.lower() for word in wordpunct_tokenize(sentence)]
            phrase_list.update(self._get_phrase_list_from_words(word_list))
        return phrase_list
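
_generate_phrases is an internal method of rake-nltk; typical use goes through the public Rake class instead. A sketch against the rake_nltk package (requires NLTK's stopwords and punkt data):

from rake_nltk import Rake

r = Rake()   # defaults to NLTK English stopwords and wordpunct_tokenize
r.extract_keywords_from_text('Keyword extraction with RAKE is quick to set up.')
print(r.get_ranked_phrases())   # contender phrases, highest RAKE score first
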
tree_chart.py (project: baal, author: braingineer)
def _on_start(self, utterance):
        # do all on start things
        # maybe clear all chart data structures
        # maybe clear agenda data structures
        self.agenda.clear()
        tokenized_utterance = tokenizer(utterance)
        self.utter_len = self.settings.utter_len = len(tokenized_utterance)
        self.left_buckets = [set() for _ in xrange(self.utter_len+1)]
        self.right_buckets = [set() for _ in xrange(self.utter_len+1)]
        self.initialize_agenda(tokenized_utterance)
        # Buckets are over dot indices, so there are utter_len+1 of them
        # self._print_buckets()
util.py (project: hate-to-hugs, author: sdoran35)
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
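
read_wordpunct_block is the block reader NLTK's corpus readers use: it pulls up to 20 lines from a stream and tokenizes them. A quick sketch with an in-memory stream, assuming the function above and its wordpunct_tokenize import:

import io

stream = io.StringIO(u'first line, with punctuation!\nsecond line.\n')
print(read_wordpunct_block(stream))
# ['first', 'line', ',', 'with', 'punctuation', '!', 'second', 'line', '.']
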
bytileAggregator.py (project: project-fortis, author: CatalystCode)
def score(self, sentence):
        # track both positive and negative scores for sentence
        pos_score, neg_score = 0., 0.
        # assuming no contextual forms are used for Arabic
        ensure_package_path()
        from nltk.tokenize import wordpunct_tokenize as tokenize
        tokens = tokenize(sentence.lower())
        term_count = 0
        # using nested while loops here to accommodate early termination of
        # the inner loop, and updating the index of the outer loop based on
        # the number of tokens used in the sub-phrase
        i = 0
        while i < len(tokens):
            matched = False
            j = min(self.max_len, len(tokens) - i)
            # check phrase lengths up to `max_len`
            while j > 0 and (i + j) <= len(tokens):
                sub_tokens = tokens[i : i + j]
                sub_word = ' '.join(sub_tokens)
                # if a match exists for the phrase, update scores and counts
                if sub_word in self.lookup:
                    sub_word_scores = self.lookup[sub_word]
                    pos_score += sub_word_scores[0]
                    neg_score += sub_word_scores[1]
                    term_count += 1
                    matched = True
                    i += j
                    break
                j -= 1
            # if not matched, skip token
            if not matched:
                i += 1
        # if no terms matched, or scores are equal, return a neutral score
        if pos_score == neg_score:
            return 0.5
        # if sentence is more positive than negative, use positive word sense
        elif pos_score > neg_score:
            return 0.5 + pos_score / term_count / 2 
        # if sentence is more negative than positive, use negative word sense
        else:
            return 0.5 - neg_score / term_count / 2
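
score walks the token list left to right and, at each position, greedily takes the longest phrase (up to self.max_len tokens) found in the sentiment lexicon self.lookup, then maps the accumulated scores into [0, 1] around a neutral 0.5. A self-contained sketch of the same matching loop, with the lexicon and maximum phrase length as plain arguments (names and lexicon values are illustrative):

from nltk.tokenize import wordpunct_tokenize

def phrase_sentiment(sentence, lookup, max_len):
    # lookup maps a (possibly multi-word) phrase to a (pos_score, neg_score) pair
    tokens = wordpunct_tokenize(sentence.lower())
    pos, neg, terms, i = 0., 0., 0, 0
    while i < len(tokens):
        for j in range(min(max_len, len(tokens) - i), 0, -1):
            phrase = ' '.join(tokens[i:i + j])
            if phrase in lookup:
                pos += lookup[phrase][0]
                neg += lookup[phrase][1]
                terms += 1
                i += j
                break
        else:
            i += 1   # no phrase matched at this position
    if terms == 0 or pos == neg:
        return 0.5
    return 0.5 + pos / terms / 2 if pos > neg else 0.5 - neg / terms / 2

lexicon = {'not bad': (0.6, 0.1), 'bad': (0.0, 0.8)}
print(phrase_sentiment('Not bad at all!', lexicon, max_len=2))   # 0.8
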
bytileAggregator.py (project: project-fortis, author: CatalystCode)
def create_keyword_regex(keyword):
    print 'create_keyword_regex'
    # import nltk
    ensure_package_path()
    from nltk.tokenize import wordpunct_tokenize as tokenize
    print 'tokenize ==> %s' % (keyword)
    tokens = tokenize(keyword)
    pattern = '\\s+'.join(tokens)
    pattern = '\\b%s\\b' % pattern
    print 'compile pattern ==> %s' % (pattern)
    return re.compile(pattern, re.I | re.UNICODE)
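
The compiled pattern matches the keyword's tokens in order, separated by any whitespace, at word boundaries and case-insensitively. A sketch that rebuilds the same pattern without the project-specific ensure_package_path helper:

import re
from nltk.tokenize import wordpunct_tokenize

tokens = wordpunct_tokenize('New York')
pattern = r'\b%s\b' % r'\s+'.join(tokens)        # \bNew\s+York\b
regex = re.compile(pattern, re.I | re.UNICODE)
print(bool(regex.search('flights to new   york tonight')))   # True
print(bool(regex.search('newark to yorkshire')))              # False
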
util.py (project: FancyWord, author: EastonLee)
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
preprocessing.py (project: idea_relations, author: Noahs-ARK)
def tokenize(text, filter_stopwords=False, lowercase=True):
    if lowercase:
        text = text.lower()
    words = wordpunct_tokenize(text)
    if filter_stopwords:
        words = [w for w in words if w not in STOPWORDS]
    return words
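
A usage sketch, assuming STOPWORDS is a set of lowercase stopwords such as NLTK's English list:

from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))   # assumed module-level constant
print(tokenize('the quick brown fox!', filter_stopwords=True))
# ['quick', 'brown', 'fox', '!']
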
utils.py (project: QueryReformulator, author: nyu-dl)
def BOW2(texts, vocab, dim):
    '''
    Convert a list of texts to the BoW dense representation.
    '''
    out = np.zeros((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)
    for i, text in enumerate(texts):
        bow = BOW(wordpunct_tokenize(text), vocab)
        out[i,:len(bow[0])] = bow[0]
        mask[i,:len(bow[1])] = bow[1]

    return out, mask
utils.py (project: QueryReformulator, author: nyu-dl)
def Word2Vec_encode(texts, wemb):

    out = np.zeros((len(texts), prm.dim_emb), dtype=np.float32)
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)
        n = 0.
        for word in words:
            if word in wemb:
                out[i,:] += wemb[word]
                n += 1.
        out[i,:] /= max(1.,n)

    return out
utils.py (project: QueryReformulator, author: nyu-dl)
def text2idx2(texts, vocab, dim, use_mask=False):
    '''
    Convert a list of texts to their corresponding vocabulary indexes.
    '''

    if use_mask:
        out = -np.ones((len(texts), dim), dtype=np.int32)
        mask = np.zeros((len(texts), dim), dtype=np.float32)
    else:
        out = -2 * np.ones((len(texts), dim), dtype=np.int32)

    out_lst = []
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)[:dim]

        for j, word in enumerate(words):
            if word in vocab:
                out[i,j] = vocab[word]
            else:
                out[i,j] = -1 # Unknown words

        out_lst.append(words)

        if use_mask:
            mask[i,:j] = 1.

    if use_mask:
        return out, mask, out_lst
    else:
        return out, out_lst
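
A usage sketch for text2idx2 with a toy vocabulary (assumes numpy imported as np, as in the rest of this module); -1 marks out-of-vocabulary words and -2 marks padding when no mask is requested:

vocab = {'where': 0, 'is': 1, 'the': 2}          # toy vocabulary
out, out_lst = text2idx2(['where is the library ?'], vocab, dim=6)
print(out)       # [[ 0  1  2 -1 -1 -2]]
print(out_lst)   # [['where', 'is', 'the', 'library', '?']]
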
util.py (project: beepboop, author: nicolehe)
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
util.py (project: kind2anki, author: prz3m)
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
util.py (project: but_sentiment, author: MixedEmotions)
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
rhyme.py (project: RapBattleAlexa, author: akashlevy)
def get_syllables(sonnet):

  from nltk.tokenize import wordpunct_tokenize
  tokens = [wordpunct_tokenize(s) for s in sonnet]
  punct = set(['.', ',', '!', ':', ';'])
  filtered = [ [w for w in sentence if w not in punct ] for sentence in tokens]
  last = [ sentence[len(sentence) - 1] for sentence in filtered]

  syllables = [[(word, len(pron), pron) for (word, pron) in cmu_dict if word == w] for w in last]
  return syllables
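
cmu_dict is not defined in this excerpt; it looks like NLTK's CMU Pronouncing Dictionary viewed as (word, pronunciation) pairs. A sketch of the same lookup with cmudict directly, counting syllables as the vowel phones (the ones carrying a stress digit):

from nltk.corpus import cmudict

pron = cmudict.dict()   # word -> list of pronunciations; needs nltk.download('cmudict')

def syllable_count(word):
    phones = pron.get(word.lower())
    if not phones:
        return None          # word not in the dictionary
    return sum(ph[-1].isdigit() for ph in phones[0])

print(syllable_count('poetry'))   # 3
print(syllable_count('rhyme'))    # 1
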
convert2idx.py (project: WebNav, author: nyu-dl)
def compute_idx(pages_path_in, pages_path_out, vocab):


    f = h5py.File(pages_path_in, 'r')

    if prm.att_doc and prm.att_segment_type == 'sentence':
        nltk.download('punkt')
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    os.remove(pages_path_out) if os.path.exists(pages_path_out) else None

    # Save to HDF5
    fout = h5py.File(pages_path_out,'a')

    if prm.att_doc:
        shape = (f['text'].shape[0],prm.max_segs_doc,prm.max_words)
    else:
        shape=(f['text'].shape[0],prm.max_words)

    idxs = fout.create_dataset('idx', shape=shape, dtype=np.int32)
    mask = fout.create_dataset('mask', shape=(f['text'].shape[0],), dtype=np.float32)

    i = 0
    for text in f['text']:
        st = time.time()

        if prm.att_doc:
            if prm.att_segment_type.lower() == 'section' or prm.att_segment_type.lower() == 'subsection':
                segs = ['']
                for line in text.split('\n'):
                    if prm.att_segment_type == 'section':
                        line = line.replace('===', '')
                    if line.strip().startswith('==') and line.strip().endswith('=='):
                        segs.append('')
                    segs[-1] += line.lower() + '\n'
            elif prm.att_segment_type.lower() == 'sentence':
                segs = tokenizer.tokenize(text.lower().decode('ascii', 'ignore'))
            elif prm.att_segment_type.lower() == 'word':
                segs = wordpunct_tokenize(text.decode('ascii', 'ignore'))
            else:
                raise ValueError('Not a valid value for the attention segment type (att_segment_type) parameter. Valid options are "section", "subsection", "sentence", or "word".')

            segs = segs[:prm.max_segs_doc]
            idxs_, _ = utils.text2idx2(segs, vocab, prm.max_words)
            idxs[i,:len(idxs_),:] = idxs_
            mask[i] = len(idxs_)
        else:
            idx, _ = utils.text2idx2([text.lower()], vocab, prm.max_words)
            idxs[i,:] = idx[0]
        i += 1

        #if i > 3000:
        #    break

        print 'processing article', i, 'time', time.time()-st

    f.close()
    fout.close()
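
The 'sentence' branch of compute_idx relies on NLTK's pre-trained Punkt model, loaded exactly as above. A minimal sketch of that segmentation step on its own:

import nltk

nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
text = 'Dr. Smith went to Washington. He arrived on Tuesday.'
print(tokenizer.tokenize(text.lower()))
# two sentences; the abbreviation 'dr.' does not end the first one
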
simple_search.py (project: WebNav, author: nyu-dl)
def get_candidates(qatp):

    print 'loading data...'
    idf = pkl.load(open(prm.idf_path, "rb"))
    wk = wiki.Wiki(prm.pages_path)

    print 'creating vocabulary...'
    vocab = {}
    for q,_,_,_ in qatp:
        words = wordpunct_tokenize(q.lower())
        for word in words:
            if word in idf:
                vocab[word] = {}


    print 'creating inverted index...'
    i = 0
    for text in wk.get_text_iter():
        if i%10000==0:
            print 'article', i
        words = wordpunct_tokenize(text.lower())
        for word in words:
            if word in vocab:
                vocab[word][i] = 0

        #if i > 500000:
        #    break
        i += 1

    print 'selecting pages...'
    candidates = []
    for i,[q,_,_,_] in enumerate(qatp):
        st = time.time()
        words = wordpunct_tokenize(q.lower())
        scores = {}

        for word in words:
            if word in vocab:
                if len(vocab[word]) < 100000:
                    for pageid in vocab[word].keys(): 
                        if pageid not in scores:
                            scores[pageid] = 0.
                        scores[pageid] += idf[word]
        idxs = np.argsort(np.asarray(scores.values()))[::-1]

        pages = scores.keys()

        if len(idxs)==0:
            print 'error question:', q

        c = OrderedDict()
        for idx in idxs[:prm.max_candidates]:
            c[pages[idx]] = 0

        candidates.append(c)
        print 'sample ' + str(i) + ' time ' + str(time.time()-st)

        #if i > 10000:
        #    break

    return candidates
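
get_candidates builds a word-to-article inverted index over wordpunct tokens and ranks pages by the summed idf of the query words they contain. A self-contained sketch of the same idea on a toy corpus (data and idf values are made up for illustration):

from collections import defaultdict
from nltk.tokenize import wordpunct_tokenize

docs = ['the eiffel tower is in paris',
        'paris is the capital of france',
        'the great wall is in china']
idf = {'eiffel': 3.0, 'paris': 1.5, 'france': 3.0, 'china': 3.0}

# inverted index: word -> set of document ids
index = defaultdict(set)
for doc_id, doc in enumerate(docs):
    for word in wordpunct_tokenize(doc.lower()):
        if word in idf:
            index[word].add(doc_id)

def rank(query, max_candidates=2):
    scores = defaultdict(float)
    for word in wordpunct_tokenize(query.lower()):
        for doc_id in index.get(word, ()):
            scores[doc_id] += idf[word]
    return sorted(scores, key=scores.get, reverse=True)[:max_candidates]

print(rank('where is the eiffel tower in paris ?'))   # [0, 1]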

