def keyword_extractor(data):
    try:
        #np_extractor = NPExtractor(words_wo_stopwords(strip_tags(data)))
        #result = np_extractor.extract()
        text = words_wo_stopwords(strip_tags(data))
        # TODO: this is duplicated work and should be improved
        words = word_tokenize(strip_tags(text))
        tagged = pos_tag(words)
        cleaned = filter_insignificant(tagged)
        text = " ".join(cleaned)
        wc = WordCloudMod().generate(text)
        result = list(wc.keys())[:10]
    except Exception as err:
        print(colored.red("At keywords extraction {}".format(err)))
        result = []
    return result
# TODO: could definitely be better if we knew where the content is
Python word_tokenize() usage examples
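Before the project-specific snippets, here is a minimal, self-contained sketch of word_tokenize itself (it assumes nltk is installed and the punkt tokenizer data has been downloaded):

from nltk.tokenize import sent_tokenize, word_tokenize

text = "NLTK ships a sentence splitter and a word tokenizer. They work well together."
for sentence in sent_tokenize(text):
    print(word_tokenize(sentence))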
def create_batch(self, sentence_li):
    """Create a batch for a list of sentences."""
    embeddings_batch = []
    for sen in sentence_li:
        embeddings = []
        sent_toks = sent_tokenize(sen)
        word_toks = [word_tokenize(el) for el in sent_toks]
        tokens = [val for sublist in word_toks for val in sublist]
        tokens = [el for el in tokens if el != '']
        for tok in tokens:
            embeddings.append(self.embdict.tok2emb.get(tok))
        if len(tokens) < self.max_sequence_length:
            pads = [np.zeros(self.embedding_dim) for _ in range(self.max_sequence_length - len(tokens))]
            embeddings = pads + embeddings
        else:
            embeddings = embeddings[-self.max_sequence_length:]
        embeddings = np.asarray(embeddings)
        embeddings_batch.append(embeddings)
    embeddings_batch = np.asarray(embeddings_batch)
    return embeddings_batch
NewsArticleClass.py (project: Python-Scripts-Repo-on-Data-Science, author: qalhata)
def extractFeatures(self, article, n, customStopWords=None):
    # article is passed in as a (text, title) tuple
    text = article[0]    # extract the text
    title = article[1]   # extract the title
    sentences = sent_tokenize(text)                            # split the text into sentences
    word_sent = [word_tokenize(s.lower()) for s in sentences]  # split sentences into words
    # calculate word frequencies using the member function defined above
    self._freq = self._compute_frequencies(word_sent, customStopWords)
    if n < 0:
        # a negative n means no feature (word) selection: return all features
        return nlargest(len(self._freq.keys()),
                        self._freq, key=self._freq.get)
    else:
        # otherwise the caller asked for a subset, so return only the n largest
        # features, i.e. the most important words (important == frequent, minus stopwords)
        return nlargest(n, self._freq, key=self._freq.get)
NewsArticleClass.py (project: Python-Scripts-Repo-on-Data-Science, author: qalhata)
def summarize(self, article, n):
    text = article[0]
    title = article[1]
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sentence in enumerate(word_sent):
        for word in sentence:
            if word in self._freq:
                ranking[i] += self._freq[word]
    sentences_index = nlargest(n, ranking, key=ranking.get)
    return [sentences[j] for j in sentences_index]
##############################################################################
# TEST
def similarity(c1, c2):
    '''Stop words are words like "it" and "the" that carry little meaning on
    their own, so they are removed before comparing the two texts.'''
    stop_words = list(stopwords.words("english"))
    # Remove stop words from both texts
    c1_cleaned = [x for x in word_tokenize(c1) if x not in stop_words]
    c2_cleaned = [x for x in word_tokenize(c2) if x not in stop_words]
    c1_words = Counter(dedupe(c1_cleaned))
    c2_words = Counter(dedupe(c2_cleaned))
    total_words = c1_words + c2_words
    similarity_between_words = 0
    for key, val in total_words.items():
        # a count greater than 1 means the two articles share this word
        if total_words[key] > 1:
            similarity_between_words += 1
    return similarity_between_words / (log(len(c1_words)) + log(len(c2_words)))
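A hypothetical usage sketch; it assumes the project's dedupe helper is importable and that the NLTK stopwords and punkt data have been downloaded:

a = "The central bank raised interest rates again on Tuesday."
b = "Interest rates were raised by the central bank this week."
print(similarity(a, b))   # larger values mean more shared (non-stopword) vocabulary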
dont_run_me_run_the_other_script_instead.py (project: punctuator2, author: ottokart)
def process_line(line):
    tokens = word_tokenize(line)
    output_tokens = []
    for token in tokens:
        if token in INS_PUNCTS:
            output_tokens.append(INS_PUNCTS[token])
        elif token in EOS_PUNCTS:
            output_tokens.append(EOS_PUNCTS[token])
        elif is_number(token):
            output_tokens.append(NUM)
        else:
            output_tokens.append(token.lower())
    return untokenize(" ".join(output_tokens) + " ")
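INS_PUNCTS, EOS_PUNCTS, NUM, is_number and untokenize are defined elsewhere in punctuator2 and are not part of this listing. A hypothetical minimal stand-in for the constants, purely for illustration (the real values in the project may differ):

EOS_PUNCTS = {".": ".PERIOD", "?": "?QUESTIONMARK", "!": "!EXCLAMATIONMARK"}   # sentence-ending punctuation
INS_PUNCTS = {",": ",COMMA", ";": ";SEMICOLON", ":": ":COLON"}                 # in-sentence punctuation
NUM = "<NUM>"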
def check_sent(s):
    count = 0
    for r in s:
        #words = word_tokenize(r)
        #for w in words:
        for w in r:
            if type(w) != str:
                print(w)
                count += 1
                continue
            if w in inv_words or w in oov_words_in_train:
                continue
            if w not in word2vec:
                count += 1
                oov_words_in_train.add(w)
            else:
                inv_words[w] = word2vec.vocab[w].index
    return count
def preprocess_questions(examples, nlp='nltk'):
    if nlp == 'nltk':
        from nltk.tokenize import word_tokenize
    print('Example of generated tokens after preprocessing some questions:')
    for i, ex in enumerate(examples):
        s = ex['question']
        if nlp == 'nltk':
            ex['question_words'] = word_tokenize(str(s).lower())
        elif nlp == 'mcb':
            ex['question_words'] = tokenize_mcb(s)
        else:
            ex['question_words'] = tokenize(s)
        if i < 10:
            print(ex['question_words'])
        if i % 1000 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done) \r" % (i, len(examples), i*100.0/len(examples)))
            sys.stdout.flush()
    return examples
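A hypothetical call, assuming each example is a dict with a 'question' key and that sys is imported at module level:

examples = [{'question': 'What color is the cat?'}, {'question': 'How many dogs are there?'}]
examples = preprocess_questions(examples, nlp='nltk')
print(examples[0]['question_words'])   # ['what', 'color', 'is', 'the', 'cat', '?']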
def summarize(self, text, n):
    """
    Return a list of n sentences
    which represent the summary of text.
    """
    sents = sent_tokenize(text)
    assert n <= len(sents)
    word_sent = [word_tokenize(s.lower()) for s in sents]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sent in enumerate(word_sent):
        for w in sent:
            if w in self._freq:
                ranking[i] += self._freq[w]
    sents_idx = self._rank(ranking, n)
    return [sents[j] for j in sents_idx]
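Both summarizers above call a _compute_frequencies helper that is not part of this listing. A minimal sketch of what that method typically looks like in this classic frequency-summarizer pattern, assuming the NLTK stopwords corpus is available (the project's actual helper may normalize or filter differently):

from collections import defaultdict
from string import punctuation
from nltk.corpus import stopwords

def _compute_frequencies(self, word_sent, customStopWords=None):
    """Normalized frequency of each non-stopword token (illustrative sketch)."""
    stop = set(stopwords.words('english')) | set(punctuation)
    if customStopWords:
        stop |= set(customStopWords)
    freq = defaultdict(int)
    for sentence in word_sent:
        for word in sentence:
            if word not in stop:
                freq[word] += 1
    max_freq = float(max(freq.values()))
    return {w: c / max_freq for w, c in freq.items()}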
def load_jacana(fname, regexen):
    samples = []
    with open(fname, 'rt') as inp:
        for line in inp:
            line = line.strip()
            if line.startswith('<Q> '):
                qorig = line[len('<Q> '):]
                q = word_tokenize(qorig)
            else:
                l = line.split(' ')
                label = int(l[0])
                kwweight = float(l[1])
                aboutkwweight = float(l[2])
                text = word_tokenize(' '.join(l[3:]))
                toklabels = regex_overlap(text, regexen[qorig])
                samples.append({'qtext': ' '.join(q), 'label': label,
                                'atext': ' '.join(text),
                                'kwweight': kwweight, 'aboutkwweight': aboutkwweight,
                                'toklabels': ' '.join([str(0+tl) for tl in toklabels])})
    return samples
def load_sts(dsfile, skip_unlabeled=True):
    """Load a dataset in the STS tsv format."""
    s0 = []
    s1 = []
    labels = []
    with codecs.open(dsfile, encoding='utf8') as f:
        for line in f:
            line = line.rstrip()
            label, s0x, s1x = line.split('\t')
            if label == '':
                if skip_unlabeled:
                    continue
                else:
                    labels.append(-1.)
            else:
                labels.append(float(label))
            s0.append(word_tokenize(s0x))
            s1.append(word_tokenize(s1x))
    return (s0, s1, np.array(labels))
def load_quora(dsfile):
    """Load a dataset in the Quora csv format."""
    s0 = []
    s1 = []
    labels = []
    with open(dsfile, encoding='utf8') as csvfile:
        f = csv.reader(csvfile)
        firstline = True
        for line in f:
            if firstline:
                firstline = False
                continue
            s0x = line[3]
            s1x = line[4]
            label = line[5]
            labels.append(float(label))
            s0.append(word_tokenize(s0x))
            s1.append(word_tokenize(s1x))
    return (s0, s1, np.array(labels))
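A hypothetical usage sketch for the two loaders above; the file paths are placeholders:

s0, s1, y = load_sts('data/sts/sts-train.tsv')     # tab-separated: label, sentence A, sentence B
q0, q1, qy = load_quora('data/quora/train.csv')    # csv with the sentence pair in columns 3-4 and the label in column 5
print(len(s0), y[:5])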
def make_word_feature(df, embeddings):
    # Vectorize the merchant description with word embeddings, currently by
    # averaging the vectors of the words in each merchant name. There are other
    # ways to combine word vectors into a sentence vector:
    # http://stackoverflow.com/questions/29760935/how-to-get-vector-for-a-sentence-from-the-word2vec-of-tokens-in-sentence
    merchants = df.merchant.tolist()
    veclen = len(embeddings['food'])
    word_feature = np.zeros((len(merchants), veclen))
    for idx, merchant in enumerate(merchants):
        num_known = 0
        try:
            words = tokenize.word_tokenize(merchant)
            words = [word.lower() for word in words]
            for word in words:
                wordvec = embeddings[word]
                word_feature[idx, :] += wordvec
                num_known += 1
        except:
            pass
        word_feature[idx, :] = word_feature[idx, :] / float(max(num_known, 1))
    return word_feature
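A hypothetical usage sketch. `embeddings` can be any dict-like mapping from word to vector (a plain dict of numpy arrays or a gensim KeyedVectors object); note it must contain a 'food' key, since the function uses that entry to infer the vector length, and the snippet assumes `from nltk import tokenize` at module level as in the source file:

import numpy as np
import pandas as pd

df = pd.DataFrame({'merchant': ['Coffee Shop', 'Gas Food']})
embeddings = {'food': np.zeros(3), 'coffee': np.ones(3), 'shop': np.full(3, 2.0), 'gas': np.ones(3)}
print(make_word_feature(df, embeddings))   # one averaged embedding row per merchant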
def predict(testSet, PP, PN, positive_probabilities, negative_probabilities, unseen_pos_prob, unseen_neg_prob):
    predicted_class = []
    for review in testSet:
        negative_probab = math.log10(PN)
        positive_probab = math.log10(PP)
        review_words = word_tokenize(review)
        for w in review_words:
            if w in negative_probabilities:
                negative_probab = negative_probab + math.log10(negative_probabilities[w])
            else:
                negative_probab = negative_probab + math.log10(unseen_neg_prob)
            if w in positive_probabilities:
                positive_probab = positive_probab + math.log10(positive_probabilities[w])
            else:
                positive_probab = positive_probab + math.log10(unseen_pos_prob)
        if negative_probab > positive_probab:
            result = '-'
        else:
            result = '+'
        predicted_class.append(result)
    return predicted_class
def create_vocab(self, dataset_path, vocab_path, max_vocab_size):
    print("generating vocab from dataset at {}".format(dataset_path))
    all_words = []
    for dataset in ["snli_1.0_train.jsonl", "snli_1.0_dev.jsonl", "snli_1.0_test.jsonl"]:
        for line in open(os.path.join(dataset_path, dataset), "r").readlines():
            data = json.loads(line)
            all_words += word_tokenize(data["sentence1"].lower())
            all_words += word_tokenize(data["sentence2"].lower())
    counter = Counter(all_words)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    words = ["PAD"] + ["UNK"] + list(words)
    word_to_id = dict(zip(words[:max_vocab_size], range(max_vocab_size)))
    with open(vocab_path, "w") as file:
        for word, id in word_to_id.items():
            file.write("{}\t{}\n".format(word, id))
    print("vocab of size {} written to {}, with PAD token == 0, UNK token == 1".format(max_vocab_size, vocab_path))
def getFreqWords(directoryPath):
    files = getListOfFilesInDir(directoryPath, "*")    # get list of files in directory
    allWords = []
    count = 0
    if MAX_FILES_PER_CLASS > 0 and MAX_FILES_PER_CLASS < len(files):
        files = random.sample(files, MAX_FILES_PER_CLASS)
    for ifile, fi in enumerate(files):                 # for each file in the current class:
        with open(fi) as f:
            content = f.read()
            words = word_tokenize(content.decode('utf-8'))
            words = [w.lower() for w in words if w.lower() not in stop]
            words = list(set(words))
            allWords += words
            count += 1
    #print allWords
    C = Counter(allWords)
    C = sorted(C.items(), key=itemgetter(1), reverse=True)
    for c in C:
        if c[1] > 0.05 * float(count):
            print c[0], c[1] / float(count)
def prepro_question(imgs, params):
    # preprocess all the questions
    print 'example processed tokens:'
    for i, img in enumerate(imgs):
        s = img['question']
        if params['token_method'] == 'nltk':
            txt = word_tokenize(str(s).lower())
        else:
            txt = tokenize(s)
        img['processed_tokens'] = txt
        if i < 10: print txt
        if i % 1000 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done) \r" % (i, len(imgs), i*100.0/len(imgs)))
            sys.stdout.flush()
    return imgs
sentiment_featureset.py (project: tensorflow-neural-networks, author: vipul-sharma20)
def create_lexicon(pos, neg):
    lexicon = []
    for fi in [pos, neg]:
        with open(fi, 'r') as f:
            contents = f.readlines()
            for l in contents[:hm_lines]:
                all_words = word_tokenize(l)
                lexicon += list(all_words)
    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)
    """
    This is done in the tutorial.
    Seems like a brute-force method of removing stopwords.
    TODO: use the NLTK stopwords list to remove stop words?
    """
    l2 = []
    for w in w_counts:
        if 1000 > w_counts[w] > 50:
            l2.append(w)
    return l2
sentiment_featureset.py (project: tensorflow-neural-networks, author: vipul-sharma20)
def sample_handling(sample, lexicon, classification):
    featureset = []
    with open(sample, 'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])
    return featureset
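A hypothetical bit of glue code in the style of the original tutorial; the file names are placeholders, and it assumes `hm_lines` and `lemmatizer = WordNetLemmatizer()` are defined at module level as in the source file:

lexicon = create_lexicon('pos.txt', 'neg.txt')
features = sample_handling('pos.txt', lexicon, [1, 0]) \
         + sample_handling('neg.txt', lexicon, [0, 1])
print(len(lexicon), len(features))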
def _avgrank_corp(inp_dir, hdv_vocab, num=5000):
    cnt, vocab = Counter(), []
    # Count all words in the corpus
    for (root, dirs, files) in os.walk(inp_dir):
        files = [f for f in files if not f[0] == '.']
        for f in files:
            filepath = os.path.join(root, f)
            with codecs.open(filepath, 'r', encoding="utf-8") as f:
                tok_txt = word_tokenize(f.read())
                for word in tok_txt:
                    cnt[word] += 1
    for word in hdv_vocab:
        if word in cnt.keys():
            del cnt[word]
    for word in cnt.most_common(num):
        try:
            vocab.append(str(word[0]))
        except:
            continue
    return vocab
create_sentiment_featuresets.py (project: kaggle-youtube-8m, author: liufuyang)
def create_lexicon(pos, neg):
    lexicon = []
    for fi in [pos, neg]:
        with io.open(fi, 'r', encoding='utf-8') as f:
            contents = f.readlines()
            for l in contents[:hm_lines]:
                all_words = word_tokenize(l.lower())
                lexicon += list(all_words)
    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)
    l2 = []
    for w in w_counts:
        if 1000 > w_counts[w] > 50:
            l2.append(w)
    return l2
create_sentiment_featuresets.py (project: kaggle-youtube-8m, author: liufuyang)
def sample_handling(sample, lexicon, classification):
    featureset = []
    with io.open(sample, 'r', encoding='utf-8') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])
    return featureset
def custom_tokenizer(sentence, delimiters=['|', ','], remove_puncs=True, get_unique=False):
    # tokens = re.split('(\W)', sentence)
    for delimiter in delimiters:
        sentence = re.sub(re.escape(delimiter), " " + delimiter + " ", sentence)
    tokens = word_tokenize(sentence)
    # Remove duplicates
    if get_unique:
        tokens = list(set(tokens))
    if remove_puncs:
        tokens = [token for token in tokens if
                  not ((len(token.strip()) == 1) and bool(re.search(r"[^a-zA-Z0-9]", token)))]
    tokens = [token for token in tokens if (not bool(re.search(r"\s", token)) and token != '')]
    # Remove duplicates
    if get_unique:
        tokens = list(set(tokens))
    return tokens
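For example (assuming the NLTK punkt data is available):

print(custom_tokenizer("red|green, blue"))
# ['red', 'green', 'blue'] -- delimiters are padded with spaces before tokenizing,
# then single punctuation tokens and whitespace-only tokens are filtered out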
def offset_tokenize(text):
    tail = text
    accum = 0
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    info_tokens = []
    for tok in tokens:
        escaped_tok = re.escape(tok)
        m = re.search(escaped_tok, tail)
        start, end = m.span()
        # global offsets
        gs = accum + start
        ge = accum + end
        accum += end
        # keep searching in the rest of the string
        tail = tail[end:]
        info_tokens.append((tok, (gs, ge)))
    return info_tokens
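A small usage sketch; note this only works as long as the tokenizer does not rewrite tokens (NLTK turns straight double quotes into `` and '', for instance, and those rewritten tokens would not be found in the original string):

text = "Dr. Smith visited Paris. He liked it."
for tok, (start, end) in offset_tokenize(text):
    assert text[start:end] == tok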
def prepro_question(imgs, params):
    # preprocess all the questions
    print 'example processed tokens:'
    for i, img in enumerate(imgs):
        s = img['question']
        if params['token_method'] == 'nltk':
            txt = word_tokenize(str(s).lower())
        else:
            txt = tokenize(s)
        img['processed_tokens'] = txt
        if i < 10: print txt
        if i % 100 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done) \r" % (i, len(imgs), i*100.0/len(imgs)))
            sys.stdout.flush()
    return imgs
def extract_chunks(sent, chunkGram=r"""Chunk: {<JJ|NN.*>*<NNP>+<JJ|NN.*|IN>*<NN.*>}"""):
    try:
        tagged = pos_tag(word_tokenize(sent))
        # Maybe actually better if possessives aren't included.
        # At least one proper noun (NNP) should be included in the noun chunk;
        # a single NNP is probably not enough information to identify a data source.
        chunkParser = RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        chunks = []
        for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
            chunk = ""
            for leaf in subtree.leaves():
                chunk += leaf[0] + ' '
            chunks.append(chunk.strip())
        return chunked, chunks
    except Exception as e:
        print(str(e))
def train_model(documents, labels, sample_size=.3, verbose=True):
    if verbose:
        print('starting to generate training data...', end='', flush=True)
    labeled_feature_set = list()
    for n, doc in enumerate(documents):
        feature = word_tokenize(' '.join(doc))
        label = labels[n]
        resampled = resample(feature, label, sample_size)
        labeled_feature_set += resampled
    if verbose:
        print('done', flush=True)
        print('training model...this may take a few minutes.',
              flush=True, end='')
    trained_model = NaiveBayesClassifier.train(iter(labeled_feature_set))
    if verbose:
        print('done', flush=True)
    return trained_model