Example source code for the Python function word_tokenize()
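The snippets below are collected from open-source projects that call NLTK's word_tokenize(). For orientation, here is a minimal standalone sketch of the function itself; it assumes NLTK is installed and downloads the punkt tokenizer models that word_tokenize depends on (the example sentence is invented):

import nltk
from nltk.tokenize import word_tokenize

# The Punkt tokenizer models ship separately from NLTK and must be downloaded once.
nltk.download('punkt')

tokens = word_tokenize("At eight o'clock on Thursday morning, Arthur didn't feel very good.")
print(tokens)
# Produces word and punctuation tokens; contractions are split, e.g. "didn't" becomes 'did' and "n't".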

Source file: 1.6 Mil.py (project: NLTK_SentimentAnalysis_TensorFlow, author: rachit-mishra)
def create_lexicon(fin):
    lexicon = []
    with open(fin, 'r', buffering=100000, encoding='latin-1') as f:
        try:
            counter = 1
            content = ''
            for line in f:
                counter += 1
                # Sample every 2500th tweet to keep the lexicon build tractable.
                if (counter / 2500.0).is_integer():
                    tweet = line.split(':::')[1]
                    content += ' ' + tweet
                    words = word_tokenize(content)
                    words = [lemmatizer.lemmatize(i) for i in words]
                    lexicon = list(set(lexicon + words))
                    print(counter, len(lexicon))
        except Exception as e:
            print(str(e))
    with open('lexicon.pickle', 'wb') as f:
        pickle.dump(lexicon, f)
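The create_lexicon excerpt above (and convert_to_vec and sample_handling below) references module-level names that are not part of the extract: word_tokenize, lemmatizer, pickle and np. A plausible setup block, given here as an assumption rather than the project's actual header, would be:

# Assumed module-level setup for the NLTK_SentimentAnalysis_TensorFlow excerpts (not shown in the extracts).
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()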
Source file: 1.6 Mil.py (project: NLTK_SentimentAnalysis_TensorFlow, author: rachit-mishra)
def convert_to_vec(fin, fout, lexicon_pickle):
    with open(lexicon_pickle, 'rb') as f:
        lexicon = pickle.load(f)
    outfile = open(fout, 'a')
    with open(fin, buffering=20000, encoding='latin-1') as f:
        counter = 0
        for line in f:
            counter += 1
            label = line.split(':::')[0]
            tweet = line.split(':::')[1]
            current_words = word_tokenize(tweet.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]

            # Bag-of-words feature vector indexed by position in the lexicon.
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
            features = list(features)
            outline = str(features) + '::' + str(label) + '\n'
            outfile.write(outline)
        print(counter)
    outfile.close()
Source file: TF_own_data_model.py (project: NLTK_SentimentAnalysis_TensorFlow, author: rachit-mishra)
def sample_handling(sample, lexicon, classification):
    featureset = []  # [1 0] pos sentiment [0 1] negative sentiment
    with open(sample, 'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            #print(features)
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    # like the example discussed earlier
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])
            #print(featureset)

    return featureset
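A hypothetical call, assuming a lexicon list and the module-level hm_lines limit already exist (the file name is invented):

# One-hot-style label: [1, 0] marks the sample file as positive sentiment.
positive_examples = sample_handling('pos_tweets.txt', lexicon, [1, 0])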
Source file: corpus_cleaner.py (project: acl2017-interactive_summarizer, author: UKPLab)
def runprops_data(self, docs):
        new_docs = []
        for doc_name, doc in docs:
            print('Processing:', doc_name)
            doc_new = []
            doc = self.props_exception(doc_name, doc)

            for index, sent in enumerate(doc):
                doc_new.append(' '.join(word_tokenize(sent)))
                print(index + 1, doc_new[index])

            triples = []
            for i, sent in enumerate(doc_new):
                try:
                    tmp_triples = self.props_parser.extract_triples([sent])
                    triples.append(tmp_triples)
                except Exception:
                    print('Error: failed for line %s' % sent)
                    continue
            parse_sents = create_trees(triples, doc_new)
            new_docs.append((doc_name, parse_sents))
        return new_docs
Source file: question_answers1.py (project: NLP_question_answering_system_project, author: Roshrini)
def wordMatch(question, line, storyPOS_dict):
    wordsInAQuestion = word_tokenize(question)
    rootsInAQuestion = set()
    for word in wordsInAQuestion:
        root = lancaster_stemmer.stem(word)
        rootsInAQuestion.add(root)

    if line in storyPOS_dict:
        verbmatch_score = 0
        rootmatch_score = 0
        scoreOfALine = {}
        for (word, tag) in storyPOS_dict[line]:
            if 'V' in tag:
                # Verbs that match a question root are weighted 6; other matching words get 3 below.
                verb_root = lancaster_stemmer.stem(word)
                if verb_root in rootsInAQuestion:
                    verbmatch_score = verbmatch_score + 6
            else:
                word_root = lancaster_stemmer.stem(word)
                if word_root in rootsInAQuestion:
                    rootmatch_score = rootmatch_score + 3
        scoreOfALine[line] = rootmatch_score + verbmatch_score
        return rootmatch_score + verbmatch_score
Source file: useful_functions.py (project: scientific-paper-summarisation, author: EdCo95)
def preprocess_sentence(sentence):
    """
    Preprocesses a sentence, turning it all to lowercase and tokenizing it into words.
    :param sentence: the sentence to pre-process.
    :return: the sentence, as a list of words, all in lowercase
    """
    sentence = sentence.lower()
    return word_tokenize(sentence)
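A quick usage sketch with an invented sentence; the function lowercases first and then tokenizes, so punctuation comes back as separate tokens:

print(preprocess_sentence("The Quick Brown Fox jumps!"))
# ['the', 'quick', 'brown', 'fox', 'jumps', '!']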
Source file: useful_functions.py (project: scientific-paper-summarisation, author: EdCo95)
def create_paper_dictionaries(filename="", readin=True, paper=None):
    """
    Creates the metadata data structures for a specific paper required to compute the extra features which are
    appended to the sentence vector.
    :param filename: the filename only, not the path, of the paper to create dictionaries for.
    :param readin: if True, read the paper in from disk using filename; otherwise use the supplied paper dict.
    :param paper: an already-parsed paper dictionary, used when readin is False.
    :return: a tuple of the metadata data structures for the paper.
    """

    if readin and filename != "":
        # Read the paper in as a dictionary, keys are sections and values are the section text
        paper = read_in_paper(filename)

    # Extract paper keyphrases
    keyphrases = set(filter(None, " ".join(paper["KEYPHRASES"].lower().split("\n")).split(" ")))

    # Get the paper's vocab
    full_paper = " ".join([val for _, val in paper.items()]).lower()  # .items() keeps this Python 3 compatible
    paper_words = word_tokenize(full_paper)
    vocab = set(paper_words)

    # Create a bag of words for the paper
    paper_bag_of_words = defaultdict(int)
    for word in paper_words:
        paper_bag_of_words[word] += 1

    # Get the title words
    title_words = set([x.lower() for x in word_tokenize(paper["MAIN-TITLE"]) if x not in STOPWORDS])

    return keyphrases, vocab, paper_bag_of_words, title_words
Source file: skipthoughts.py (project: TAC-GAN, author: dashayushman)
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
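A usage sketch (input invented; requires the punkt data as in the example at the top of the page). preprocess sentence-splits each input string, re-tokenizes it, and returns one space-joined string per input:

print(preprocess(["Two short sentences. Joined back as one tokenized string."]))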
Source file: skipthoughts.py (project: how_to_convert_text_to_images, author: llSourcell)
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
Source file: weather_adapter.py (project: chatterbot-weather, author: gunthercox)
def get_latitude(self, user_input):
        """
        Returns the latitude extracted from the input.
        """
        from nltk import tokenize

        for token in tokenize.word_tokenize(user_input):
            if 'latitude=' in token:
                return re.sub('latitude=', '', token)

        return ''
Source file: weather_adapter.py (project: chatterbot-weather, author: gunthercox)
def get_longitude(self, user_input):
        """
        Returns the longitude extracted from the input.
        """
        from nltk import tokenize

        for token in tokenize.word_tokenize(user_input):
            if 'longitude=' in token:
                return re.sub('longitude=', '', token)

        return ''
Source file: recipe_cleanup.py (project: Flavor-Network, author: lingcheng99)
def split_ingr(x):
    wnl = WordNetLemmatizer()
    lst = x.strip('[]').split(',')
    # Strip non-letters, tokenize, lemmatize, and rejoin each ingredient phrase.
    cleanlist = [' '.join(wnl.lemmatize(word.lower())
                          for word in word_tokenize(re.sub('[^a-zA-Z]', ' ', item)))
                 for item in lst]
    return cleanlist

#remove low-information words from ingredients, could use more
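A usage sketch for split_ingr (the ingredient string is invented); it expects a stringified Python list, strips non-letter characters, lemmatizes each token, and returns one cleaned phrase per comma-separated item:

raw = "['fresh basil leaves', '2 cups chopped onions']"
print(split_ingr(raw))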
Source file: embeddings_dict.py (project: deeppavlov, author: deepmipt)
def add_items(self, sentence_li):
        """Add new items to the tok2emb dictionary from a given text."""

        for sen in sentence_li:
            sent_toks = sent_tokenize(sen)
            word_toks = [word_tokenize(el) for el in sent_toks]
            tokens = [val for sublist in word_toks for val in sublist]
            tokens = [el for el in tokens if el != '']
            for tok in tokens:
                if self.tok2emb.get(tok) is None:
                    self.tok2emb[tok] = self.fasttext_model[tok]
Source file: views.py (project: Django-Basic-Sentiment, author: enriksabalvaro)
def sentiment(request):

    # Load the pickled word features used by the classifier.
    open_file = open("wordfeature5k.pickle", "rb")
    word_features = pickle.load(open_file)
    open_file.close()

    def find_features(document):
        words = word_tokenize(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)
        return features

    # Load the pickled, pre-trained Naive Bayes classifier.
    open_file = open("naivebayesclassifier.pickle", "rb")
    classifier = pickle.load(open_file)
    open_file.close()

    sentence = request.POST['sentence']

    result = classifier.classify(find_features(sentence))

    if result == "positive":
        return render(request, "home/index.html",{"sentence":sentence, "positive":"positive"})
    elif result == "negative":
        return render(request, "home/index.html",{"sentence":sentence, "negative":"negative"})
Source file: preprocess.py (project: MatchZoo, author: faneshion)
def word_seg_en(docs):
        docs = [word_tokenize(sent) for sent in tqdm(docs)]
        # show the progress of word segmentation with tqdm
        '''docs_seg = []
        print('docs size', len(docs))
        for i in tqdm(range(len(docs))):
            docs_seg.append(word_tokenize(docs[i]))'''
        return docs
Source file: preprocess.py (project: MatchZoo, author: faneshion)
def word_seg_en(docs):
        docs = [word_tokenize(sent) for sent in tqdm(docs)]
        # show the progress of word segmentation with tqdm
        '''docs_seg = []
        print('docs size', len(docs))
        for i in tqdm(range(len(docs))):
            docs_seg.append(word_tokenize(docs[i]))'''
        return docs
Source file: models.py (project: SentEval, author: facebookresearch)
def get_word_dict(self, sentences, tokenize=True):
        # create vocab of words
        word_dict = {}
        if tokenize:
            from nltk.tokenize import word_tokenize
        sentences = [s.split() if not tokenize else word_tokenize(s)
                     for s in sentences]
        for sent in sentences:
            for word in sent:
                if word not in word_dict:
                    word_dict[word] = ''
        word_dict['<s>'] = ''
        word_dict['</s>'] = ''
        return word_dict
Source file: models.py (project: SentEval, author: facebookresearch)
def visualize(self, sent, tokenize=True):
        if tokenize:
            from nltk.tokenize import word_tokenize

        sent = sent.split() if not tokenize else word_tokenize(sent)
        sent = [['<s>'] + [word for word in sent if word in self.word_vec] +
                ['</s>']]

        if ' '.join(sent[0]) == '<s> </s>':
            import warnings
            warnings.warn('No words in "{0}" have glove vectors. \
                Replacing by "<s> </s>"..'.format(sent))
        batch = Variable(self.get_batch(sent), volatile=True)

        if self.use_cuda:
            batch = batch.cuda()
        output = self.enc_lstm(batch)[0]
        output, idxs = torch.max(output, 0)
        # output, idxs = output.squeeze(), idxs.squeeze()
        idxs = idxs.data.cpu().numpy()
        argmaxs = [np.sum((idxs == k)) for k in range(len(sent[0]))]

        # visualize model
        import matplotlib.pyplot as plt
        x = range(len(sent[0]))
        y = [100.0*n/np.sum(argmaxs) for n in argmaxs]

        plt.xticks(x, sent[0], rotation=45)
        plt.bar(x, y)
        plt.ylabel('%')
        plt.title('Visualisation of words importance')
        plt.show()

        return output, idxs
Source file: NewsArticleClass.py (project: Python-Scripts-Repo-on-Data-Science, author: qalhata)
def extractRawFrequencies(self, article):
        # Similar to the method above, but returns the raw frequencies (all word counts).
        text = article[0]
        text = article[1]  # overwrites the previous assignment; only article[1] is used below
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        return freq
Source file: word_splitter.py (project: allennlp, author: allenai)
def split_words(self, sentence: str) -> List[Token]:
        # Import is here because it's slow, and by default unnecessary.
        from nltk.tokenize import word_tokenize
        return [Token(t) for t in word_tokenize(sentence.lower())]

