def get_user_to_word_proportion(user_to_text, word):
"""
Maps each user to the proportion of their words that are the specified word.
"""
user_to_word_proportion = {}
for user in user_to_text:
lm = LanuageModel(user_to_text[user])
n_tokens = len(lm.lowercase_tokens)
if n_tokens > 0:
fd = nltk.FreqDist(lm.lowercase_tokens)
user_to_word_proportion[user] = fd[word] / float(n_tokens)
else:
user_to_word_proportion[user] = 0.0
print 'Finished user {}'.format(user.encode('utf-8'))
return user_to_word_proportion
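For reference, a minimal self-contained sketch of the same per-user proportion idea, using a plain lowercase split instead of the project's LanuageModel class (the sample user_to_text dict is invented for illustration):

import nltk

def word_proportions_sketch(user_to_text, word):
    # Proportion of each user's tokens that equal `word` (simple lowercase split).
    proportions = {}
    for user, text in user_to_text.items():
        tokens = text.lower().split()
        fd = nltk.FreqDist(tokens)
        proportions[user] = fd[word] / len(tokens) if tokens else 0.0
    return proportions

print(word_proportions_sketch({'alice': 'the cat sat on the mat'}, 'the'))  # {'alice': 0.333...}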
Python FreqDist() example source code
def most_frequent_Brown_Corpus_words():
import nltk
import nltk.corpus
words = []
for word in nltk.corpus.brown.words():
if word not in [
",",
".",
"``",
"''",
";",
"?",
"--",
")",
"(",
":",
"!"
]:
words.append(word.lower())
frequencies_words = nltk.FreqDist(words).most_common()
words_most_frequent = [word[0] for word in frequencies_words]
return words_most_frequent
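The punctuation list above can be expressed more compactly; a rough (not exactly equivalent) sketch, assuming the Brown corpus has been downloaded via nltk.download('brown') and using str.isalpha() instead of an explicit punctuation list:

import nltk

def most_frequent_brown_words_sketch():
    # Count lowercase alphabetic tokens only, then return them ordered by frequency.
    # Note: isalpha() also drops hyphenated and numeric tokens, unlike the list above.
    fd = nltk.FreqDist(w.lower() for w in nltk.corpus.brown.words() if w.isalpha())
    return [word for word, _ in fd.most_common()]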
def _calculate_word_scores(self, phrase_list):
"""Scores words according to frequency and tendency to appear in multi-word key phrases"""
word_freq = nltk.FreqDist()
word_multiplier = nltk.FreqDist()
for phrase in phrase_list:
# Give a higher score if word appears in multi-word candidates
multi_word = min(2, len(filter(lambda x: not is_numeric(x), phrase)))
for word in phrase:
# Normalize by taking the stem
word_freq[stem(word)] += 1
word_multiplier[stem(word)] += multi_word
for word in word_freq.keys():
word_multiplier[word] = word_multiplier[word] / float(word_freq[word]) # Take average
word_scores = {}
for word in word_freq.keys():
word_scores[word] = word_freq[word] * word_multiplier[word]
return word_scores
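A toy illustration of the frequency-times-multiplier scoring above, with the stemming and numeric checks stripped out (the phrases and helper names here are made up for illustration):

import nltk

def word_scores_sketch(phrase_list):
    word_freq = nltk.FreqDist()
    word_multiplier = nltk.FreqDist()
    for phrase in phrase_list:
        multi_word = min(2, len(phrase))  # crude proxy: reward words seen in longer phrases
        for word in phrase:
            word_freq[word] += 1
            word_multiplier[word] += multi_word
    scores = {}
    for w in word_freq:
        avg_multiplier = word_multiplier[w] / word_freq[w]
        scores[w] = word_freq[w] * avg_multiplier
    return scores

print(word_scores_sketch([['deep', 'learning'], ['learning']]))  # {'deep': 2.0, 'learning': 3.0}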
Source: 1-train-CBOW.py, project: Deep-Learning-with-Theano, author: PacktPublishing
def build_dictionary(words, max_df=5):
word_freq = [[unkown_token, -1], [pad_token, 0]]
word_freq.extend(nltk.FreqDist(itertools.chain(words)).most_common())
word_freq = OrderedDict(word_freq)
word2idx = {unkown_token: 0, pad_token: 1}
idx2word = {0: unkown_token, 1: pad_token}
idx = 2
for w in word_freq:
f = word_freq[w]
if f >= max_df:
word2idx[w] = idx
idx2word[idx] = w
idx += 1
else:
word2idx[w] = 0 # map rare words to the unknown token
word_freq[unkown_token] += 1 # increment the number of unknown tokens
return word2idx, idx2word, word_freq
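A condensed, self-contained version of the same vocabulary-building pattern (the special tokens and the threshold of 2 are illustrative, not taken from the project):

import itertools
import nltk

UNK, PAD = '<unk>', '<pad>'

def build_vocab_sketch(tokenized_texts, min_count=2):
    # Count every token across all texts, then keep only sufficiently frequent ones.
    freq = nltk.FreqDist(itertools.chain(*tokenized_texts))
    word2idx = {UNK: 0, PAD: 1}
    for word, count in freq.most_common():
        if count >= min_count:
            word2idx[word] = len(word2idx)
    idx2word = {i: w for w, i in word2idx.items()}
    return word2idx, idx2word

w2i, i2w = build_vocab_sketch([['a', 'b', 'a'], ['b', 'c']])
print(w2i)  # 'a' and 'b' get indices; 'c' is too rare and maps to <unk>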
def get_corpus_of_most_active_users(n_users=5):
tweets = []
texts = []
with open(DATASET_PATH) as f:
for line in f:
tweets.append(json.loads(line)['user']['screen_name'])
texts.append((json.loads(line)['user']['screen_name'], json.loads(line)['text']))
users = nltk.FreqDist(tweets).most_common(n_users)
dict = {}
for user, tweet in texts:
if user in dict:
dict[user] = " ".join([dict[user],tweet])
else:
dict[user] = tweet
corpus = [dict[name] for name, _ in users]
user_names = [name for name, _ in users]
return corpus, user_names
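The core of the function above is just a FreqDist over screen names; a stripped-down sketch with an in-memory list standing in for the parsed tweet file:

import nltk

# Hypothetical (user, text) pairs standing in for the parsed dataset.
pairs = [('ann', 'hello'), ('bob', 'hi'), ('ann', 'more'), ('ann', 'text')]

top_users = [name for name, _ in nltk.FreqDist(u for u, _ in pairs).most_common(2)]
corpus = [' '.join(t for u, t in pairs if u == name) for name in top_users]
print(top_users, corpus)  # ['ann', 'bob'] ['hello more text', 'hi']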
def profile(self, text):
''' Create FreqDist of trigrams within text '''
from nltk import word_tokenize, FreqDist, trigrams
clean_text = self.remove_punctuation(text)
tokens = word_tokenize(clean_text)
fingerprint = FreqDist()
for t in tokens:
token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
for cur_trigram in token_trigrams:
if cur_trigram in fingerprint:
fingerprint[cur_trigram] += 1
else:
fingerprint[cur_trigram] = 1
return fingerprint
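Since FreqDist is a Counter subclass, the in/else bookkeeping above can be collapsed; a compact, self-contained sketch of the same character-trigram fingerprint (the padding characters here are chosen arbitrarily):

from nltk import FreqDist, trigrams

def trigram_fingerprint_sketch(text, start='<', end='>'):
    fingerprint = FreqDist()
    for token in text.split():  # plain split to avoid needing punkt tokenizer data
        for tri in trigrams(start + token + end):
            fingerprint[''.join(tri)] += 1  # FreqDist defaults missing keys to 0
    return fingerprint

print(trigram_fingerprint_sketch('hello hello').most_common(3))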
def load_data():
global N, words, labels
posts = corpus.xml_posts()[:10000]
freqs = [ FreqDist(post.text) for post in posts ]
words = list(set(word
for dist in freqs
for word in dist.keys()
if word not in ENGLISH_STOP_WORDS and
word not in punctuation))
labels = list(set([ post.get('class') for post in posts ]))
data = []
N = len(words)
for post, dist in zip(posts, freqs):
V = Vol(1, 1, N, 0.0)
for i, word in enumerate(words):
V.w[i] = dist.freq(word)
data.append((V, labels.index(post.get('class'))))
return data
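The V.w[i] = dist.freq(word) line relies on FreqDist.freq() returning a relative frequency rather than a raw count; a quick standalone check of that behaviour:

from nltk import FreqDist

fd = FreqDist(['spam', 'spam', 'eggs', 'ham'])
print(fd['spam'], fd.freq('spam'))  # 2 0.5  -> raw count vs. proportion of all samples
print(fd.freq('missing'))           # 0.0   -> unseen words contribute a zero feature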
def load_data():
global N, words
freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
words = list(set(word
for dist in freqs
for word in dist.keys()
if word not in ENGLISH_STOP_WORDS and
word not in punctuation))
data = []
N = len(words)
for dist in freqs:
x = volumize(dist)
data.append((x, x.w))
return data
def test():
gt = GetTweets()
documents = gt.get_hashtag('ferguson', count=20)
documents += gt.get_hashtag('police', count=21)
print 'Query:', documents[-1]
tokenizer = RegexpTokenizer('\w+')
vols = []
for doc in documents:
samples = []
for token in tokenizer.tokenize(doc):
word = token.lower()
if word not in ENGLISH_STOP_WORDS and word not in punctuation:
samples.append(word)
vols.append(volumize(FreqDist(samples)))
vectors = [ doc_code(v) for v in vols[:-1] ]
query_vec = doc_code(vols[-1])
sims = [ cos(v, query_vec) for v in vectors ]
m = max(sims)
print m, documents[sims.index(m)]
def load_data():
global N, words
freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
words = list(set(word
for dist in freqs
for word in dist.keys()
if word not in ENGLISH_STOP_WORDS and
word not in punctuation))
data = []
N = len(words)
for dist in freqs:
V = Vol(1, 1, N, 0.0)
for i, word in enumerate(words):
V.w[i] = dist.freq(word)
data.append((V, V.w))
return data
def tokenize_sentences(self):
# tokenize the sentences into words and count the word frequencies
# get most common words, build index_to_word and word_to_index vectors
self.tokenized_sentences = [nltk.word_tokenize(sent) for sent in
self.sentences]
word_freq = nltk.FreqDist(itertools.chain(*self.tokenized_sentences))
print("Found %d unique word tokens." % len(word_freq.items()))
vocab = word_freq.most_common(self.vocabulary_size - 1)
self.index_to_word = [x[0] for x in vocab]
self.index_to_word.append(self.unknown_token)
self.word_to_index = dict(
[(w, i) for i, w in enumerate(self.index_to_word)])
print("Using vocabulary size %d." % self.vocabulary_size)
print(
"The least frequent word is '%s' appearing %d times." % (
vocab[-1][0], vocab[-1][1]))
# replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(self.tokenized_sentences):
self.tokenized_sentences[i] = [
w if w in self.word_to_index else self.unknown_token for w in
sent]
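A self-contained sketch of the vocabulary truncation and unknown-token replacement performed above (the sentences, vocabulary size, and UNK marker are made up for illustration):

import itertools
import nltk

UNK = 'UNKNOWN_TOKEN'
tokenized = [['the', 'cat', 'sat'], ['the', 'dog', 'sat']]

freq = nltk.FreqDist(itertools.chain(*tokenized))
vocab = [w for w, _ in freq.most_common(3)]  # keep only the 3 most frequent words
vocab.append(UNK)
replaced = [[w if w in vocab else UNK for w in sent] for sent in tokenized]
print(replaced)  # one of 'cat'/'dog' falls outside the top 3 and becomes UNKNOWN_TOKEN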
def load_words(num_words):
words = get_words_from_nltk()
fdist = nltk.FreqDist(words)
fdistmc = fdist.most_common()
nd = OrderedDict()
nda = []
occurences = set([wt[1] for wt in fdistmc])
occurences = sorted(occurences, key=int, reverse=True)
for idx in occurences:
nd[idx] = sorted([wt[0] for wt in fdistmc if wt[1] == idx])
for key, val in nd.items():
nda += val
words = nda[:num_words]
return words
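What the OrderedDict loop above achieves is a sort by descending count with ties broken alphabetically; an equivalent one-liner, shown here on a toy word list:

import nltk

words = ['b', 'a', 'a', 'c', 'b', 'd']
fd = nltk.FreqDist(words)
ordered = [w for w, c in sorted(fd.items(), key=lambda wc: (-wc[1], wc[0]))]
print(ordered[:3])  # ['a', 'b', 'c'] -- highest counts first, ties alphabetical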
def statistics_by_aspect():
filename = "aspects_train.csv"
words_dist = nltk.ConditionalFreqDist()
sample_sizes = nltk.FreqDist()
samples_stream = get_samples_stream(filename)
for aspect,words in samples_stream:
sample_sizes[aspect] += 1
for word in words:
words_dist[aspect][word] += 1
for category,dist in words_dist.iteritems():
print "\n------- Category: {}".format(category)
print dist.most_common(20)
total_samples = sample_sizes.N()
print "\ntotally {} samples".format(total_samples)
for aspect, count in sample_sizes.iteritems():
print "aspect[{}] has {} samples, {:.2f}%".format(aspect,count, count*100.0/total_samples)
def save_topics(model,filename):
with open(filename,"wt") as outf:
# ---------- write each topic and words' contribution
topics = model.show_topics(num_topics=-1, log=False, formatted=True)
for topic in topics:
# topic[0]: topic number
# topic[1]: topic description
outf.write("\n############# TOPIC {} #############\n".format(topic[0]))
outf.write(topic[1]+"\n")
# ---------- words statistics in all topics
outf.write("\n\n\n****************** KEY WORDS ******************\n")
topics = model.show_topics(num_topics=-1, log=False, formatted=False)
keywords = (word for (_,words) in topics for (word,score) in words)
fdist = nltk.FreqDist(keywords)
for index,(w,c) in enumerate( fdist.most_common(100) ):
outf.write("{}-th keyword: <{},{}>\n".format(index+1,w,c))
def createPopularWords(combined, lowerBound, upperBound):
allWords = []
for message in combined:
for word in message[0]:
allWords.append(word)
allWords = nltk.FreqDist(allWords)
# grab the top several thousand words, ignoring the lowerBound most popular
# grabbing more words leads to more accurate predictions, at the cost of both memory and compute time
# ignoring the x most popular words is an easy method for handling stop words that are specific to this dataset, rather than just the English language overall
popularWords = []
wordsToUse = allWords.most_common(upperBound)[lowerBound:upperBound]
for pair in wordsToUse:
popularWords.append(pair[0])
return popularWords
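The lowerBound/upperBound logic above is just a window into most_common(); a toy demonstration:

import nltk

tokens = ['the', 'the', 'the', 'cat', 'cat', 'mat']
fd = nltk.FreqDist(tokens)
# Skip the 1 most frequent token (a crude stop-word filter) and keep the next 2.
popular = [w for w, _ in fd.most_common(3)[1:3]]
print(popular)  # ['cat', 'mat']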
# extract features from a single document in a consistent manner for all documents in a corpus
# simply returns whether a given word in popularWords is included in the document
def get_word_counts(input_str, limit = 100):
input_str = PreprocessManager.remove_non_ascii(input_str)
wordnet_lemmatizer = WordNetLemmatizer()
snowball_stemmer = EnglishStemmer()
tokenized_text = CountVectorizer().build_tokenizer()(input_str.lower())
tokenized_text = [word for word in tokenized_text if len(word) > 1] # Filter some small words
#tokenized_text = [word for word in tokenized_text if not word.isnumeric()]
filtered_words = [word for word in tokenized_text if word not in stopwords.words('english')]
stemmed_list = [wordnet_lemmatizer.lemmatize(w) for w in filtered_words]
# Calculate frequency distribution
frequency_dist = nltk.FreqDist(stemmed_list)
# Output the top `limit` most common words
result = dict()
for word, frequency in frequency_dist.most_common(limit):
# print(u'{};{}'.format(word, frequency))
result[word] = frequency
return result
# This function simply splits the text into its words.
def analysis(reviews_collection_text):
with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
raw_data = f.read()
with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
comments = f.readlines()
data = raw_data.replace('\n', ' ')
data_lower = data.lower()
tokens_with_punc = word_tokenize(data_lower)
tokens = RegexpTokenizer(r'\w+').tokenize(data_lower)
print("--- Most frequent tokens ---\n",
FreqDist(tokens_with_punc).most_common(15))
print("--- Tokens without punctuation ---\n",
FreqDist(tokens).most_common(15))
stop = set(stopwords.words('english'))
words = [word for word in tokens if word not in stop]
print("--- Most frequent words ---\n", FreqDist(words).most_common(15))
tagged = pos_tag(words)
nouns = [word for word, pos in tagged if (pos == 'NN')]
print("--- Most frequent nouns ---\n", FreqDist(nouns).most_common(15))
adjts = [word for word, pos in tagged if (pos == 'JJ')]
print("--- Most frequent adjective ---\n", FreqDist(adjts).most_common(15))
tokns = [RegexpTokenizer(r'\w+').tokenize(comment) for comment in comments]
lxdst = [lexical_density(token) for token in tokns if len(token) > 0]
avgld = sum(lxdst) / len(comments)
print("--- Average lexical density ---\n", avgld)
def plot_common_tokens(self, n_tokens):
# Remove common stopwords
fd = nltk.FreqDist(w for w in self.alpha_tokens if w not in s)
fd.plot(n_tokens)
def get_user_to_word_count(user_to_text, word):
user_to_word_count = {}
for user in user_to_text:
lm = LanuageModel(user_to_text[user])
fd = nltk.FreqDist(lm.lowercase_tokens)
user_to_word_count[user] = fd[word]
return user_to_word_count
def sentence2vec(self, sentence):
if len(self.features) == 0:
self.load_feature_model()
seg_list = jieba.cut(sentence, False)
freq_dist = nltk.FreqDist(seg_list)
local_list = []
for each in self.features:
local_list.append(freq_dist[each])
return local_list
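The feature-lookup loop above amounts to a bag-of-words count vector over a fixed feature list; a self-contained sketch without jieba (the feature list is invented):

import nltk

features = ['good', 'bad', 'price']

def sentence_to_vector_sketch(tokens):
    # FreqDist returns 0 for unseen features, so the vector is always len(features) long.
    fd = nltk.FreqDist(tokens)
    return [fd[f] for f in features]

print(sentence_to_vector_sketch(['good', 'good', 'value']))  # [2, 0, 0]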
def get_freq_dist(self, seg_list):
freq_dist = []
for each in seg_list:
freq_dist.append(nltk.FreqDist(each))
return freq_dist
def _remove_uncommon_words(cls, tokenized_corpus, vocabulary_size):
word_count = nltk.FreqDist( itertools.chain(*tokenized_corpus) )
word_count = [cls.WORD_COUNT_ITEM(word=word, count=count) for word, count in word_count.items()]
word_count = sorted(word_count, key=lambda item: (item.count, item.word), reverse=True)
most_common_words = [word_count_item.word for word_count_item in word_count[:vocabulary_size - \
cls.NUMBER_OF_WORDS_TO_ADD_IN_MANUALLY + 1]]
tokenized_corpus = [
[word if word in most_common_words else cls.UNKNOWN_TOKEN for word in sentence]\
for sentence in tokenized_corpus
]
return tokenized_corpus
def parse_text(filename, vocabulary_size=9000, type="word"):
with open(filename, 'rb') as f:
txt = f.read()
if type == "word":
sentences = nltk.sent_tokenize(txt.decode('utf-8').lower().replace('\n', ' '))
# sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print("Found %d unique words tokens." % len(word_freq.items()))
vocab = word_freq.most_common(vocabulary_size-1)
index = [sentence_start_token, sentence_end_token, unknown_token] + [x[0] for x in vocab]
word_to_index = dict([(w,i) for i,w in enumerate(index)])
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))
for i, sent in enumerate(tokenized_sentences):
tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
X_train = np.asarray([ [0]+[word_to_index[w] for w in sent] for sent in tokenized_sentences])
y_train = np.asarray([ [word_to_index[w] for w in sent]+[1] for sent in tokenized_sentences])
# X_train, y_train = [], []
# for sent in tokenized_sentences:
# l = len(sent) - 1
# X_train.append(coo_matrix((np.ones( (l) ), ( range(l), [word_to_index[w] for w in sent[:-1]] )), shape=(l, vocabulary_size )).toarray())
# y_train.append( [word_to_index[w] for w in sent[1:] ] )
else:
sentences = nltk.sent_tokenize(txt.decode('utf-8').lower().replace('\n', ' '))
index = ['^','$'] + list(set(txt.decode('utf-8')))
char_to_index = dict([(w,i) for i,w in enumerate(index)])
X_train = np.asarray([ [0]+[ char_to_index[w] for w in sent] for sent in sentences])
y_train = np.asarray([ [ char_to_index[w] for w in sent]+[1] for sent in sentences])
return X_train, y_train, index