def get_binary(self):
return Pipeline([
('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
('feat_select', SelectPercentile(percentile=10)),
('clf', OneVsRestClassifier(SGDClassifier(alpha=0.0001,
average=False,
class_weight=None,
epsilon=0.1,
eta0=0.0,
fit_intercept=True,
l1_ratio=0.15,
learning_rate='optimal',
loss='log',
n_iter=10,
n_jobs=1,
penalty='l2',
power_t=0.5,
random_state=None,
shuffle=True,
verbose=0,
warm_start=False
)))
])
def get_sgdc(self):
return Pipeline([
('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
('feat_select', SelectPercentile(percentile=10)),
('clf', SGDClassifier(alpha=0.0001,
average=False,
class_weight=None,
epsilon=0.1,
eta0=0.0,
fit_intercept=True,
l1_ratio=0.15,
learning_rate='optimal',
loss='log',
n_iter=10,
n_jobs=1,
penalty='l2',
power_t=0.5,
random_state=None,
shuffle=True,
verbose=0,
warm_start=False))
])
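# A minimal usage sketch of the pipeline above; `builder` (an instance of the
# surrounding class), `train_texts`, and `train_labels` are assumed names that do
# not appear in the original source. Note that `n_iter` and `loss='log'` target
# older scikit-learn releases (newer ones use `max_iter` and `loss='log_loss'`).
pipeline = builder.get_sgdc()
pipeline.fit(train_texts, train_labels)  # TF-IDF -> top 10% of features -> SGD
predictions = pipeline.predict(["een korte voorbeeldtekst"])
probabilities = pipeline.predict_proba(["een korte voorbeeldtekst"])  # available because loss='log'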
def wash(fileList):
# denyPos = ['CC', 'CD', 'DT', 'TO', '']
st = LancasterStemmer()
for f in tqdm(fileList):
fr = open('./washFile/' + f, 'r')
fw = open("./washFile_stem/" + f, 'w')
for line in fr.read().splitlines():
line = remove_punctuation(line).lower()
# wordpos = pos(remove_punctuation(line).lower())
# for turple in wordpos:
# if (turple[0] not in stopwords.words('english')):
# fw.write(turple[0] + ' ')
# fw.write(x + ' ' for x in line.split() if x not in stopwords.words('english'))
# stopw = stopwords.words('english')
            words = line.split()
            for x in words:
                try:
                    fw.write(st.stem(x) + ' ')
                except Exception:
                    print(x)
fr.close()
fw.close()
def count_entries(file_list):
    """Performs a count of the number of words in the corpus.
    Args:
        file_list (list): list of file names.
    Returns:
        list: a list of dicts containing the count per file name
    """
    result = []
    for obj in file_list:
        with open(CSV_PATH + obj + '.csv', "r") as entry:
            reader = csv.reader(entry, delimiter=",")
            col_count = len(next(reader))
            res = {"Filename": obj, "Count": col_count}
            result.append(res)
    return result
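# Usage sketch; CSV_PATH and the file names below are assumptions, not part of
# the original source. Count is the number of fields in the first row of each CSV.
counts = count_entries(["reviews_part1", "reviews_part2"])
for item in counts:
    print("%(Filename)s: %(Count)d" % item)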
def words_to_char_sequence(words_list, tk):
    """Convert a list of word lists to a padded character-index tensor.
    # Arguments
        words_list: list of sentences, each a list of words, shape (sentence_len, word_len)
        tk: fitted character-level tokenizer
    # Output shape
        (sentence_len, MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD)
    """
    c_seqs = np.zeros((len(words_list),
                       TrainConfig.MAX_SEQUENCE_LENGTH,
                       TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
    for w_i in range(len(words_list)):
        words = words_list[w_i]
        fixed_ws = np.zeros((TrainConfig.MAX_SEQUENCE_LENGTH,
                             TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
        ws = tk.texts_to_sequences(words)
        ws = pad_sequences(ws, maxlen=TrainConfig.MAX_CHAR_PER_WORD)
        max_word_len = min(TrainConfig.MAX_SEQUENCE_LENGTH, len(words))
        fixed_ws[:max_word_len, :] = ws[:max_word_len, :]
        c_seqs[w_i] = fixed_ws
    return c_seqs
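# Usage sketch with a character-level Keras tokenizer; TrainConfig is assumed to be
# the project's configuration object holding MAX_SEQUENCE_LENGTH and MAX_CHAR_PER_WORD.
from keras.preprocessing.text import Tokenizer

tk = Tokenizer(char_level=True)
tk.fit_on_texts(["what is your name", "how old are you"])
sentences = [["what", "is", "your", "name"], ["how", "old", "are", "you"]]
char_tensor = words_to_char_sequence(sentences, tk)
print(char_tensor.shape)  # (2, MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD)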
# featx.py, from the project Natural-Language-Processing-Python-and-NLTK by PacktPublishing
import collections

from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist


def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
for label, words in labelled_words:
for word in words:
word_fd[word] += 1
label_word_fd[label][word] += 1
n_xx = label_word_fd.N()
high_info_words = set()
for label in label_word_fd.conditions():
n_xi = label_word_fd[label].N()
word_scores = collections.defaultdict(int)
for word, n_ii in label_word_fd[label].items():
n_ix = word_fd[word]
score = score_fn(n_ii, (n_ix, n_xi), n_xx)
word_scores[word] = score
bestwords = [word for word, score in word_scores.items() if score >= min_score]
high_info_words |= set(bestwords)
return high_info_words
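# Usage sketch with toy labelled data; the labels and word lists are invented.
labelled = [
    ('pos', ['great', 'film', 'great', 'cast', 'loved', 'it']),
    ('neg', ['terrible', 'film', 'boring', 'cast', 'hated', 'it']),
]
informative = high_information_words(labelled, min_score=1)
print(sorted(informative))  # words whose chi-squared score clears the threshold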
# word2vec_skipgram.py, from the project TensorFlow-Machine-Learning-Cookbook by PacktPublishing
import collections


def build_dictionary(sentences, vocabulary_size):
# Turn sentences (list of strings) into lists of words
split_sentences = [s.split() for s in sentences]
words = [x for sublist in split_sentences for x in sublist]
# Initialize list of [word, word_count] for each word, starting with unknown
count = [['RARE', -1]]
# Now add most frequent words, limited to the N-most frequent (N=vocabulary size)
count.extend(collections.Counter(words).most_common(vocabulary_size-1))
# Now create the dictionary
word_dict = {}
    # For each word that we want in the dictionary, add it and assign it an
    # index equal to the current dictionary length
for word, word_count in count:
word_dict[word] = len(word_dict)
return(word_dict)
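# Usage sketch; the sentences are invented. The most frequent words get the lowest
# indices, and index 0 is reserved for the 'RARE' bucket.
sample_sentences = ["the cat sat on the mat", "the dog sat on the log"]
word_dict = build_dictionary(sample_sentences, vocabulary_size=5)
print(word_dict)  # e.g. {'RARE': 0, 'the': 1, 'sat': 2, 'on': 3, ...}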
# Turn text data into lists of integers from dictionary
def load_text_vec(alphabet, filename="", embedding_size=100):
    vectors = {}
    with open(filename) as f:
        i = 0
        for line in f:
            i += 1
            if i % 100000 == 0:
                print('epoch %d' % i)
            items = line.strip().split(' ')
            if len(items) == 2:
                # header line of a textual word2vec file: "<vocab_size> <dim>"
                vocab_size, embedding_size = items[0], items[1]
                print(vocab_size, embedding_size)
            else:
                word = items[0]
                if word in alphabet:
                    vectors[word] = items[1:]
    print('embedding_size', embedding_size)
    print('done')
    print('words found in word2vec embedding', len(vectors))
    return vectors
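# Usage sketch; the embedding file path and vocabulary are assumptions. The file is
# expected in textual word2vec format: an optional "vocab_size dim" header line,
# then one word followed by its vector components per line.
vocab = {'the', 'cat', 'dog'}
vectors = load_text_vec(vocab, filename='embeddings/word2vec.txt')
print(len(vectors), 'vectors loaded for the given vocabulary')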
def add_list_of_words_in_w2v_model(self, unknown_words):
huge_w2v_model_file = open(self.w2v_huge_model_path, "r")
current_w2v_model_file = open(self.w2v_model_path, "a")
line = huge_w2v_model_file.readline()
unknown_words_left = len(unknown_words)
while line and unknown_words_left:
word = line.split()[0]
if word in unknown_words:
current_w2v_model_file.write(line)
unknown_words = unknown_words - set([word])
unknown_words_left -= 1
line = huge_w2v_model_file.readline()
for word in list(unknown_words):
random_position = random(self.w2v_model.vector_size)*2-1
        current_w2v_model_file.write(" ".join([word] + [str(x) for x in random_position]) + "\n")
    print("warning: random positions introduced for new words ... in the future this should be solved")
current_w2v_model_file.close()
huge_w2v_model_file.close()
def extract_NPs(chunk):
"""
    Given chunk [(phrase, phrase_type)], e.g., [('the lady', 'NP'), ('with', 'PP'), ('the blue shirt', 'NP')],
    we extract the NPs with stop words and location words filtered out, and return a list of noun phrases.
"""
forbid_wds = stop_words + location_words
NPs = []
for phrase, ptype in chunk:
if ptype == 'NP':
filtered_wds = []
for wd in phrase.split():
if wd not in forbid_wds:
filtered_wds += [wd]
if len(' '.join(filtered_wds)) > 0:
NPs += [' '.join(filtered_wds)]
return NPs
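# Usage sketch; stop_words and location_words are module-level lists in the original
# project, so tiny stand-ins are assumed here for illustration only.
stop_words = ['the', 'a', 'with']
location_words = ['left', 'right']
chunk = [('the lady', 'NP'), ('with', 'PP'), ('the blue shirt', 'NP')]
print(extract_NPs(chunk))  # ['lady', 'blue shirt']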
def extract_NNs(chunk, pos):
"""
    Given chunk [(phrase, phrase_type)], e.g., [('the lady', 'NP'), ('with', 'PP'), ('the blue shirt', 'NP')],
    and pos [(word, pos)], e.g., [('man', 'NN')],
    we extract from the NPs with stop, location, color, and size words filtered out,
    and return a list of NN words only.
"""
forbid_wds = stop_words + location_words + color_words + size_words
NNs = []
for phrase, ptype in chunk:
if ptype == 'NP':
filtered_wds = []
for wd in phrase.split():
wd_pos = [p[1] for p in pos if p[0] == wd][0]
                if wd not in forbid_wds and wd_pos != 'JJ' and wd_pos != 'CD':  # also skip adjectives (JJ) and cardinal numbers (CD)
filtered_wds += [wd]
if len(' '.join(filtered_wds)) > 0:
NNs += [' '.join(filtered_wds)]
return NNs
def process_text(self, text):
flags = (UNICODE if sys.version < '3' and type(text) is unicode
else 0)
regexp = self.regexp if self.regexp is not None else r"\w[\w']+"
words = findall(regexp, text, flags)
    # remove stopwords
    stopwords = set(i.lower() for i in self.stopwords)
    words = [word for word in words if word.lower() not in stopwords]
# remove 's
words = [word[:-2] if word.lower().endswith("'s") else word
for word in words]
# remove numbers
words = [word for word in words if not word.isdigit()]
if self.collocations:
word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
else:
word_counts, _ = process_tokens(words, self.normalize_plurals)
return word_counts
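# Usage sketch: process_text is a method of wordcloud's WordCloud class, so an
# instance is needed; the sample text is invented.
from wordcloud import WordCloud

wc = WordCloud(collocations=False)
counts = wc.process_text("the cat's hat and the dog's 2 hats")
print(counts)  # a dict mapping each remaining word to its count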
import re

import nltk as nl
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer


def tokenize(text):
"""
Tokenizes sequences of text and stems the tokens.
:param text: String to tokenize
:return: List with stemmed tokens
"""
tokens = nl.WhitespaceTokenizer().tokenize(text)
tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
tokens = [word for word in tokens if word not in stopwords.words('english')]
tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))
stems = []
stemmer = SnowballStemmer("english")
for token in tokens:
token = stemmer.stem(token)
if token != "":
stems.append(token)
return stems
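# Usage sketch; requires the NLTK stopword corpus (nltk.download('stopwords')).
sample = "the striped bats were hanging on their feet"
print(tokenize(sample))  # stemmed, de-duplicated content words; order varies because of the set operations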
# KaggleWord2VecUtility.py, from the project word2vec_experiments_kaggle_popcorn by bigsnarfdude
def review_to_wordlist( review, remove_stopwords=False ):
# Function to convert a document to a sequence of words,
# optionally removing stop words. Returns a list of words.
#
# 1. Remove HTML
    review_text = BeautifulSoup(review, "html.parser").get_text()
#
# 2. Remove non-letters
review_text = re.sub("[^a-zA-Z]"," ", review_text)
#
# 3. Convert words to lower case and split them
words = review_text.lower().split()
#
# 4. Optionally remove stop words (false by default)
if remove_stopwords:
stops = set(stopwords.words("english"))
words = [w for w in words if not w in stops]
#
# 5. Return a list of words
return(words)
# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
# Function to split a review into parsed sentences. Returns a
# list of sentences, where each sentence is a list of words
#
# 1. Use the NLTK tokenizer to split the paragraph into sentences
raw_sentences = tokenizer.tokenize(review.strip())
#
# 2. Loop over each sentence
sentences = []
for raw_sentence in raw_sentences:
# If a sentence is empty, skip it
if len(raw_sentence) > 0:
# Otherwise, call review_to_wordlist to get a list of words
sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \
remove_stopwords ))
#
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
return sentences
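# Usage sketch; assumes the KaggleWord2VecUtility class these methods belong to,
# plus NLTK's pre-trained punkt sentence tokenizer (nltk.download('punkt')).
import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
review = "The movie was great. I would watch it again!"
sentences = KaggleWord2VecUtility.review_to_sentences(review, tokenizer, remove_stopwords=True)
print(sentences)  # a list of sentences, each reduced to its lowercased non-stopword tokens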
def extract_unigram_feats(document, unigrams, handle_negation=False):
"""
Populate a dictionary of unigram features, reflecting the presence/absence in
the document of each of the tokens in `unigrams`.
:param document: a list of words/tokens.
:param unigrams: a list of words/tokens whose presence/absence has to be
checked in `document`.
:param handle_negation: if `handle_negation == True` apply `mark_negation`
method to `document` before checking for unigram presence/absence.
:return: a dictionary of unigram features {unigram : boolean}.
>>> words = ['ice', 'police', 'riot']
>>> document = 'ice is melting due to global warming'.split()
>>> sorted(extract_unigram_feats(document, words).items())
[('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
"""
features = {}
if handle_negation:
document = mark_negation(document)
for word in unigrams:
features['contains({0})'.format(word)] = word in set(document)
return features
def __init__(self,
w=20,
k=10,
similarity_method=BLOCK_COMPARISON,
stopwords=None,
smoothing_method=DEFAULT_SMOOTHING,
smoothing_width=2,
smoothing_rounds=1,
cutoff_policy=HC,
demo_mode=False):
if stopwords is None:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
self.__dict__.update(locals())
del self.__dict__['self']
def from_words(cls, words, window_size=2):
"""Construct a BigramCollocationFinder for all bigrams in the given
sequence. When window_size > 2, count non-contiguous bigrams, in the
style of Church and Hanks's (1990) association ratio.
"""
wfd = FreqDist()
bfd = FreqDist()
if window_size < 2:
raise ValueError("Specify window_size at least 2")
for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
if w1 is None:
continue
wfd[w1] += 1
for w2 in window[1:]:
if w2 is not None:
bfd[(w1, w2)] += 1
return cls(wfd, bfd, window_size=window_size)
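# Usage sketch with NLTK's public collocation API; the sample text is invented.
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

tokens = "people who like machine learning like deep learning too".split()
finder = BigramCollocationFinder.from_words(tokens, window_size=2)
print(finder.nbest(BigramAssocMeasures.pmi, 3))  # top 3 bigrams by PMI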
def from_words(cls, words, window_size=3):
"""Construct a TrigramCollocationFinder for all trigrams in the given
sequence.
"""
if window_size < 3:
raise ValueError("Specify window_size at least 3")
wfd = FreqDist()
wildfd = FreqDist()
bfd = FreqDist()
tfd = FreqDist()
for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
if w1 is None:
continue
for w2, w3 in _itertools.combinations(window[1:], 2):
wfd[w1] += 1
if w2 is None:
continue
bfd[(w1, w2)] += 1
if w3 is None:
continue
wildfd[(w1, w3)] += 1
tfd[(w1, w2, w3)] += 1
return cls(wfd, bfd, wildfd, tfd)
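# Usage sketch, parallel to the bigram case above; the sample text is invented.
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

tokens = "new york city is larger than new york state".split()
tri_finder = TrigramCollocationFinder.from_words(tokens)
print(tri_finder.nbest(TrigramAssocMeasures.likelihood_ratio, 2))  # top 2 trigrams by log-likelihood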