Usage examples of Python's Normalizer() class
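
All of the snippets below use sklearn.preprocessing.Normalizer, which rescales each sample (row) to unit norm independently of all other samples. For context, a minimal standalone sketch of that behaviour (not taken from any of the projects below):

import numpy as np
from sklearn.preprocessing import Normalizer

X = np.array([[3.0, 4.0],
              [1.0, 0.0]])

# Each row is divided by its own L2 norm, so rows end up on the unit sphere.
X_l2 = Normalizer(norm='l2').fit_transform(X)
print(X_l2)                          # [[0.6 0.8] [1.  0. ]]
print(np.linalg.norm(X_l2, axis=1))  # [1. 1.]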

Source file: model_pipeline.py — project: texta — author: texta-tk
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
# PipelineBuilder and ModelNull are texta-specific classes, not part of scikit-learn.

def get_pipeline_builder():

    pipe_builder = PipelineBuilder()

    # Feature Extraction
    params = {'ngram_range': [(1, 1), (1, 2), (1, 3)]}
    pipe_builder.add_extractor('CountVectorizer', CountVectorizer, 'Count Vectorizer', params)

    params = {}
    pipe_builder.add_extractor('HashingVectorizer', HashingVectorizer, 'Hashing Vectorizer', params)

    params = {}
    pipe_builder.add_extractor('TfidfVectorizer', TfidfVectorizer, 'TfIdf Vectorizer', params)

    # Dimension Reduction
    params = {}
    pipe_builder.add_reductor('No_Reduction', ModelNull, 'None', params)

    params = {}
    pipe_builder.add_reductor('TruncatedSVD', TruncatedSVD, 'Truncated SVD', params)

    # Normalization
    params = {}
    pipe_builder.add_normalizer('No_Normalization', ModelNull, 'None', params)

    params = {}
    pipe_builder.add_normalizer('Normalizer', Normalizer, 'Normalizer', params)

    # Classification Models
    params = {}
    pipe_builder.add_classifier('MultinomialNB', MultinomialNB, 'Multinomial Naive Bayes', params)

    params = {}
    pipe_builder.add_classifier('BernoulliNB', BernoulliNB, 'Bernoulli Naive Bayes', params)

    params = {}
    pipe_builder.add_classifier('KNeighborsClassifier', KNeighborsClassifier, 'K-Neighbors', params)

    params = {}
    pipe_builder.add_classifier('RadiusNeighborsClassifier', RadiusNeighborsClassifier, 'Radius Neighbors', params)

    return pipe_builder
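
PipelineBuilder's wiring is texta-specific, but the registered components compose into a standard scikit-learn Pipeline. As a sketch (one hypothetical extractor/reductor/normalizer/classifier combination, not the project's actual assembly code):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier

# One concrete path through the builder above: TfIdf -> SVD -> Normalizer -> KNN.
clf = Pipeline([
    ('extract', TfidfVectorizer()),
    ('reduce', TruncatedSVD(n_components=100)),
    ('normalize', Normalizer()),
    ('classify', KNeighborsClassifier()),
])
# clf.fit(train_texts, train_labels); clf.predict(test_texts)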
Source file: ClasteringCalculator.py — project: TextStageProcessor — author: mhyhre
import os

from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
# writeStringToFile is a TextStageProcessor-specific helper.

def make_k_means_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'K_MEANS/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        km = KMeans(n_clusters=self.kmeans_cluster_count, init='k-means++', max_iter=100, n_init=10)
        km.fit(X)

        predict_result = km.predict(X)

        self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')

        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('Cluster ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)

        self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.signals.PrintInfo.emit('')
        self.signals.PrintInfo.emit('Cluster centers:')
        for index, cluster_center in enumerate(km.cluster_centers_):
            self.signals.PrintInfo.emit('  ' + str(index) + ':' + str(cluster_center))

        self.draw_clusters_plot(X, predict_result, short_filenames)
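
A note on why Normalizer follows TruncatedSVD here: SVD output is not length-normalized, and rescaling each row to unit norm makes KMeans' Euclidean distance behave like cosine similarity on the LSA space. A quick self-contained sketch (random counts stand in for the document-term matrix):

import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

rng = np.random.RandomState(0)
counts = rng.poisson(1.0, size=(10, 50))   # stand-in for a document-term matrix
lsa = make_pipeline(TruncatedSVD(2), Normalizer(copy=False))
X = lsa.fit_transform(counts)
print(np.linalg.norm(X, axis=1))           # all 1.0: rows are unit vectors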
Source file: ClasteringCalculator.py — project: TextStageProcessor — author: mhyhre
import os
import numpy as np

from sklearn.cluster import DBSCAN
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
# writeStringToFile is a TextStageProcessor-specific helper.

def make_dbscan_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'DBSCAN/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        db = DBSCAN(eps=self.dbscan_eps, min_samples=self.dbscan_min_pts)
        # fit_predict both fits the estimator and returns the cluster labels,
        # so the original's separate db.fit(X) call was redundant and is dropped.
        predict_result = db.fit_predict(X)

        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        labels = db.labels_

        self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')
        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('Cluster ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'

        clasters_output += ('Noise elements (-1):\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == -1:
                clasters_output += ('  ' + str(document) + '\n')
        clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)

        self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)
Source file: texthasher.py — project: pantip-libr — author: starcolon
from sklearn.decomposition import LatentDirichletAllocation, SparsePCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from termcolor import colored

def new(stop_words=[], decomposition='SVD', n_components=5):

  # Prepare the vectoriser engine
  idf = TfidfVectorizer(
    ngram_range=(1, 3),       # unigrams, bigrams, and trigrams
    stop_words=stop_words
  )

  # Prepare normaliser
  norm = Normalizer(norm='max')

  print(colored('Texthasher model created','yellow'))

  # Prepare dimensionality reduction
  if decomposition and n_components:
    if decomposition == 'LDA':  # yields a non-negative matrix
      reducer = LatentDirichletAllocation(  # TF-IDF --> topic-term matrix
        n_components=n_components,  # called n_topics before scikit-learn 0.19
        max_doc_update_iter=20,
        max_iter=8
      )
      return [idf, norm, reducer]

    elif decomposition == 'SVD':
      # TruncatedSVD works directly on sparse TF-IDF matrices; the original
      # author notes it suits small datasets but is very slow on large ones.
      reducer = TruncatedSVD(n_components, n_iter=8)
      return [idf, norm, reducer]

    elif decomposition == 'PCA':
      # When using IncrementalPCA, always keep:
      #   n_samples > n_components > batch_size
      # reducer = IncrementalPCA(n_components)

      # Converting sparse -> dense eagerly consumes a large amount of memory:
      # to_dense = SparseToDense()
      # return [idf, norm, to_dense, reducer]

      # Note: SparsePCA also expects a dense array, so the sparse TF-IDF
      # output would need densifying upstream for this branch to run.
      reducer = SparsePCA(n_components)
      return [idf, norm, reducer]

    return [idf, norm]
  else:
    return [idf, norm]
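
new() returns a list of transformer stages rather than an assembled pipeline. Assuming the list order is the intended transform order, the stages could be chained like this (a sketch, not code from pantip-libr):

from sklearn.pipeline import make_pipeline

stages = new(decomposition='SVD', n_components=5)  # [TfidfVectorizer, Normalizer, TruncatedSVD]
hasher = make_pipeline(*stages)
# vectors = hasher.fit_transform(list_of_documents)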
Source file: model.py — project: satoshi-mission — author: lilychai
import json

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, OneClassSVM
# DBIO is a satoshi-mission-specific database helper.

def __init__(self, num_class=2):
        """
        :type num_class: int
        :rtype: None
        """

        self.__ctrl__ = None
        self.__case__ = None

        with open('../../.dbname', 'r') as f:
            self.__DB_NAME__ = json.load(f)['dbname']
        self.__MG_DOCS_COLL__   = 'raw-docs'           # raw docs
        self.__MG_SENTS_COLL__  = 'bag-of-sents'       # raw sentences
        self.__MG_TOKENS_COLL__ = 'sample-tokens'      # clean tokens (words)
        self.__PG_STATS_TBL__   = 'stats'              # stylometric features
        self.__PG_RESULTS_TBL__ = 'results_' + \
                                  str(num_class) + \
                                  'class'              # cross val results
        self.__PG_PROBAS_TBL__  = 'probabilities'      # cross val probabilities


        self.__model__ = Pipeline([ \
                                 # ('scaler2', StandardScaler()),
                                 # ('scaler', MinMaxScaler()),
                                 # ('scaler3', Normalizer()),
                                  ('classifier', SVC(probability=True,
                                                     kernel='poly',
                                                     degree=2,
                                                     class_weight='balanced') \
                                                 if num_class-1 \
                                            else OneClassSVM(kernel='rbf',
                                                             nu=0.7,
                                                             gamma=1./250))
                                 ])

        print('Instantiated classifier %s.' %
              self.__model__.named_steps['classifier'].__class__.__name__)


        self.__io__ = DBIO(MG_DB_NAME=self.__DB_NAME__,
                           PG_DB_NAME=self.__DB_NAME__)

        self.__tagger__ = None     # initialise if re-creating samples
        self.__bootstrap__ = None  # initialise in fit
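
In this snippet Normalizer appears only as a commented-out scaling step. If that step were enabled, the pipeline would look roughly like this (a sketch under that assumption, not the author's shipped configuration):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC

model = Pipeline([
    ('scaler', Normalizer()),   # unit-norm each feature vector before the SVM
    ('classifier', SVC(probability=True, kernel='poly', degree=2,
                       class_weight='balanced')),
])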

