def visualize_results(self):
# Visualize logistic curve using seaborn
sns.set(style="darkgrid")
sns.regplot(x="pageviews_cumsum",
y="is_conversion",
data=self.df,
logistic=True,
n_boot=500,
y_jitter=.01,
scatter_kws={"s": 60})
sns.set(font_scale=1.3)
    # Note: newer seaborn versions no longer expose pyplot as sns.plt; call matplotlib.pyplot directly.
    plt.title('Logistic Regression Curve')
    plt.ylabel('Conversion probability')
    plt.xlabel('Cumulative sum of pageviews')
    plt.subplots_adjust(right=0.93, top=0.90, left=0.10, bottom=0.10)
    plt.show()
Python set() usage examples (source code)
business_case_solver.py (project: themarketingtechnologist, author: thomhopmans)
two_sigma_financial_modelling.py (project: PortfolioTimeSeriesAnalysis, author: MizioAnd)
def outlier_identification(self, model, x_train, y_train):
# Split the training data into an extra set of test
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
print('\nOutlier shapes')
print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
model.fit(x_train_split, y_train_split)
y_predicted = model.predict(x_test_split)
residuals = np.absolute(y_predicted - y_test_split)
rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
outliers_mask = residuals >= rmse_pred_vs_actual
outliers_mask = np.concatenate([np.zeros((np.shape(y_train_split)[0],), dtype=bool), outliers_mask])
not_an_outlier = outliers_mask == 0
# Resample the training set from split, since the set was randomly split
x_out = np.insert(x_train_split, np.shape(x_train_split)[0], x_test_split, axis=0)
y_out = np.insert(y_train_split, np.shape(y_train_split)[0], y_test_split, axis=0)
return x_out[not_an_outlier, ], y_out[not_an_outlier, ]
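A minimal usage sketch for the residual-based filter above, assuming an instance of the surrounding class (called analysis here) and any scikit-learn style regressor; RandomForestRegressor and the array names are illustrative assumptions, not part of the original project.
from sklearn.ensemble import RandomForestRegressor
# Placeholder instance and arrays; any estimator with fit/predict should work here.
x_clean, y_clean = analysis.outlier_identification(RandomForestRegressor(n_estimators=100), x_train, y_train)
print(np.shape(x_clean), np.shape(y_clean))  # rows whose held-out residual exceeds the RMSE are dropped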
two_sigma_financial_modelling.py (project: PortfolioTimeSeriesAnalysis, author: MizioAnd)
def predicted_vs_actual_y_xgb(self, xgb, best_nrounds, xgb_params, x_train_split, x_test_split, y_train_split,
y_test_split, title_name):
# Split the training data into an extra set of test
# x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
dtest_split = xgb.DMatrix(x_test_split)
print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
y_predicted = gbdt.predict(dtest_split)
plt.figure(figsize=(10, 5))
plt.scatter(y_test_split, y_predicted, s=20)
rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
plt.xlabel('Actual y')
plt.ylabel('Predicted y')
plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
plt.tight_layout()
def __init__(self, parent):
fig = Figure(figsize=(4, 4), dpi=100, tight_layout=True)
super(DefaultGraph, self).__init__(fig)
self.setParent(parent)
sns.set(style="dark")
for index, s in zip(range(9), np.linspace(0, 3, 10)):
axes = fig.add_subplot(3, 3, index + 1)
x, y = np.random.randn(2, 50)
cmap = sns.cubehelix_palette(start=s, light=1, as_cmap=True)
sns.kdeplot(x, y, cmap=cmap, shade=True, cut=5, ax=axes)
axes.set_xlim(-3, 3)
axes.set_ylim(-3, 3)
axes.set_xticks([])
axes.set_yticks([])
fig.suptitle("Activity Browser", y=0.5, fontsize=30, backgroundcolor=(1, 1, 1, 0.5))
self.setSizePolicy(QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Expanding)
self.updateGeometry()
def plot_correlation_fig(data):
"""
Creates a correlation heat map for all columns in user data.
Parameters
----------
data: Pandas DataFrame
User data file as a Pandas DataFrame
Returns
-------
Matplotlib Figure object.
"""
sns.set(context='talk', style='white')
fig = plt.figure()
sns.heatmap(data.corr(), vmin=-1, vmax=1)
plt.tight_layout()
return fig
def plot_count_fig(tasks):
"""
Create count plot, as a 2-row x 3-col bar plot of data points for each k in each covar.
Parameters
----------
tasks: list(dict)
Returns
-------
Matplotlib Figure object.
"""
sns.set(context='talk', style='whitegrid')
df = pd.DataFrame(filter_dict_list_by_keys(tasks, ['k', 'covar_type', 'covar_tied']))
df = df.loc[:, ['k', 'covar_type', 'covar_tied', 'bic', 'aic']]
df['covar_type'] = [x.capitalize() for x in df['covar_type']]
df['covar_tied'] = [['Untied', 'Tied'][x] for x in df['covar_tied']]
f = sns.factorplot(x='k', kind='count', col='covar_type', row='covar_tied', data=df,
row_order=['Tied', 'Untied'], col_order=['Full', 'Diag', 'Spher'], legend=True, legend_out=True,
palette='Blues_d')
f.set_titles("{col_name}-{row_name}")
f.set_xlabels("Num. of Clusters (K)")
return f.fig
def generateRawPlot(test):
# set figure size
plt.figure(figsize=(15, 6))
handles = []
# draw plot
for raw in test:
label = raw.pop(0)
xAxis = range(len(raw))
yAxis = [float(i) for i in raw]
handle, = plt.plot(xAxis, yAxis, label=label)
handles.append(handle)
# put axis labels
plt.xlabel("operations")
plt.ylabel("time (s)")
plt.legend(handles=handles)
def generateMassPlot(test):
# set figure size
plt.figure(figsize=(15, 6))
handles = []
# draw plot
for raw in test:
label = raw.pop(0)
yAxis = [i / (len(raw)) for i in range(len(raw) + 1)]
values = sorted([float(i) for i in raw])
xAxis = [0] + values
handle, = plt.plot(xAxis, yAxis, label=label)
handles.append(handle)
# put axis labels
plt.xlabel("time (s)")
plt.ylabel("probability of completion")
plt.legend(handles=handles)
def cor_df(data, cols=None, xticklabels=False, yticklabels=False, close=True):
    '''
    Purpose: compute and plot a correlation-matrix heatmap for the selected columns.
    Inputs:
        data: input data, as a DataFrame.
        cols: columns to include, as a list; defaults to all columns of data.
        close: whether to close the figure after it is drawn.
    Outputs:
        corrmat: the correlation matrix, as a DataFrame.
        fig: the heatmap figure object.
    '''
if cols is None:
cols=list(data.columns)
corrmat = data[cols].corr()
fig = plt.figure()
ax = fig.add_subplot(111)
sns.set(context='paper', font='monospace')
sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax, xticklabels=xticklabels, yticklabels=yticklabels)
ax.set_title('Heatmap of Correlation Matrix')
if close:
plt.close('all')
return corrmat, fig
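A short, hedged example of calling cor_df; the DataFrame and column names below are made up for illustration.
import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.randn(100, 4), columns=['a', 'b', 'c', 'd'])
corrmat, fig = cor_df(df, cols=['a', 'b', 'c'], xticklabels=True, yticklabels=True)
print(corrmat.round(2))  # correlation matrix of the selected columns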
#Distribution
def benchmark_spark(ratings, factors, iterations=5):
conf = (SparkConf()
.setAppName("implicit_benchmark")
.setMaster('local[*]')
.set('spark.driver.memory', '16G')
)
context = SparkContext(conf=conf)
spark = SparkSession(context)
times = {}
try:
ratings = convert_sparse_to_dataframe(spark, context, ratings)
for rank in factors:
als = ALS(rank=rank, maxIter=iterations,
alpha=1, implicitPrefs=True,
userCol="row", itemCol="col", ratingCol="data")
start = time.time()
als.fit(ratings)
elapsed = time.time() - start
times[rank] = elapsed / iterations
print("spark. factors=%i took %.3f" % (rank, elapsed/iterations))
finally:
spark.stop()
return times
def generate_speed_graph(data, filename="als_speed.png", keys=['gpu', 'cg2', 'cg3', 'cholesky'],
labels=None, colours=None):
labels = labels or {}
colours = colours or {}
seaborn.set()
fig, ax = plt.subplots()
factors = data['factors']
for key in keys:
ax.plot(factors, data[key],
color=colours.get(key, COLOURS.get(key)),
marker='o', markersize=6)
ax.text(factors[-1] + 5, data[key][-1], labels.get(key, LABELS[key]), fontsize=10)
ax.set_ylabel("Seconds per Iteration")
ax.set_xlabel("Factors")
plt.savefig(filename, bbox_inches='tight', dpi=300)
def find_n_most_similar_articles(self):
"""
Find the n most similar articles with the highest similarity score for each article in the DataFrame.
:return:
"""
# Iterate over each article in DataFrame
for index, row in self.df_article_vectors.iterrows():
# Get the similarity scores of the current article compared to all other articles
similarity_scores = self.similarity_score_dict[index]
# Find the highest similarity scores in the similarity_score_dict until we have found the n most similar.
for i in range(0, self.n_most_similar):
# Find most similar article, i.e. with highest cosine similarity. Note: if Euclidean distance, then min!
most_similar_article_index = max(similarity_scores, key=similarity_scores.get)
most_similar_article_score = similarity_scores[most_similar_article_index]
del similarity_scores[most_similar_article_index]
# Find corresponding title and set it as most similar article i in DataFrame
title = self.df_article_vectors.loc[most_similar_article_index]['title'].encode('utf-8')
title_plus_score = "{} ({:.2f})".format(title, most_similar_article_score)
            # DataFrame.set_value was removed in newer pandas; .at is the supported way to set a single cell
            self.df_article_vectors.at[index, 'most_similar_' + str(i + 1)] = title_plus_score
def tokenize(text):
"""
Tokenizes sequences of text and stems the tokens.
:param text: String to tokenize
:return: List with stemmed tokens
"""
tokens = nl.WhitespaceTokenizer().tokenize(text)
tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
tokens = [word for word in tokens if word not in stopwords.words('english')]
tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))
stems = []
stemmer = SnowballStemmer("english")
for token in tokens:
token = stemmer.stem(token)
if token != "":
stems.append(token)
return stems
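A hedged usage sketch for tokenize; because the two set() calls de-duplicate tokens, the order of the returned stems is not deterministic.
print(tokenize("cats are running faster than dogs"))
# e.g. ['cat', 'run', 'faster', 'dog']; order may vary due to the set() de-duplication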
def outlier_prediction(x_train, y_train):
# Use built-in isolation forest or use predicted vs. actual
# Compute squared residuals of every point
# Make a threshold criteria for inclusion
# The prediction returns 1 if sample point is inlier. If outlier prediction returns -1
rng = np.random.RandomState(42)
clf_all_features = IsolationForest(max_samples=100, random_state=rng)
clf_all_features.fit(x_train)
# Predict if a particular sample is an outlier using all features for higher dimensional data set.
y_pred_train = clf_all_features.predict(x_train)
# Exclude suggested outlier samples for improvement of prediction power/score
    # Note: np.array(map(...)) yields a 0-d object array on Python 3; use a vectorized comparison instead
    outlier_map_out_train = y_pred_train == 1
x_train_modified = x_train[outlier_map_out_train, ]
y_train_modified = y_train[outlier_map_out_train, ]
return x_train_modified, y_train_modified
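A minimal sketch of calling outlier_prediction on random data; the shapes and variable names are assumptions for illustration.
import numpy as np
x = np.random.randn(500, 10)
y = np.random.randn(500)
x_in, y_in = outlier_prediction(x, y)
print(x_in.shape, y_in.shape)  # usually slightly fewer than 500 rows remain after removing suspected outliers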
def drop_variable(self, df):
# if HousePrices._is_one_hot_encoder:
# Drop all categorical feature helping columns ('Num')
# Todo: is it defined when importing data set? _feature_names_num
# for feature_name in HousePrices._feature_names_num:
# df = df.drop([feature_name], axis=1)
# is_with_feature_agglomeration = 0
# if is_with_feature_agglomeration:
# print(df.shape)
# df = HousePrices.feature_agglomeration(df)
# print(df.shape)
# df = df.drop(['Fireplaces'], axis=1)
df = df.drop(['Id'], axis=1)
if not any(tuple(df.columns == 'SalePrice')):
        # All feature variable names occurring in the test data are assigned to the public variable df_test_all_feature_var_names.
self.df_test_all_feature_var_names = df.columns
return df
def predicted_vs_actual_sale_price(self, x_train, y_train, title_name):
# Split the training data into an extra set of test
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
0.3, 0.6, 1],
max_iter=50000, cv=10)
# lasso = RidgeCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
# 0.3, 0.6, 1], cv=10)
lasso.fit(x_train_split, y_train_split)
y_predicted = lasso.predict(X=x_test_split)
plt.figure(figsize=(10, 5))
plt.scatter(y_test_split, y_predicted, s=20)
rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
plt.xlabel('Actual Sale Price')
plt.ylabel('Predicted Sale Price')
plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
plt.tight_layout()
def predicted_vs_actual_sale_price_xgb(self, xgb_params, x_train, y_train, seed, title_name):
# Split the training data into an extra set of test
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
dtest_split = xgb.DMatrix(x_test_split)
res = xgb.cv(xgb_params, dtrain_split, num_boost_round=1000, nfold=4, seed=seed, stratified=False,
early_stopping_rounds=25, verbose_eval=10, show_stdv=True)
best_nrounds = res.shape[0] - 1
print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
y_predicted = gbdt.predict(dtest_split)
plt.figure(figsize=(10, 5))
plt.scatter(y_test_split, y_predicted, s=20)
rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
plt.xlabel('Actual Sale Price')
plt.ylabel('Predicted Sale Price')
plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
plt.tight_layout()
def ranges_to_list(x, start=0, stop=None):
s = set()
for xi in x:
xi = str(xi)
if xi.find('-') >= 0:
t = xi.split('-')
if len(t) != 2:
raise ValueError('Invalid range!')
if len(t[0]) == 0:
t[0] = start
if len(t[1]) == 0:
t[1] = stop
s |= set(range(int(t[0]), int(t[1]) + 1))
else:
s.add(int(xi))
s = sorted(list(s))
return s
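A quick example of the range expansion, assuming stop caps open-ended ranges:
print(ranges_to_list(['1-3', '7', '10-'], start=0, stop=12))
# [1, 2, 3, 7, 10, 11, 12]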
def scoped_mpl_import():
import matplotlib
matplotlib.rcParams['backend'] = MPL_BACKEND
import matplotlib.pyplot as plt
plt.rcParams['toolbar'] = 'None' # mute matplotlib toolbar
import seaborn as sns
sns.set(style="whitegrid", color_codes=True, font_scale=1.0,
rc={'lines.linewidth': 1.0,
'backend': matplotlib.rcParams['backend']})
palette = sns.color_palette("Blues_d")
palette.reverse()
sns.set_palette(palette)
return (matplotlib, plt, sns)
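A hedged usage sketch; MPL_BACKEND is assumed to be defined at module level (for example 'Agg' for headless rendering).
MPL_BACKEND = 'Agg'  # assumption: the real module defines this constant elsewhere
matplotlib, plt, sns = scoped_mpl_import()
fig, ax = plt.subplots()
ax.plot([0, 1, 2], [0, 1, 4])
fig.savefig('example.png')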
def cross_section_cndl(data, factor_name):
    '''Plot the cross-sectional distribution of a single factor as a box plot grouped by date.
    Parameters
    ------------------------------
    data: DataFrame(index:[Date,IDs], factor1, factor2, ...)
    factor_name: str
    '''
data = data.reset_index()
sns.set(style='ticks')
ax = sns.boxplot(x='Date', y=factor_name, data=data, palette='PRGn')
sns.despine(offset=10, trim=True)
return ax
def factor_plot(dataFrame, factors, prediction, color="Set3"):
# First, plot the total for each factor. Then, plot the total for each
# factor for the prediction variable (so in a conversion example, how
# many people converted, revenue per country, etc.)
# These refer to the rows and columns of the axis numpy array; not the
# data itself.
row = 0
column = 0
sns.set(style="whitegrid")
# TODO: Set the width based on the max number of unique
# values for the factors.
plots = plt.subplots(len(factors), 2, figsize=(8,12))
# It should
for factor in factors:
sns.countplot(x=factor, palette="Set3", data=dataFrame,
ax=plots[1][row][column])
# Then print the total for each prediction
sns.barplot(x=factor, y=prediction, data=dataFrame,
ax=plots[1][row][column+1])
row += 1
plt.tight_layout() # Need this or else plots will crash into each other
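A hedged example call for factor_plot; the DataFrame and column names are illustrative assumptions, with a binary prediction column.
import pandas as pd
df = pd.DataFrame({'country': ['US', 'US', 'NL', 'DE', 'NL', 'US'],
                   'device': ['mobile', 'desktop', 'mobile', 'mobile', 'desktop', 'desktop'],
                   'converted': [1, 0, 0, 1, 1, 0]})
factor_plot(df, factors=['country', 'device'], prediction='converted')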
def swarm(data,x,y,xscale='linear',yscale='linear'):
# set default pretty settings from Seaborn
sns.set(style="white", palette="muted")
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 0.2})
    # create the plot
g = sns.swarmplot(x=x, y=y, data=data, palette='RdYlGn')
plt.tick_params(axis='both', which='major', pad=10)
g.set(xscale=xscale)
g.set(yscale=yscale)
# Setting plot limits
start = data[y].min().min()
plt.ylim(start,);
sns.despine()
def correlation(data,title=''):
corr = data.corr(method='spearman')
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
sns.set(style="white")
sns.set_context("notebook", font_scale=2, rc={"lines.linewidth": 0.3})
rcParams['figure.figsize'] = 25, 12
rcParams['font.family'] = 'Verdana'
rcParams['figure.dpi'] = 300
g = sns.heatmap(corr, mask=mask, linewidths=1, cmap="RdYlGn", annot=False)
g.set_xticklabels(data,rotation=25,ha="right");
plt.tick_params(axis='both', which='major', pad=15);
def plot(self, ax=None, holdon=False):
sns.set(style="white")
data = self.X
if ax is None:
_, ax = plt.subplots()
for i, index in enumerate(self.clusters):
point = np.array(data[index]).T
ax.scatter(*point, c=sns.color_palette("hls", self.K + 1)[i])
for point in self.centroids:
ax.scatter(*point, marker='x', linewidths=10)
if not holdon:
plt.show()
def plot_mds(subjects, experiments, axes):
for subj, exp, ax in zip(subjects, experiments, axes):
res_fname = "correlation_analysis/{}_{}_ifs.pkz".format(subj, exp)
res = moss.load_pkl(res_fname)
sorter = np.argsort(np.abs(res.prefs))
x_, y_ = res.mds_coords.T.dot(res.prefs)
t = np.arctan2(y_, x_)
rot = [[np.cos(t), np.sin(t)], [-np.sin(t), np.cos(t)]]
x, y = np.dot(rot, res.mds_coords[sorter].T)
cmap = get_colormap(exp)
ax.scatter(x, y, c=res.prefs[sorter],
cmap=cmap, vmin=-1.75, vmax=1.75,
s=8, linewidth=0)
ax.set(xlim=(-.9, .9), ylim=(-.9, .9), aspect="equal")
ax.set_axis_off()
def visualize_housing_data(df):
sns.set(style='whitegrid', context='notebook')
cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']
sns.pairplot(df[cols], size=2.5)
plt.show()
correlation_matrix = np.corrcoef(df[cols].values.T)
sns.set(font_scale=1.5)
heatmap = sns.heatmap(
correlation_matrix,
cbar=True,
annot=True,
square=True,
fmt='.2f',
annot_kws={'size': 15},
yticklabels=cols,
xticklabels=cols,
)
plt.show()
business_case_solver_without_classes.py (project: themarketingtechnologist, author: thomhopmans)
def visualize_results(df):
# Visualize logistic curve using seaborn
sns.set(style="darkgrid")
sns.regplot(x="pageviews_cumsum",
y="is_conversion",
data=df,
logistic=True,
n_boot=500,
y_jitter=.01,
scatter_kws={"s": 60})
sns.set(font_scale=1.3)
    # Note: newer seaborn versions no longer expose pyplot as sns.plt; call matplotlib.pyplot directly.
    plt.title('Logistic Regression Curve')
    plt.ylabel('Conversion probability')
    plt.xlabel('Cumulative sum of pageviews')
    plt.subplots_adjust(right=0.93, top=0.90, left=0.10, bottom=0.10)
    plt.show()
# Run the final program
def tokenize(text):
"""
Tokenizes sequences of text and stems the tokens.
:param text: String to tokenize
:return: List with stemmed tokens
"""
tokens = nltk.WhitespaceTokenizer().tokenize(text)
tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
tokens = [word for word in tokens if word not in stopwords.words('english')]
tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))
stems = []
stemmer = SnowballStemmer("english")
for token in tokens:
token = stemmer.stem(token)
if token != "":
stems.append(token)
return stems
def find_n_most_similar_articles(self):
"""
Find the n most similar articles with the highest similarity score for each TMT article in the DataFrame.
:return:
"""
# Iterate over each article in DataFrame
for index, row in self.df_article_vectors.iterrows():
# Get the similarity scores of the current article compared to all other articles
similarity_scores = self.similarity_score_dict[index]
# Find the highest similarity scores in the similarity_score_dict until we have found the n most similar.
for i in range(0, self.n_most_similar):
# Find most similar article, i.e. with highest cosine similarity. Note: if Euclidean distance, then min!
most_similar_article_index = max(similarity_scores, key=similarity_scores.get)
most_similar_article_score = similarity_scores[most_similar_article_index]
del similarity_scores[most_similar_article_index]
# Find corresponding title and set it as most similar article i in DataFrame
title = self.df_article_vectors.loc[most_similar_article_index]['title'].encode('utf-8')
title_plus_score = "{} ({:.2f})".format(title, most_similar_article_score)
            # DataFrame.set_value was removed in newer pandas; .at is the supported way to set a single cell
            self.df_article_vectors.at[index, 'most_similar_' + str(i + 1)] = title_plus_score
def image(path, costs):
ys = ['0', '1', '2', '3', '4', '5', '6', '7+', 'X']
xs = [costs.get(k, 0) for k in ys]
sns.set_style('white')
sns.set(font='Concourse C3', font_scale=3)
g = sns.barplot(ys, xs, palette=['grey'] * len(ys))
g.axes.yaxis.set_ticklabels([])
rects = g.patches
sns.set(font='Concourse C3', font_scale=2)
for rect, label in zip(rects, xs):
if label == 0:
continue
height = rect.get_height()
g.text(rect.get_x() + rect.get_width()/2, height + 0.5, label, ha='center', va='bottom')
g.margins(y=0, x=0)
sns.despine(left=True, bottom=True)
g.get_figure().savefig(path, transparent=True, pad_inches=0, bbox_inches='tight')
plt.clf() # Clear all data from matplotlib so it does not persist across requests.
return path
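A hedged usage sketch for image; the cost histogram below is made-up example data (note the function expects the 'Concourse C3' font to be installed).
costs = {'0': 1, '1': 4, '2': 7, '3': 5, '4': 3, '5': 2, '7+': 1}
image('curve.png', costs)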