def __init__(self, *args, **kwargs):
    """Set up the scorer and build its inverse-document-frequency table.

    All positional and keyword arguments are forwarded unchanged to the
    parent initialiser. The keyword arguments must additionally contain
    ``output_data_directory`` and ``term_frequency['f_db']``; together
    they name a file that has to exist before this object can be built.

    The document counts themselves are read from the CSV named by the
    ``score`` section of ``simple_config`` (keys ``output_data_directory``
    and ``term_document_frequency['f_db']``). The CSV must provide
    ``word`` and ``count`` columns, and one row whose word is
    ``__pipeline_document_counter``; that row's count is stored as
    ``self.corpus_N``.

    After construction ``self.IDF`` maps every remaining word to
    ``log(corpus_N / (count + 1))``.

    Raises:
        ValueError: if the term-frequency file named in ``kwargs`` does
            not exist.
        KeyError: if a required key is missing from ``kwargs`` or the
            config, or the CSV has no ``__pipeline_document_counter`` row.
    """
    super(score_simple, self).__init__(*args, **kwargs)

    # Guard: refuse to continue until the term-frequency output is on disk.
    # NOTE(review): this path comes from kwargs, while the file actually
    # read below comes from simple_config — confirm that is intentional.
    tf_path = os.path.join(
        kwargs['output_data_directory'], kwargs['term_frequency']['f_db']
    )
    if not os.path.exists(tf_path):
        raise ValueError(
            "{} not computed yet, needed for TF methods!".format(tf_path)
        )

    cfg = simple_config.load()["score"]
    tdf_path = os.path.join(
        cfg["output_data_directory"],
        cfg["term_document_frequency"]["f_db"],
    )

    table = pd.read_csv(tdf_path)
    doc_counts = dict(zip(table["word"].values, table["count"].values))

    # The sentinel row carries the corpus size; remove it so it is not
    # treated as an ordinary word.
    self.corpus_N = doc_counts.pop("__pipeline_document_counter")
    total_docs = float(self.corpus_N)

    # The +1 in the denominator keeps the ratio finite for a zero count.
    self.IDF = {
        word: np.log(total_docs / (count + 1))
        for word, count in doc_counts.items()
    }
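# [web-page residue, not part of the code] Comment list
# [web-page residue, not part of the code] Article table of contents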