def _train_fp_growth_model(cls, data_store, eco_to_package_topic_dict, min_support_count,
                           additional_path, fp_num_partition):
    """Train an FP-Growth model on topic lists derived from stored manifests.

    Every file listed by ``data_store`` under ``additional_path`` joined with
    ``gnosis_constants.MANIFEST_FILEPATH`` is read as JSON. Each entry of a
    manifest supplies an ecosystem and a list of package lists; every package
    list is lower-cased and converted into a topic list through
    ``cls.get_topic_list_for_package_list``. Each resulting topic list is one
    transaction for FP-Growth.

    :param data_store: object providing ``list_files(prefix=...)`` and
        ``read_json_file(name)``.
    :param eco_to_package_topic_dict: forwarded unchanged to
        ``cls.get_topic_list_for_package_list``.
    :param min_support_count: absolute support count; it is divided by the
        number of transactions to obtain the relative ``minSupport``.
    :param additional_path: leading path component of the manifest prefix.
    :param fp_num_partition: forwarded as ``numPartitions`` to
        ``FPGrowth.train``.
    :return: the model object returned by ``FPGrowth.train``.
    :raises ValueError: if the manifests yield no transactions at all.
    """
    manifest_prefix = os.path.join(additional_path, gnosis_constants.MANIFEST_FILEPATH)

    list_of_topic_list = []
    for manifest_file in data_store.list_files(prefix=manifest_prefix):
        for eco_to_package_list_json in data_store.read_json_file(manifest_file):
            ecosystem = eco_to_package_list_json.get(gnosis_constants.MANIFEST_ECOSYSTEM)
            list_of_package_list = eco_to_package_list_json.get(
                gnosis_constants.MANIFEST_PACKAGE_LIST)
            for package_list in list_of_package_list:
                package_list_lowercase = [x.lower() for x in package_list]
                topic_list = cls.get_topic_list_for_package_list(package_list_lowercase,
                                                                 ecosystem,
                                                                 eco_to_package_topic_dict)
                list_of_topic_list.append(topic_list)

    # Fail early with a clear message: the relative support below would
    # otherwise divide by zero, and only after Spark had already been started.
    if not list_of_topic_list:
        raise ValueError(
            "No transactions found under manifest prefix {!r}; "
            "cannot train an FP-Growth model".format(manifest_prefix))

    # Only one SparkContext may be active per process, so reuse a running one
    # instead of unconditionally constructing a new context. The context is
    # deliberately not stopped here because the returned model is still used
    # by the caller.
    sc = SparkContext.getOrCreate()
    transactions = sc.parallelize(list_of_topic_list)
    transactions.cache()

    # The number of transactions is already known locally, so no Spark job is
    # needed to count them.
    min_support = min_support_count / float(len(list_of_topic_list))
    return FPGrowth.train(transactions, minSupport=min_support,
                          numPartitions=fp_num_partition)
gnosis_ref_arch.py 文件源码
python
阅读 17
收藏 0
点赞 0
评论 0
评论列表
文章目录