def find_missing_products(
        train_path='/Users/srinath/playground/data-science/BimboInventoryDemand/train.csv',
        test_path='/Users/srinath/playground/data-science/BimboInventoryDemand/test.csv',
        output_path='missing_ids.csv'):
    """Report product IDs that occur in the test set but not in the training set.

    Reads the ``Producto_ID`` column of both CSV files, writes the IDs found
    only in the test file to ``output_path`` (single ``Producto_ID`` column,
    no index), and prints three summary lines: the number of missing IDs, the
    number and percentage of test rows that use one of them, and the total
    number of test rows.

    Args:
        train_path: CSV file with a ``Producto_ID`` column (training data).
        test_path: CSV file with a ``Producto_ID`` column (test data).
        output_path: Destination CSV for the missing product IDs.

    Returns:
        None. Results are written to ``output_path`` and printed.
    """
    # Only Producto_ID is used below, so skip parsing every other column.
    train_ids = pd.read_csv(train_path, usecols=['Producto_ID'])['Producto_ID'].unique()
    test_product_ids = pd.read_csv(test_path, usecols=['Producto_ID'])['Producto_ID']
    test_ids = test_product_ids.unique()

    missing_ids = pd.Index(test_ids).difference(pd.Index(train_ids))
    # Single-argument print(...) produces the same text whether print is a
    # statement (Python 2) or a function (Python 3).
    print("missing ID count  %d" % len(missing_ids))

    missing_ids_df = pd.DataFrame({"Producto_ID": missing_ids})
    missing_ids_df.to_csv(output_path, index=False)

    # Counting membership gives the same row count as inner-merging the test
    # set with the (unique) missing IDs, without building the merged frame.
    total_entries = len(test_product_ids)
    missing_entries = int(test_product_ids.isin(missing_ids).sum())

    # 100.0 forces true division (plain "/" truncates ints on Python 2);
    # an empty test file reports 0% instead of raising ZeroDivisionError.
    if total_entries:
        percentage = 100.0 * missing_entries / total_entries
    else:
        percentage = 0.0

    print("Mising entries= %d percentage= %.2f" % (missing_entries, percentage))
    print("full entries count %d" % total_entries)
评论列表
文章目录