def draw_links(self, n=1, log_sampling=False):
    """ Draw multiple random links. """
    # Uses module-level imports: numpy as np, numpy.random as npr, random
    urls = []
    domain_array = np.array([dmn for dmn in self.domain_links])
    domain_count = np.array([len(self.domain_links[domain_array[k]]) for k in range(domain_array.shape[0])])
    p = np.array([float(c) for c in domain_count])  # per-domain link counts as sampling weights
    count_total = p.sum()
    if log_sampling:  # log-sampling [log(x+1)] to bias toward lower-count domains
        p = np.fromiter((np.log1p(x) for x in p), dtype=p.dtype)
    if count_total > 0:
        p = p / p.sum()
        cnts = npr.multinomial(n, pvals=p)  # how many links to draw from each domain
        if n > 1:
            for k in range(cnts.shape[0]):
                domain = domain_array[k]
                cnt = min(cnts[k], domain_count[k])  # cap at the links actually available
                for url in random.sample(self.domain_links[domain], cnt):
                    urls.append(url)
        else:
            # a single draw lands in exactly one domain; sample one link from it
            k = int(np.nonzero(cnts)[0])
            domain = domain_array[k]
            url = random.sample(self.domain_links[domain], 1)[0]
            urls.append(url)
    return urls
Source file: isp_data_pollution.py (Python)
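For context, here is a minimal, self-contained sketch of how `draw_links` might be exercised. The `LinkDrawer` wrapper, the example `domain_links` mapping, and the URLs below are illustrative assumptions, not part of isp_data_pollution.py; the only thing assumed about the real class is that `self.domain_links` maps each domain to a collection of its URLs.

```python
import random
import numpy as np
import numpy.random as npr

class LinkDrawer:
    """Hypothetical container whose only job is to hold a domain -> URLs mapping."""
    def __init__(self, domain_links):
        self.domain_links = domain_links

# Bind the module-level function above as a method of the hypothetical class.
LinkDrawer.draw_links = draw_links

drawer = LinkDrawer({
    "example.com": ["https://example.com/a", "https://example.com/b", "https://example.com/c"],
    "example.org": ["https://example.org/x"],
})

# Draw three links, with domains weighted by log(link count + 1).
print(drawer.draw_links(n=3, log_sampling=True))
# Draw a single link, with domains weighted by raw link count.
print(drawer.draw_links())
```

Passing `log_sampling=True` flattens the weighting so domains with only a few links are still drawn reasonably often, which is the point of the `log1p` branch; the wrapper class exists only to give the method a `self` to hang the mapping on.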