import json
from io import BytesIO

import pycurl

# const and the helpers get_record_start_cursor, set_record_start_cursor,
# decode_url and save_to_file are defined elsewhere in the project.


def get_dlinks(search_target, get_dlinks_only=True):
    """
    Fetch image download links for a keyword from the image-search API.
    :param search_target: the search keyword
    :param get_dlinks_only: if True, only collect the links;
        if False, also save them to a text file
    :return: the list of decoded download links
    """
    refer_url = const.REFER_URL % search_target
    curl = pycurl.Curl()
    curl.setopt(pycurl.USERAGENT, const.USER_AGENT)
    curl.setopt(pycurl.REFERER, refer_url)
    result = []
    # ll is the result offset; the API pages results 30 at a time
    ll = 0
    record_start_cursor = get_record_start_cursor(const.CURSOR_FILE)
    if record_start_cursor:
        ll = int(record_start_cursor)
    print('start')
    # keep fetching pages until the API returns no more results
    while True:
        print('crawling pictures of page %d' % (ll // 30 + 1))
        # in-memory buffer to collect the raw response body
        buffers = BytesIO()
        target_url = const.API_URL % (search_target, search_target, ll)
        curl.setopt(pycurl.URL, target_url)
        curl.setopt(pycurl.WRITEDATA, buffers)
        curl.perform()
        body = buffers.getvalue().decode('utf-8', errors='ignore')
        # the response body is JSON; parse it instead of eval()-ing it
        data = json.loads(body)
        if 'data' in data:
            has_data = False
            for a_data in data['data']:
                obj_url = a_data.get('objURL')
                if obj_url:
                    has_data = True
                    result.append(obj_url)
            if not has_data:
                print('no more pic')
                break
            ll += 30
        else:
            print('no more pic')
            break
    print('done')
    curl.close()
    # persist the cursor so the next run resumes where this one stopped
    if ll:
        set_record_start_cursor(str(ll), const.CURSOR_FILE)
    # decode the raw objURL values into usable links
    for index, data in enumerate(result):
        result[index] = decode_url(data)
    if not get_dlinks_only:
        save_to_file(result, search_target + '.txt', const.BASE_FOLDER)
    return result
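
# get_record_start_cursor/set_record_start_cursor are referenced above but not
# shown in this snippet. A minimal sketch of what they could look like,
# assuming the cursor file simply stores the last offset as plain text:
def get_record_start_cursor(cursor_file):
    """Read the saved result offset, or return None if there is no record."""
    try:
        with open(cursor_file) as f:
            return f.read().strip()
    except IOError:
        return None


def set_record_start_cursor(cursor, cursor_file):
    """Persist the result offset so the next run can resume from it."""
    with open(cursor_file, 'w') as f:
        f.write(cursor)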
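
# Usage sketch (assumes const.REFER_URL, const.API_URL, const.USER_AGENT,
# const.CURSOR_FILE and const.BASE_FOLDER are configured, and that
# decode_url and save_to_file exist in this module):
if __name__ == '__main__':
    links = get_dlinks('cat')  # only fetch the decoded links
    print('fetched %d links' % len(links))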