# Imports required by this snippet. User_Agents is assumed to be defined
# elsewhere in the script as a list of request-header dicts, e.g.
# [{'User-Agent': 'Mozilla/5.0 ...'}, ...].
import time
import urllib.request

import numpy
import requests
from bs4 import BeautifulSoup


def book_spider(book_tag):
    page_num = 0
    book_list = []
    try_times = 0
    while True:
        url = 'https://www.douban.com/tag/' + urllib.request.quote(book_tag) + '/book?start=' + str(page_num * 15)
        time.sleep(numpy.random.rand() * 5)  # sleep a random interval to avoid requesting too frequently
        try:
            # Rotate through the prepared User-Agent headers on each request.
            source_code = requests.get(url, headers=User_Agents[page_num % len(User_Agents)], timeout=50).text
            plain_text = str(source_code)
        except (requests.HTTPError, requests.URLRequired, requests.Timeout, requests.TooManyRedirects) as error:
            print(error)
            continue
        soup = BeautifulSoup(plain_text, 'lxml')
        list_soup = soup.find('div', attrs={'class': 'mod book-list'})
        try_times += 1
        if list_soup is None and try_times < 200:
            continue  # retry the same page until a book list shows up
        elif list_soup is None or len(list_soup) <= 1:
            break  # no information returned after 200 attempts, or the tag is exhausted
        for book_info in list_soup.findAll('dd'):
            title = book_info.find('a', attrs={'class': 'title'}).string.strip()
            desc = book_info.find('div', attrs={'class': 'desc'}).string.strip()
            desc_list = desc.split('/')
            book_url = book_info.find('a', attrs={'class': 'title'}).get('href')
            # The desc field looks like 'author1/author2/publisher/date/price':
            # the last three segments are publication info, the rest are authors.
            try:
                author_info = '/'.join(desc_list[0:-3])
            except Exception:
                author_info = ' ??'
            try:
                pub_info = '/'.join(desc_list[-3:])
            except Exception:
                pub_info = ' ??'
            try:
                rating = book_info.find('span', {'class': 'rating_nums'}).string.strip()
            except Exception:
                rating = '0.0'  # unrated books have no rating_nums span
            book_list.append([title, rating, author_info, pub_info])
            try_times = 0  # reset the retry counter once valid information arrives
        page_num += 1
        print('Downloading information from tag: {0}, page: {1}'.format(book_tag, page_num))
    print('Finish catching tag -> {0}'.format(book_tag))
    return book_list
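For context, here is a minimal usage sketch. The tag names, the `__main__` guard, and the rating-based sort are illustrative assumptions, not part of the original script; each record in the returned list is [title, rating, author_info, pub_info], with rating stored as a string.

# A minimal usage sketch (hypothetical tags; adapt to your own).
if __name__ == '__main__':
    tags = ['编程', '算法']  # Douban tags to crawl; example values only
    books = []
    for tag in tags:
        books.extend(book_spider(tag))
    # Sort by rating, highest first; rating is a string like '8.9'.
    books.sort(key=lambda record: float(record[1]), reverse=True)
    for title, rating, author_info, pub_info in books[:10]:
        print('{0} ({1}) - {2} - {3}'.format(title, rating, author_info, pub_info))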