# Imports required by this snippet. User_Agents is assumed to be defined
# elsewhere in the script as a list of request-header dicts, e.g.
# [{'User-Agent': 'Mozilla/5.0 ...'}, ...].
import time
import urllib.request

import numpy
import requests
from bs4 import BeautifulSoup


def book_spider(book_tag):
    page_num = 0
    book_list = []
    try_times = 0
    while True:
        url = 'https://www.douban.com/tag/' + urllib.request.quote(book_tag) + '/book?start=' + str(page_num * 15)
        time.sleep(numpy.random.rand() * 5)  # sleep a random interval to avoid requesting too frequently
        try:
            # Rotate through the prepared User-Agent headers on each request.
            source_code = requests.get(url, headers=User_Agents[page_num % len(User_Agents)], timeout=50).text
            plain_text = str(source_code)
        except (requests.HTTPError, requests.URLRequired, requests.Timeout, requests.TooManyRedirects) as error:
            print(error)
            continue
        soup = BeautifulSoup(plain_text, 'lxml')
        list_soup = soup.find('div', attrs={'class': 'mod book-list'})
        try_times += 1
        if list_soup is None and try_times < 200:
            continue  # retry the same page until a book list shows up
        elif list_soup is None or len(list_soup) <= 1:
            break  # no information returned after 200 attempts, or the tag is exhausted
        for book_info in list_soup.findAll('dd'):
            title = book_info.find('a', attrs={'class': 'title'}).string.strip()
            desc = book_info.find('div', attrs={'class': 'desc'}).string.strip()
            desc_list = desc.split('/')
            book_url = book_info.find('a', attrs={'class': 'title'}).get('href')
            # The desc field looks like 'author1/author2/publisher/date/price':
            # the last three segments are publication info, the rest are authors.
            try:
                author_info = '/'.join(desc_list[0:-3])
            except Exception:
                author_info = ' ??'
            try:
                pub_info = '/'.join(desc_list[-3:])
            except Exception:
                pub_info = ' ??'
            try:
                rating = book_info.find('span', {'class': 'rating_nums'}).string.strip()
            except Exception:
                rating = '0.0'  # unrated books have no rating_nums span
            book_list.append([title, rating, author_info, pub_info])
            try_times = 0  # reset the retry counter once valid information arrives
        page_num += 1
        print('Downloading information from tag: {0}, page: {1}'.format(book_tag, page_num))
    print('Finish catching tag -> {0}'.format(book_tag))
    return book_list
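For context, here is a minimal usage sketch. The tag names, the `__main__` guard, and the rating-based sort are illustrative assumptions, not part of the original script; each record in the returned list is [title, rating, author_info, pub_info], with rating stored as a string.

# A minimal usage sketch (hypothetical tags; adapt to your own).
if __name__ == '__main__':
    tags = ['编程', '算法']  # Douban tags to crawl; example values only
    books = []
    for tag in tags:
        books.extend(book_spider(tag))
    # Sort by rating, highest first; rating is a string like '8.9'.
    books.sort(key=lambda record: float(record[1]), reverse=True)
    for title, rating, author_info, pub_info in books[:10]:
        print('{0} ({1}) - {2} - {3}'.format(title, rating, author_info, pub_info))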