DouBanMovie.py 文件源码

python
阅读 30 收藏 0 点赞 0 评论 0

项目:DouBanCrawls 作者: SimonCqk 项目源码 文件源码
def movie_spider(movie_tag, max_retries=200):
    """Collect movie entries for one Douban tag, page by page.

    Requests ``https://www.douban.com/tag/<tag>/movie?start=<offset>`` with
    the offset advancing by 15 per page, locates the
    ``<div class="mod movie-list">`` container and hands every ``<dd>``
    element inside it to ``page_parser`` together with the result list.

    Args:
        movie_tag: Tag name to crawl; it is URL-quoted before use.
        max_retries: Number of consecutive unsuccessful attempts (a caught
            request error, or a page without the listing container) after
            which crawling stops. Defaults to 200.

    Returns:
        The list that ``page_parser`` was given for every ``<dd>`` entry.
        Whatever was gathered before the crawl stopped is returned.
    """
    page_num = 0
    movie_list = []
    try_times = 0  # consecutive attempts that yielded no usable listing
    quoted_tag = urllib.request.quote(movie_tag)  # loop-invariant, quote once
    while True:
        url = 'https://www.douban.com/tag/' + quoted_tag + '/movie?start=' + str(page_num * 15)
        # Sleep a random 0-5 s so requests are not sent too frequently.
        time.sleep(numpy.random.rand() * 5)
        try:
            # NOTE(review): assumes each User_Agents entry is a headers
            # mapping accepted by requests -- confirm against its definition.
            req = requests.get(url, headers=User_Agents[page_num % len(User_Agents)], timeout=50)
            req.raise_for_status()
            req.encoding = req.apparent_encoding
            plain_text = req.text
        except (requests.HTTPError, requests.URLRequired, requests.Timeout, requests.TooManyRedirects) as error:
            print(error)
            # Failed requests consume the retry budget too; otherwise a
            # persistent error (e.g. being blocked) would loop forever.
            try_times += 1
            if try_times >= max_retries:
                break
            continue

        soup = BeautifulSoup(plain_text, 'lxml')
        list_soup = soup.find('div', attrs={'class': 'mod movie-list'})
        try_times += 1
        if list_soup is None:
            if try_times < max_retries:
                continue  # same page is requested again
            break  # No information returned after max_retries requests
        if len(list_soup) <= 1:
            break  # container present but (nearly) empty: treat as the end

        entries = list_soup.findAll('dd')
        for movie_info in entries:
            page_parser(movie_info, movie_list)
        if entries:
            try_times = 0  # reset once valid information was obtained
        page_num += 1
        print("Downloading Information From Tag: {1} Page: {0} ".format(page_num, movie_tag))
    print('Finish Catching Tag -> {0}'.format(movie_tag))
    return movie_list
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号