def movie_spider(movie_tag):
    """Scrape Douban movie listings for *movie_tag*, one result page at a time.

    Requests successive pages of https://www.douban.com/tag/<tag>/movie,
    parsing each movie entry via page_parser(), until Douban stops
    returning a movie list (or 200 consecutive failed/empty attempts).

    Args:
        movie_tag: tag string inserted (URL-quoted) into the Douban tag URL.

    Returns:
        list: accumulated movie records as appended by page_parser().
    """
    page_num = 0
    movie_list = []
    try_times = 0  # consecutive failed or empty attempts for the current page
    while True:
        url = ('https://www.douban.com/tag/' + urllib.request.quote(movie_tag)
               + '/movie?start=' + str(page_num * 15))
        # Hang up the thread to avoid requesting too frequently.
        time.sleep(numpy.random.rand() * 5)
        try:
            req = requests.get(url,
                               headers=User_Agents[page_num % len(User_Agents)],
                               timeout=50)
            req.raise_for_status()
            req.encoding = req.apparent_encoding
            plain_text = str(req.text)
        except (requests.HTTPError, requests.URLRequired,
                requests.Timeout, requests.TooManyRedirects) as error:
            print(error)
            # BUG FIX: count network failures toward the retry budget; the
            # original `continue`d without incrementing try_times, so a
            # persistently failing URL retried forever.
            try_times += 1
            if try_times >= 200:
                break
            continue
        soup = BeautifulSoup(plain_text, 'lxml')
        list_soup = soup.find('div', attrs={'class': 'mod movie-list'})
        try_times += 1
        if list_soup is None and try_times < 200:
            continue  # transient empty response: retry the same page
        elif list_soup is None or len(list_soup) <= 1:
            break  # No information returned after 200-time requesting
        for movie_info in list_soup.findAll('dd'):
            page_parser(movie_info, movie_list)
        try_times = 0  # set 0 when got valid information
        page_num += 1
        print("Downloading Information From Tag: {1} Page: {0} ".format(page_num, movie_tag))
    print('Finish Catching Tag -> {0}'.format(movie_tag))
    return movie_list
# Blog-page residue from the original source (kept as a comment so the file parses):
# 评论列表 (comment list)
# 文章目录 (table of contents)