def _get_reviews_props(movie_code):
    """Scrape the user reviews listed on a movie's reviews page.

    Downloads the page at ``_REVIEWS_URL`` formatted with ``movie_code`` and
    extracts one record from every ``<td class="comment-summary">`` cell.
    Crawling is best-effort: a cell that does not match the expected layout
    is skipped, so the result may hold fewer entries than the page shows.

    Args:
        movie_code: Value substituted for ``{code}`` in ``_REVIEWS_URL``.

    Returns:
        A dict with the single key ``'imdb_user_reviews'`` mapping to a list
        of dicts, each with the keys ``'score'`` (int), ``'review_date'``
        (``datetime.date``), ``'contents'`` and ``'user'`` (the first child
        node of the matching link in the cell).

    Raises:
        Errors raised by ``urllib.request.urlopen`` while fetching the page
        are not caught here and propagate to the caller.
    """
    cur_reviews_url = _REVIEWS_URL.format(code=movie_code)
    # Close the HTTP response as soon as the page has been parsed instead of
    # leaving the connection open until garbage collection.
    with urllib.request.urlopen(cur_reviews_url) as response:
        reviews_page = bs(response, "html.parser")

    # Patterns are loop-invariant, so build them once up front.
    date_regex = re.compile(r"on (\d{1,2} [a-zA-Z]+ \d{4})")
    review_link_regex = re.compile(r'reviews.+?')
    user_link_regex = re.compile(r'/user/.+?')

    user_reviews = []
    for review in reviews_page.find_all("td", {"class": "comment-summary"}):
        review_html = str(review)
        try:
            rating = int(
                re.findall(_USER_REVIEW_RATING_REGEX, review_html)[0])
            date_str = date_regex.findall(review_html)[0]
            date = datetime.strptime(date_str, "%d %B %Y").date()
            contents = review.find_all(
                'a', href=review_link_regex)[0].contents[0]
            # The second '/user/' link is the one used for the user name.
            user = review.find_all(
                'a', href=user_link_regex)[1].contents[0]
        except (IndexError, ValueError, TypeError):
            # IndexError: an expected match or link is missing from the cell.
            # ValueError/TypeError: the rating or date text could not be
            # converted. Either way the cell is malformed, so skip it.
            continue
        user_reviews.append({
            'score': rating, 'review_date': date,
            'contents': contents, 'user': user
        })
    return {'imdb_user_reviews': user_reviews}
# ==== crawling a movie profile ====
评论列表
文章目录