imdb_crawl.py 文件源码-python代码片段

imdb_crawl.py 文件源码
python
阅读 36 收藏 0 点赞 0 评论 0
def _get_reviews_props(movie_code):
    """Fetch and parse the user-review summaries for one movie.

    The reviews page URL is built from ``_REVIEWS_URL`` and *movie_code*,
    downloaded, and every ``<td class="comment-summary">`` cell is parsed
    into a review record. Cells that do not have the expected structure are
    skipped (best-effort scraping).

    Args:
        movie_code: Value substituted for ``{code}`` in ``_REVIEWS_URL``.

    Returns:
        A dict with the single key ``'imdb_user_reviews'`` mapping to a list
        of dicts, each with the keys ``'score'`` (int), ``'review_date'``
        (``datetime.date``), ``'contents'`` and ``'user'`` (the first child
        node of the matching anchor tags).

    Raises:
        Whatever ``urllib.request.urlopen`` raises when the page cannot be
        fetched; download errors are not suppressed.
    """
    cur_reviews_url = _REVIEWS_URL.format(code=movie_code)
    # Use a context manager so the HTTP response is closed as soon as the
    # page has been parsed, instead of leaking the connection.
    with urllib.request.urlopen(cur_reviews_url) as response:
        reviews_page = bs(response, "html.parser")

    # Loop-invariant patterns, built once.
    date_pattern = re.compile(r"on (\d{1,2} [a-zA-Z]+ \d{4})")
    review_link_pattern = re.compile(r'reviews.+?')
    user_link_pattern = re.compile(r'/user/.+?')

    user_reviews = []
    for review in reviews_page.find_all("td", {"class": "comment-summary"}):
        review_html = str(review)
        try:
            rating = int(
                re.findall(_USER_REVIEW_RATING_REGEX, review_html)[0])
            date_str = date_pattern.findall(review_html)[0]
            # NOTE: "%B" matches the full month name for the current locale.
            date = datetime.strptime(date_str, "%d %B %Y").date()
            contents = review.find_all(
                'a', href=review_link_pattern)[0].contents[0]
            # The second "/user/" anchor is used; presumably the first one
            # is not the user-name link -- verify against the page markup.
            user = review.find_all(
                'a', href=user_link_pattern)[1].contents[0]
        except (IndexError, ValueError, TypeError, AttributeError):
            # Missing rating/date/link or unparsable value: skip this
            # entry rather than aborting the whole crawl.
            continue
        user_reviews.append({
            'score': rating, 'review_date': date,
            'contents': contents, 'user': user
        })
    return {'imdb_user_reviews': user_reviews}


# ==== crawling a movie profile ====