medium_topstories_crawler.py 文件源码-python代码片段

def get_top_stories():
    current_date = START_DATE
    while current_date <= END_DATE:
        top_stories = TopStories()
        date_string = current_date.strftime("%B-%d-%Y").lower()
        url = "https://medium.com/browse/top/" + date_string
        top_stories.data['date'] = current_date.isoformat()
        top_stories.data['url'] = url

        cj = cookielib.MozillaCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        req = urllib2.Request(url)
        req.add_header("User-agent", 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) \
                        Chrome/50.0.2661.102 Safari/537.36')
        response = opener.open(req, timeout=10)
        data = response.read()

        stories = []
        story_url = re.findall('<a class="link link--darken" href="(.*?)\?source=top_stories---------[0-9]*-" data-action="open-post"', data)
        num = len(story_url)
        for i in range(num):
            story_data = get_story(story_url[i]).data
            if story_data['success']:
                stories.append(story_data)
            print(i)
        top_stories.data['stories'] = stories

        out = codecs.open("./TopStories/%s.json" % current_date.isoformat(), 'w', 'utf-8')
        out.write(top_stories.getstr())
        out.close()
        print("-----%s obtained" % current_date.isoformat())
        current_date = current_date + datetime.timedelta(days=1)