def get_top_stories():
    """Crawl Medium's daily "top stories" pages and save one file per day.

    Walks day by day from the module-level START_DATE to END_DATE
    (inclusive). For each day it downloads
    ``https://medium.com/browse/top/<month>-<dd>-<yyyy>``, extracts the story
    links from the HTML, fetches each story through ``get_story`` and keeps
    those whose ``data['success']`` is truthy. The collected result is
    written, as returned by ``TopStories.getstr()``, to
    ``./TopStories/<iso-date>.json`` (UTF-8).

    The ``./TopStories`` directory must already exist; it is not created
    here. Network and I/O errors are not caught, so a failure on any day
    aborts the whole crawl.
    """
    # Loop-invariant values, built once instead of once per day.
    user_agent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) '
                  'Chrome/50.0.2661.102 Safari/537.36')
    # Raw string so that `\?` reaches the regex engine as an escaped '?'
    # without relying on Python keeping an unknown string escape intact.
    story_link_re = re.compile(
        r'<a class="link link--darken" href="(.*?)\?source=top_stories---------[0-9]*-" data-action="open-post"')
    one_day = datetime.timedelta(days=1)

    current_date = START_DATE
    while current_date <= END_DATE:
        iso_date = current_date.isoformat()
        # Medium's browse URLs use a lower-cased "month-dd-yyyy" slug.
        date_string = current_date.strftime("%B-%d-%Y").lower()
        url = "https://medium.com/browse/top/" + date_string

        top_stories = TopStories()
        top_stories.data['date'] = iso_date
        top_stories.data['url'] = url

        # A fresh cookie jar per request: each day's page is fetched with
        # no cookies carried over from the previous one.
        cj = cookielib.MozillaCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        req = urllib2.Request(url)
        req.add_header("User-agent", user_agent)
        response = opener.open(req, timeout=10)
        try:
            page = response.read()
        finally:
            # urllib2 responses are not context managers in Python 2, so
            # release the connection explicitly even if read() fails.
            response.close()

        stories = []
        for i, story_url in enumerate(story_link_re.findall(page)):
            story_data = get_story(story_url).data
            if story_data['success']:
                stories.append(story_data)
            # NOTE(review): the original indentation was lost; this progress
            # print is assumed to run for every link, kept or not -- confirm.
            print(i)
        top_stories.data['stories'] = stories

        # `with` guarantees the file is closed even if getstr()/write() raises.
        with codecs.open("./TopStories/%s.json" % iso_date, 'w', 'utf-8') as out:
            out.write(top_stories.getstr())
        print("-----%s obtained" % iso_date)

        current_date = current_date + one_day
medium_topstories_crawler.py 文件源码
python
阅读 21
收藏 0
点赞 0
评论 0
评论列表
文章目录