# Note: the except clause below assumes a module-level `import httplib`
# (Python 2; on Python 3 the module is http.client).
def craw(self, root_url, full_path, name):
    '''
    :param root_url: entry URL of the crawl (the account's article-list page)
    :param full_path: path of the local output file (used by the commented-out output_file call)
    :param name: name of the WeChat official account being crawled
    :return: None
    '''
    # URL-manager variant, kept for reference:
    # self.urls.add_new_url(root_url)
    # while self.urls.has_new_url():
    #     new_url = self.urls.get_new_url()  # take the next unvisited url from the url manager
    new_url = root_url
    html = None
    try:
        html = self.downloader.download_list_ph(new_url, name)
    except httplib.IncompleteRead:
        with open(r'list_error.txt', 'a') as f:
            f.write(name.encode('utf-8'))
            f.write('\n')
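    # httplib.IncompleteRead signals that the server closed the connection before
    # the full response body arrived; the account name is appended to
    # list_error.txt so the failed list page can be identified and retried later.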
    if html is None:
        return
    wechat_url, html_cont = html
    article_links = self.parser.parse_list(wechat_url, html_cont)
    if article_links is None:
        return
    for link in article_links:
        html = self.downloader.download_articles_ph(link)
        data = self.parser.parse_article(html)  # parse the article page
        if data is None:
            continue
        # Judging by the names: title, account name, publish date, body text,
        # read count, like count, comment texts, and per-comment like counts.
        (title, wname, date, content, readNum, praise_num, discuss_content, discuss_praise) = data
        # self.urls.add_new_urls(new_urls)
        # self.outputer.collect_data(data)
        self.outputer.output_mongodb(name, data)
        # self.outputer.output_file(full_path, data)
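# Usage sketch (illustrative only; the owning class and how it wires up
# self.downloader, self.parser, and self.outputer are assumed from context,
# not shown in this snippet — the names and arguments below are hypothetical):
#
#     spider = SpiderMain()
#     spider.craw(root_url='http://example.com/account-list-page',
#                 full_path='output/articles.txt',
#                 name=u'some_official_account')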