def get_info(self):
    '''
    Fetch the comic's main page and scrape its basic information.

    The page at ``self.comic_url`` is downloaded with ``requests`` and
    parsed with regular expressions.

    return:
        (title, des, cover_url, chapter_urls) where
        title        -- text of the <h1> inside the "anim_title_text" span
        des          -- list of matches for the description <meta> tag
                        (empty list when the tag is not present)
        cover_url    -- ``src`` of the element with id="cover_pic"
        chapter_urls -- list of (chapter title, chapter url) tuples; every
                        url is prefixed with the site root and suffixed
                        with '#@page=1'

    raises:
        ConnectionError -- re-raised, after printing the traceback, when
                           the request fails
        IndexError      -- when the title, the chapter container or the
                           cover cannot be found in the page
    '''
    headers = {
        # The key was previously misspelled 'use-agent', so the browser
        # User-Agent string was never actually sent to the server.
        'User-Agent': ("Mozilla/5.0 (Windows NT 10.0; WOW64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/50.0.2661.102 Safari/537.36"),
        'Referer': 'http://manhua.dmzj.com/tags/s.shtml',
    }
    root = 'http://manhua.dmzj.com'

    # Comic title.
    r_title = r'<span class="anim_title_text"><a href=".*?"><h1>(.*?)</h1></a></span>'
    # Description. NOTE(review): the original pattern skipped a prefix and
    # started the capture at a two-character literal that was lost to an
    # encoding error, leaving '(??' which is not a valid regex group. The
    # whole content attribute is captured instead -- restore the literal
    # if the original characters are known.
    r_des = r'<meta name=\'description\' content="(.*?)"/>'
    # Cover image url.
    r_cover = r'src="(.*?)" id="cover_pic"/></a>'
    # Container holding the chapter list.
    r_cb = r'<div class="cartoon_online_border" >([\s\S]*?)<div class="clearfix"></div>'
    # One (title, href) pair per chapter link inside the container.
    r_cs = r'<li><a title="(.*?)" href="(.*?)" .*?>.*?</a>'

    try:
        text = requests.get(self.comic_url, headers=headers).text
    except ConnectionError:
        # NOTE(review): if ConnectionError here is the builtin rather than
        # requests.exceptions.ConnectionError, failures raised by requests
        # will not match this clause -- confirm the file's imports.
        traceback.print_exc()
        # Bare raise keeps the original exception (message and traceback)
        # instead of replacing it with an empty ConnectionError.
        raise

    title = re.findall(r_title, text)[0]
    cb = re.findall(r_cb, text)[0]
    chapter_urls = [
        (name, root + href + '#@page=1')
        for name, href in re.findall(r_cs, cb)
    ]
    cover_url = re.findall(r_cover, text)[0]
    des = re.findall(r_des, text)
    return title, des, cover_url, chapter_urls
评论列表
文章目录