def result(self, url, txt1, txt2):
    """Crawl ``url`` and collect its links plus the extracted detail pages.

    The catalog rules are read from ``txt1`` and the extraction rules from
    ``txt2``; both files are parsed as YAML. The links built by the fetcher
    are sorted and every link is recorded. Links whose ``netloc`` appears in
    ``catalog['accepted']`` are additionally fetched as detail pages.

    Args:
        url: Start URL handed to ``Fetcher.build``.
        txt1: Path of the YAML file holding the catalog rules.
        txt2: Path of the YAML file holding the extraction rules.

    Returns:
        A ``(result_list, content_list)`` pair.

        * ``result_list`` holds one
          ``(index, catalog, netloc, title, link)`` tuple per link, with
          ``index`` counting from 1.
        * ``content_list`` holds one ``(index, author, editor, title, link,
          crumbs, date, source, content)`` tuple per detail page that yielded
          exactly seven fields, with ``index`` counting from 1.

        The same two lists are also stored on ``self.list_result`` and
        ``self.content_result``.
    """
    import sys  # kept function-local, as in the original block

    # Read both rule files up front; ``with`` guarantees the handles are
    # closed even if reading fails (the original leaked both handles).
    with open(txt1, "r") as catalog_file:
        catalog_text = catalog_file.read()
    with open(txt2, "r") as extract_file:
        extract_text = extract_file.read()

    # NOTE(review): ``YAML`` is defined outside this block; confirm that its
    # ``load`` is safe if these rule files can come from an untrusted source.
    yaml_parser = YAML()
    catalog = yaml_parser.load(catalog_text)
    # Echo the parsed catalog to stdout (existing diagnostic output).
    yaml_parser.dump(catalog, sys.stdout)

    # Example of a catalog that used to be kept here as a dead string literal
    # (this method additionally reads an 'accepted' entry from the catalog):
    #
    # catalog = {
    #     "default": {
    #         "index": "index|node|/[a-z0-9]+$",
    #         "content": "content|[/_]20[01][0-9][/_-]?\d\d[/_-]?\d\d",
    #     },
    #     "auto.china.com.cn": {
    #         "content": "/20[01][0-9]\d{4}/\d+\.shtml",
    #     },
    # }

    fetcher = Fetcher()
    extract_rule = yaml_parser.load(extract_text)
    print(">>>>>", type(extract_rule))

    links = fetcher.build(url, '.china.com.cn', catalog, iframe_a=iframe_as.link)
    # Order by the link itself, with the link's catalog as a tie-breaker suffix.
    ordered_links = sorted(
        links.items(), key=lambda item: item[0] + item[1].catalog
    )

    result_list = []
    content_list = []
    for url_num, (link, info) in enumerate(ordered_links, start=1):
        result_list.append(
            (url_num, info.catalog, info.netloc, info.title, link)
        )

        # Only hosts listed under 'accepted' are fetched as detail pages.
        if info.netloc not in catalog['accepted']:
            continue

        res = fetcher.detail_page(link, extract_rule)
        if len(res) != 7:
            # Anything other than seven fields is reported as-is and is not
            # added to the content list.
            print(res)
            continue

        content_list.append((
            len(content_list) + 1,  # 1-based running number of stored pages
            res["author"],
            res["editor"],
            res["title"],
            link,
            res["crumbs"],
            res["date"],
            res['source'],
            res['content'],
        ))

    self.list_result = result_list
    self.content_result = content_list
    return result_list, content_list
评论列表
文章目录