url?????v1.0.py 文件源码

python
阅读 28 收藏 0 点赞 0 评论 0

项目:Url 作者: beiruan 项目源码 文件源码
def result(self, url, txt1, txt2):
    """Collect the links found from ``url`` and the extracted page contents.

    Both rule files are read in full and parsed with ``YAML().load``. The
    links returned by ``Fetcher.build`` are sorted by link + catalog, every
    link is recorded, and for links whose ``netloc`` is listed in
    ``catalog['accepted']`` the detail page is fetched and recorded too.

    Args:
        url: Start URL passed to ``Fetcher.build``.
        txt1: Path of the YAML file with the catalog rules. The loaded
            object is indexed with ``'accepted'``.
        txt2: Path of the YAML file with the extraction rules handed to
            ``Fetcher.detail_page``.

    Returns:
        ``(result_list, content_list)`` where

        * ``result_list`` holds ``(number, catalog, netloc, title, link)``
          tuples, numbered from 1 in sorted order;
        * ``content_list`` holds ``(number, author, editor, title, link,
          crumbs, date, source, content)`` tuples, numbered from 1 in the
          order the detail pages were accepted.

        The same two lists are also stored on ``self.list_result`` and
        ``self.content_result``.

    Raises:
        OSError: If ``txt1`` or ``txt2`` cannot be opened.
    """
    import sys

    # Context managers close the handles deterministically; the bare
    # ``open(...).read()`` form left that to the garbage collector.
    with open(txt1, "r") as catalog_file:
        catalog_text = catalog_file.read()
    with open(txt2, "r") as extract_file:
        extract_text = extract_file.read()

    # NOTE(review): ``YAML`` comes from outside this block (presumably
    # ruamel.yaml) - confirm its loader is safe for these rule files.
    yaml = YAML()
    catalog = yaml.load(catalog_text)
    # Echo the parsed catalog so the rules in effect show up on stdout.
    yaml.dump(catalog, sys.stdout)

    # Example of the catalog structure (previously kept as an inline string
    # literal whose regex backslashes were invalid escape sequences):
    #
    # catalog = {
    #     "default":{
    #         "index":   "index|node|/[a-z0-9]+$",
    #         "content": "content|[/_]20[01][0-9][/_-]?\d\d[/_-]?\d\d",
    #     },
    #     "auto.china.com.cn":{
    #         "content": "/20[01][0-9]\d{4}/\d+\.shtml",
    #     },
    # }

    fetcher = Fetcher()
    extract_rule = yaml.load(extract_text)
    print(">>>>>", type(extract_rule))

    links = fetcher.build(url, '.china.com.cn', catalog, iframe_a=iframe_as.link)
    ordered_links = sorted(links.items(), key=lambda t: t[0] + t[1].catalog)

    result_list = []
    content_list = []
    for url_num, (link, info) in enumerate(ordered_links, start=1):
        result_list.append((url_num, info.catalog, info.netloc, info.title, link))

        if info.netloc not in catalog['accepted']:
            continue

        res = fetcher.detail_page(link, extract_rule)
        # Seven keys are read below; a result of any other size is printed
        # instead of being recorded.
        if len(res) != 7:
            print(res)
            continue

        content_list.append((
            len(content_list) + 1,
            res["author"],
            res["editor"],
            res["title"],
            link,
            res["crumbs"],
            res["date"],
            res['source'],
            res['content'],
        ))

    self.list_result = result_list
    self.content_result = content_list
    return result_list, content_list
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号