hospitalSpider.py 文件源码

python
阅读 20 收藏 0 点赞 0 评论 0

项目:Daily-code 作者: rui7157 项目源码 文件源码
def parse_item(self, response):
    """Yield one ``HospitalItem`` for each hospital entry on a list page.

    The province is taken from the part of ``response.url`` that follows
    ``http://www.a-hospital.com/w/`` (URL-unquoted, then decoded as UTF-8).
    Each hospital is a top-level ``<li>`` whose bold link text is the
    hospital name and whose nested ``<ul>`` holds the detail rows.

    Args:
        response: the downloaded page; ``response.url`` and
            ``response.body`` are read. The body is matched as a byte
            string and the captured pieces are decoded as UTF-8.

    Yields:
        A new ``HospitalItem`` per hospital with ``hospitalName`` and
        ``hospitalProvince`` always set, and each optional detail field
        set only when its row is present in that hospital's entry.
    """
    # Example URL:
    # http://www.a-hospital.com/w/%E5%9B%9B%E5%B7%9D%E7%9C%81%E5%8C%BB%E9%99%A2%E5%88%97%E8%A1%A8
    province = urllib.unquote(response.url[len("http://www.a-hospital.com/w/"):])

    # (item field, pattern capturing that field's value) in assignment order.
    # NOTE(review): the "????" runs and the "?" inside "[:|?]" look like
    # non-ASCII label text lost in an encoding round-trip. As written the
    # seven patterns are indistinguishable and "????" is not a valid regex
    # ("multiple repeat") -- restore the real labels from the upstream file.
    field_patterns = (
        ("hospitalAddress", u"<b>????</b>[:|?](.*?)</li>"),
        ("hospitalPhoneNumber", u"<b>????</b>[:|?](.*?)</li>"),
        ("hospitalLevel", u"<b>????</b>[:|?](.*?)</li>"),
        ("hospitalType", u"<b>????</b>[:|?](.*?)</li>"),
        ("hospitalFaxNumber", u"<b>????</b>[:|?](.*?)</li>"),
        ("hospitalEmail", u"<b>????</b>[:|?](.*?)</li>"),
        ("hospitalWebsite", u'<b>????</b>[:|?]<a href="(.*?)" class="external free" rel="nofollow" target="_blank">.*?</a></li>'),
    )

    entries = re.findall(
        r'<li><b><a href=".*?" title=".*?">(.*?)</a>.*?</b>[\s\S]*?<ul>([\s\S]*?)</ul>[\s\S]*?</li>',
        response.body,
    )
    for name, content in entries:
        # A fresh item per hospital: reusing one instance would leak the
        # previous hospital's optional fields into entries that lack them
        # and would hand every consumer the same mutable object.
        item = HospitalItem()
        item['hospitalName'] = name.decode('utf-8')
        content = content.decode("utf-8")
        for field, pattern in field_patterns:
            found = re.findall(pattern, content)
            if found:
                # Only the first occurrence of a row is kept.
                item[field] = found[0]
        item['hospitalProvince'] = province.decode('utf-8')
        yield item
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号