def parse_item(self, response):
    """Yield one HospitalItem for every hospital entry on a province list page.

    The province name is taken from the percent-encoded tail of
    ``response.url`` (everything after ``http://www.a-hospital.com/w/``).
    Each hospital entry is a ``<li>`` whose bold link text is the hospital
    name and whose nested ``<ul>`` holds ``<b>label</b>:value`` rows.

    Fields set on every item: ``hospitalName``, ``hospitalProvince``.
    Fields set only when the matching row is present: ``hospitalAddress``,
    ``hospitalPhoneNumber``, ``hospitalLevel``, ``hospitalType``,
    ``hospitalFaxNumber``, ``hospitalEmail``, ``hospitalWebsite``.

    A fresh item is built per entry so that optional fields of one hospital
    never carry over to the next one.
    """
    # Example URL:
    # http://www.a-hospital.com/w/%E5%9B%9B%E5%B7%9D%E7%9C%81%E5%8C%BB%E9%99%A2%E5%88%97%E8%A1%A8
    url_prefix = "http://www.a-hospital.com/w/"
    province = urllib.unquote(response.url[len(url_prefix):]).decode('utf-8')

    # NOTE(review): the row labels were destroyed by an encoding round-trip
    # (they had become "????", which is not even a valid regex). They are
    # reconstructed here from the field names and written as \u escapes so
    # the source stays ASCII-only. Confirm against a live page.
    text_rows = (
        ('hospitalAddress', u"\u533b\u9662\u5730\u5740"),      # "hospital address"
        ('hospitalPhoneNumber', u"\u8054\u7cfb\u7535\u8bdd"),  # "contact phone"
        ('hospitalLevel', u"\u533b\u9662\u7b49\u7ea7"),        # "hospital grade"
        ('hospitalType', u"\u7ecf\u8425\u65b9\u5f0f"),         # "mode of operation"
        ('hospitalFaxNumber', u"\u4f20\u771f\u53f7\u7801"),    # "fax number"
        ('hospitalEmail', u"\u7535\u5b50\u90ae\u7bb1"),        # "e-mail"
    )
    website_label = u"\u533b\u9662\u7f51\u7ad9"                # "hospital website"

    # Separator after the label: ASCII colon or full-width colon (U+FF1A).
    # The literal "|" inside the class is kept from the original pattern.
    text_row_pattern = u"<b>%s</b>[:|\uff1a](.*?)</li>"
    website_pattern = (
        u'<b>%s</b>[:|\uff1a]<a href="(.*?)" class="external free" '
        u'rel="nofollow" target="_blank">.*?</a></li>'
    ) % website_label

    entry_pattern = (
        r'<li><b><a href=".*?" title=".*?">(.*?)</a>.*?</b>'
        r'[\s\S]*?<ul>([\s\S]*?)</ul>[\s\S]*?</li>'
    )

    for name, content in re.findall(entry_pattern, response.body):
        # New item per hospital: reusing one instance would make every
        # yielded result share (and overwrite) the same underlying data.
        item = HospitalItem()
        item['hospitalName'] = name.decode('utf-8')
        content = content.decode('utf-8')

        for field, label in text_rows:
            found = re.findall(text_row_pattern % label, content)
            if found:
                item[field] = found[0]

        # The website row wraps its value in an external link; capture href.
        found = re.findall(website_pattern, content)
        if found:
            item['hospitalWebsite'] = found[0]

        item['hospitalProvince'] = province
        yield item
评论列表
文章目录