# dmoz_spider.py source file - Python code snippet

def parse1(self, response):
        """Extract profile fields from a user-info page and yield the item.

        The text nodes under ``body/div[@class="c"]`` are joined with ``;``
        and scanned for ``<label>:<value>;`` pairs (ASCII or full-width
        colon).  Each field that is found is stored on the item carried in
        ``response.meta["item"]``; fields that are absent are left unset.

        Args:
            response: the page response.  ``response.meta`` must contain
                ``"item"`` (the item to fill in) and ``"ID"`` (stored as
                ``user_id``).

        Yields:
            The same item object, updated in place.
        """
        selector = Selector(response)
        infoItem = response.meta["item"]
        ID = response.meta["ID"]

        fragments = selector.xpath('body/div[@class="c"]/text()').extract()
        # Every pattern below needs a terminating ";".  join() only puts the
        # delimiter *between* fragments, so add one at the end; otherwise a
        # field sitting in the last text node would never match.
        text1 = ";".join(fragments) + ";"

        def first_value(label):
            """Return the first value following ``label`` in text1, or None."""
            # NOTE: inside the character class "|" is a literal pipe, so the
            # separator may be ":", "|" or the full-width colon.  Kept as-is.
            found = re.findall(label + u'[:|\uff1a](.*?);', text1)
            return found[0] if found else None

        nickname = first_value(u'\u6635\u79f0')              # "nickname"
        gender = first_value(u'\u6027\u522b')                # "gender"
        place = first_value(u'\u5730\u533a')                 # "region"
        signature = first_value(u'\u7b80\u4ecb')             # "bio"
        birthday = first_value(u'\u751f\u65e5')              # "birthday"
        sexorientation = first_value(u'\u6027\u53d6\u5411')  # "sexual orientation"
        marriage = first_value(u'\u611f\u60c5\u72b6\u51b5')  # "relationship status"
        url = first_value(u'\u4e92\u8054\u7f51')             # "internet"

        if nickname is not None:
            infoItem['nickname'] = nickname
        if gender is not None:
            infoItem['gender'] = gender
        if place is not None:
            # The region is "<province>" or "<province> <city>".
            parts = place.split(" ")
            infoItem["province"] = parts[0]
            if len(parts) > 1:
                infoItem["city"] = parts[1]
        if signature is not None:
            infoItem["signature"] = signature
        if birthday is not None:
            try:
                parsed = datetime.datetime.strptime(birthday, "%Y-%m-%d")
                # NOTE(review): the 8-hour shift presumably converts a UTC+8
                # local date to UTC - confirm against the consumer.
                infoItem["birthday"] = parsed - datetime.timedelta(hours=8)
            except (ValueError, OverflowError):
                # Best effort: values that are not a full YYYY-MM-DD date (or
                # that underflow the datetime range) are simply skipped.
                pass
        if sexorientation is not None and gender is not None:
            # The classification compares the stated orientation with the
            # user's own gender, so it is only possible when gender is known.
            # (Indexing the empty gender match used to raise IndexError.)
            if sexorientation == gender:
                infoItem["sexorientation"] = "gay"
            else:
                infoItem["sexorientation"] = "Heterosexual"
        if marriage is not None:
            infoItem["marriage"] = marriage
        if url is not None:
            infoItem["url"] = url

        infoItem["user_id"] = ID

        yield infoItem