TuniuService.py 文件源码-python代码片段

def __parseHotelComment(self, page_source, hotel_id, comm_type):
        """Parse the hotel comments on one page and collect the new ones.

        Every ``div`` directly under ``div[@class='user_remark_datail']`` is
        treated as one comment.  For each comment a dict is built with the
        keys ``guid``, ``username``, ``remark``, ``comm_time``, ``user_type``,
        ``hotel_id``, ``comm_type``, ``senti_value`` and ``viewpoint``.  The
        dict is appended to ``self.commList`` unless
        ``self.__is_exist_in_comment_list`` reports it as already present.

        Text fields whose node is missing from the page default to ``""``.
        Sentiment and viewpoint analysis is best-effort: if ``self.hotelNLP``
        raises, the traceback is printed and the affected fields stay ``None``.

        Args:
            page_source: HTML of the comment page, used as the response body.
            hotel_id: Stored unchanged under the ``"hotel_id"`` key.
            comm_type: Stored unchanged under the ``"comm_type"`` key.

        Returns:
            ``False`` when every comment on the page was reported as already
            present (this includes a page with no comments at all), ``True``
            when at least one comment was appended.
        """
        response = HtmlResponse(url="My HTML String", body=page_source, encoding="utf-8")
        # One selector per comment; querying relative to each node avoids
        # re-running a positional XPath against the whole page per iteration.
        comment_nodes = response.xpath("//div[@class='user_remark_datail']/div")
        whitespace = re.compile(r"\s+")

        def first_or_empty(values):
            # A missing node extracts to an empty list; use "" instead of raising.
            return values[0] if values else ""

        # Number of comments on this page reported as already collected.
        same_num = 0
        for node in comment_nodes:
            guid = uuid.uuid1()
            # Reviewer name.
            username = first_or_empty(
                node.xpath("div[@class='a1']/div[@class='b2']/text()").extract())
            # Comment body: join all paragraphs with every whitespace run removed.
            remark_parts = node.xpath("div[@class='a2']/div[@class='b2']/p/text()").extract()
            remark = "".join(whitespace.sub("", part) for part in remark_parts)
            # Comment timestamp text.
            comm_time = first_or_empty(
                node.xpath("div[@class='a2']/div[@class='b4']/div[@style='float: right;']/text()").extract())
            # Reviewer category text.
            user_type = first_or_empty(
                node.xpath("div[@class='a1']/div[@class='b3']/text()").extract())
            senti_value = None
            viewpoint = None
            try:
                encoded_remark = remark.encode("utf-8")
                senti_value = self.hotelNLP.sentiment(encoded_remark)
                viewpoint = json.dumps(self.hotelNLP.viewpoint(encoded_remark, decoding="utf-8"))
            except Exception:
                # Keep the comment even when NLP analysis fails.
                traceback.print_exc()
            comm = {"guid": guid, "username": username, "remark": remark, "comm_time": comm_time, "user_type": user_type, "hotel_id": hotel_id, "comm_type": comm_type, "senti_value": senti_value, "viewpoint": viewpoint}
            if self.__is_exist_in_comment_list(comm):
                same_num += 1
            else:
                self.commList.append(comm)
        # False signals that the page contributed nothing new.
        return same_num != len(comment_nodes)