def __parseHotelComment(self, page_source, hotel_id, comm_type):
    """Parse one page of hotel comments and collect the ones not seen before.

    Every child ``div`` of ``//div[@class='user_remark_datail']`` is treated
    as one comment.  For each of them a dict with the keys ``guid``,
    ``username``, ``remark``, ``comm_time``, ``user_type``, ``hotel_id``,
    ``comm_type``, ``senti_value`` and ``viewpoint`` is built and, unless
    ``self.__is_exist_in_comment_list`` reports it as already present,
    appended to ``self.commList``.

    Args:
        page_source: HTML of the comment page; passed as the body of an
            ``HtmlResponse`` decoded as UTF-8.
        hotel_id: Stored unchanged under ``"hotel_id"`` in every comment.
        comm_type: Stored unchanged under ``"comm_type"`` in every comment.

    Returns:
        ``False`` when every comment on the page was already known (this
        includes a page with no comments at all), ``True`` when at least
        one comment was new.
    """
    def first_text(node, query):
        # First string matched by ``query`` relative to ``node``; "" when
        # nothing matches, so one missing tag cannot abort the whole page.
        found = node.xpath(query).extract()
        return found[0] if found else ""

    response = HtmlResponse(url="My HTML String", body=page_source, encoding="utf-8")
    # One selector per comment; iterating them directly replaces the
    # positional ``div[%d]`` lookups from the container.
    comment_nodes = response.xpath("//div[@class='user_remark_datail']/div")

    # Number of comments on this page that were already collected.
    same_num = 0
    for node in comment_nodes:
        guid = uuid.uuid1()

        username = first_text(node, "div[@class='a1']/div[@class='b2']/text()")

        # The comment body may be split over several text nodes; strip all
        # whitespace from each piece and glue them together.
        remark_parts = node.xpath("div[@class='a2']/div[@class='b2']/p/text()").extract()
        remark = "".join(re.sub(r"\s+", "", part) for part in remark_parts)

        comm_time = first_text(
            node, "div[@class='a2']/div[@class='b4']/div[@style='float: right;']/text()")
        user_type = first_text(node, "div[@class='a1']/div[@class='b3']/text()")

        # NLP enrichment is best effort: on failure the traceback is printed
        # and the comment is still recorded with whatever was computed.
        senti_value = None
        viewpoint = None
        try:
            senti_value = self.hotelNLP.sentiment(remark.encode("utf-8"))
            viewpoint = json.dumps(
                self.hotelNLP.viewpoint(remark.encode("utf-8"), decoding="utf-8"))
        except Exception:
            traceback.print_exc()

        comm = {
            "guid": guid,
            "username": username,
            "remark": remark,
            "comm_time": comm_time,
            "user_type": user_type,
            "hotel_id": hotel_id,
            "comm_type": comm_type,
            "senti_value": senti_value,
            "viewpoint": viewpoint,
        }
        if self.__is_exist_in_comment_list(comm):
            same_num += 1
        else:
            self.commList.append(comm)

    # False only when nothing on this page was new.
    return same_num != len(comment_nodes)
评论列表
文章目录