pipelines.py 文件源码-python代码片段

def process_item(self,item,spider):
    m = hashlib.md5()
    m.update(item['url'])
    url_MD5 = m.hexdigest()
    content_simhash = Simhash(self.get_features(item['content'])).value
    language = 'en'
    query_json='{"fields":["url_MD5","content_simhash"],"query":{"filtered":{"filter":{"term":{"url_MD5":"'+url_MD5+'"}}}}}'
    es = Elasticsearch(host='192.168.1.14',port=9200,timeout=1000)
    res = es.search(index="hiddenwebs", body=query_json)
    if res['hits']['total'] == 0:
        es.index(index="hiddenwebs", doc_type="hiddenwebpages",body={"url":item['url'],"content":item['content'],"create_time":item['create_time'],"domain_name":item['domain_name'],"url_MD5":url_MD5,"title":item['title'],"content_simhash":content_simhash,"language":language})    
    else:   
        flag = 0
        for hit in res['hits']['hits']:
            #print content_simhash
            #print hit["fields"]["content_simhash"][0]
            if int(hit["fields"]["content_simhash"][0]) == int(content_simhash):
                log.msg('The similar pages in es %s'%(item['url']),level=log.INFO)
                flag = 1
                es.index(index="hiddenwebs", doc_type="hiddenwebpages", id=hit['_id'], body={"create_time":item['create_time']})
                break
        if flag == 0 :
            es.index(index="hiddenwebs", doc_type="hiddenwebpages",body={"url":item['url'],"content":item['content'],"create_time":item['create_time'],"domain_name":item['domain_name'],"url_MD5":url_MD5,"title":item['title'],"content_simhash":content_simhash,"language":language})