def find_urls_by_selector(self, selector, use_soft=True):
    """Yield the ids of indexed records whose selector fields match.

    Builds a ``bool``/``should`` query containing one ``term`` clause for
    the ``url`` field, one for every field named in
    ``self.hard_selectors`` and, when ``use_soft`` is true, one for every
    field named in ``self.soft_selectors``.  Each clause compares its
    field against ``selector``.

    This is a generator, so nothing below (including the index existence
    check) runs until the caller starts iterating.

    :param selector: value placed in every ``term`` clause.
    :param use_soft: when true, also query the ``self.soft_selectors``
        fields.
    :returns: generator over the ``_id`` of each hit (treated here as
        the URL), each yielded at most once, in the order the search
        returned them.  Yields nothing if the search raises
        ``NotFoundError``.
    """
    # Make sure the index is there before querying it.
    if not self.conn.indices.exists(index=self.index):
        self.create_index()

    # Every field that may hold the selector; 'url' is always searched.
    fields = ['url']
    fields.extend(self.hard_selectors)
    if use_soft:
        fields.extend(self.soft_selectors)
        logger.debug('including soft_selectors: %r', self.soft_selectors)

    query = {
        'query': {
            'bool': {
                'should': [{'term': {key: selector}} for key in fields],
            }
        }
    }

    # TODO: consider a 'cross_fields' multi_match over hard_selectors
    # (with soft_selectors blended in) instead of one term clause per
    # field.
    # NOTE(review): no explicit result size is passed, so presumably only
    # the server's default number of hits comes back -- confirm.
    try:
        # Only the search itself is expected to raise NotFoundError, so
        # keep the guarded region to this single call.
        res = self.conn.search(
            index=self.index, doc_type=RECORD_TYPE,
            _source_include=[], body=query)
    except NotFoundError as exc:
        logger.warning('akagraph indexes do not exist yet: %s', exc)
        return

    # De-duplicate so that each id is reported once.
    visited_urls = set()
    for hit in res['hits']['hits']:
        url = hit['_id']
        if url not in visited_urls:
            visited_urls.add(url)
            yield url
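# (Web-page residue, not part of the code: "Comment list")
# (Web-page residue, not part of the code: "Table of contents")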