core.py 文件源码

python
阅读 19 收藏 0 点赞 0 评论 0

项目:memex-dossier-open 作者: dossier 项目源码 文件源码
def find_urls_by_selector(self, selector, use_soft=True):
    """Yield the ``_id`` of each indexed record that matches *selector*.

    A record matches when *selector* equals (``term`` query) its ``url``
    field, any field named in ``self.hard_selectors``, or -- when
    *use_soft* is true -- any field named in ``self.soft_selectors``.
    The clauses are combined under a ``bool``/``should`` query, so a
    match on any single field is sufficient.

    If ``self.index`` does not exist yet it is created via
    ``self.create_index()`` before searching.

    :param selector: value to look up in the selector fields
    :param use_soft: also search the ``self.soft_selectors`` fields
    :returns: generator of hit ``_id`` values, each yielded at most once
        (the ids are treated as URLs by this method)

    If the search raises ``NotFoundError`` a warning is logged and
    nothing is yielded.

    NOTE(review): no ``size`` is passed to the search, so only the
    server's default page of hits is returned -- confirm that is intended.
    """
    if not self.conn.indices.exists(index=self.index):
        self.create_index()

    # 'url' is always searched; hard selectors always, soft ones on request.
    fields = ['url']
    fields.extend(self.hard_selectors)
    if use_soft:
        fields.extend(self.soft_selectors)
        logger.debug('including soft_selectors: %r', self.soft_selectors)

    # TODO: consider a 'cross_fields' multi_match over the selector
    # fields (blending in soft_selectors) instead of per-field terms.
    query = {
        "query": {
            "bool": {
                "should": [{'term': {key: selector}} for key in fields],
            }
        }
    }

    # Only the search call is guarded, so unrelated errors raised while
    # the caller consumes the generator are not mistaken for a missing
    # index.
    try:
        # Only '_id' is read from each hit below; the empty
        # _source_include presumably avoids fetching document bodies.
        res = self.conn.search(
            index=self.index, doc_type=RECORD_TYPE,
            _source_include=[], body=query)
    except NotFoundError as exc:
        logger.warning('akagraph indexes do not exist yet: %s', exc)
        return

    visited_urls = set()
    for hit in res['hits']['hits']:
        url = hit['_id']
        if url not in visited_urls:
            visited_urls.add(url)
            yield url
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号