def index_job(link):
"""
Index a single page.
"""
print("index page : %s"%link)
# get final url after possible redictions
try :
link = url.crawl(link).url
except :
return 0
process = CrawlerProcess({
'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
'DOWNLOAD_TIMEOUT':100,
'REDIRECT_ENABLED':False,
'SPIDER_MIDDLEWARES' : {
'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware':True
}
})
process.crawl(crawler.SingleSpider, start_urls=[link,], es_client=client, redis_conn=redis_conn)
process.start() # block until finished
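# crawler.SingleSpider is defined elsewhere in this project and is not shown in this
# listing. A minimal sketch of what such a spider might look like, assuming it only
# extracts the page title and carries the Elasticsearch/Redis clients through
# unchanged (selectors and fields here are hypothetical, purely illustrative):
import scrapy

class SingleSpider(scrapy.Spider):
    name = "single"

    def __init__(self, start_urls=None, es_client=None, redis_conn=None, *args, **kwargs):
        super(SingleSpider, self).__init__(*args, **kwargs)
        self.start_urls = start_urls or []
        self.es_client = es_client
        self.redis_conn = redis_conn

    def parse(self, response):
        # yield a bare-bones item; the real spider would index the page here
        yield {
            "url": response.url,
            "title": response.css("title::text").extract_first(),
        }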
def applyConfig(self):
try:
print("Connecting to '%s', index '%s'" % (self.confESHost, self.confESIndex))
self.es = connections.create_connection(hosts=[self.confESHost])
self.idx = Index(self.confESIndex)
self.idx.doc_type(DocHTTPRequestResponse)
if self.idx.exists():
self.idx.open()
else:
self.idx.create()
self.callbacks.saveExtensionSetting("elasticburp.host", self.confESHost)
self.callbacks.saveExtensionSetting("elasticburp.index", self.confESIndex)
self.callbacks.saveExtensionSetting("elasticburp.tools", str(self.confBurpTools))
self.callbacks.saveExtensionSetting("elasticburp.onlyresp", str(int(self.confBurpOnlyResp)))
except Exception as e:
JOptionPane.showMessageDialog(self.panel, "<html><p style='width: 300px'>Error while initializing ElasticSearch: %s</p></html>" % (str(e)), "Error", JOptionPane.ERROR_MESSAGE)
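# DocHTTPRequestResponse is declared elsewhere in this extension; a minimal sketch of
# such a DocType, assuming the elasticsearch_dsl 5.x API used throughout these
# snippets (the field names below are hypothetical):
from elasticsearch_dsl import DocType, Date, Integer, Keyword, Text

class DocHTTPRequestResponse(DocType):
    timestamp = Date()
    protocol = Keyword()
    host = Keyword()
    port = Integer()
    request_body = Text()
    response_body = Text()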
### ITab ###
def execute(self):
idx = es.Index(self.index)
try:
idx.delete(ignore=404)
log.info("Deleted index %s", self.index)
except (AttributeError, NotFoundError):
log.warning("Could not delete index '%s', ignoring", self.index)
# create doc types
for dt in self.doc_types:
idx.doc_type(dt)
# create index
idx.create()
def _synchronise_index(self, sql_table_cls, es_doc_cls, id_logger):
es_doc = es_doc_cls()
self._logging(logging.INFO,
'Synchronizing %s index.' % es_doc.index)
with acquire_inter_process_lock('sync_%s' % es_doc.index) as acquired:
if not acquired:
es_doc = es_doc_cls()
err_msg = 'Another process is already synchronizing the %s ' \
'index, aborting now.' % es_doc.index
self._logging(logging.WARNING, err_msg)
else:
self._perform_index_sync(sql_table_cls, es_doc_cls, id_logger)
self._logging(logging.INFO,
'Index %s is now synchronized.' % es_doc.index)
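# acquire_inter_process_lock is a project helper that is not part of this listing; a
# minimal sketch of one possible implementation, using a non-blocking advisory file
# lock (the lock-file path is a hypothetical choice):
import fcntl
from contextlib import contextmanager

@contextmanager
def acquire_inter_process_lock(name):
    lock_file = open('/tmp/%s.lock' % name, 'w')
    acquired = False
    try:
        try:
            fcntl.flock(lock_file, fcntl.LOCK_EX | fcntl.LOCK_NB)
            acquired = True
        except (IOError, OSError):
            pass
        yield acquired
    finally:
        if acquired:
            fcntl.flock(lock_file, fcntl.LOCK_UN)
        lock_file.close()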
def _perform_geocomplete_index_population(self, max_doc):
elasticsearch_conn = connections.get_connection()
to_index = list()
for i, document in enumerate(self._geocompletion_documents()):
if i % max_doc == 0:
log_msg = 'Computing required geoloc-entry documents.'
self._logging(logging.INFO, log_msg)
to_index.append(document.to_dict(True))
if len(to_index) < max_doc:
continue
self._geocomplete_index_batch(elasticsearch_conn, to_index)
to_index = list()
if len(to_index) != 0:
self._geocomplete_index_batch(elasticsearch_conn, to_index)
elasticsearch_dsl.Index('geocomplete').refresh()
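# _geocomplete_index_batch is not shown in this listing; a minimal sketch of such a
# method on the same class, feeding the already-serialized documents
# (document.to_dict(True) above includes the index metadata) to the bulk helper.
# It assumes self._logging and the logging module from the surrounding file:
from elasticsearch.helpers import bulk

def _geocomplete_index_batch(self, elasticsearch_conn, to_index):
    self._logging(logging.INFO,
                  'Indexing %s geoloc-entry documents.' % len(to_index))
    bulk(elasticsearch_conn, to_index)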
def _perform_index_purge(self, index_name, index_settings, doc_type_class):
log_msg = 'Dropping %s index.' % index_name
self._logging(logging.INFO, log_msg)
index = elasticsearch_dsl.Index(index_name)
index.settings(**index_settings)
index.doc_type(doc_type_class)
try:
index.delete(ignore=404)
index.create()
except elasticsearch.exceptions.ElasticsearchException as e:
log_msg = 'Error while dropping %s index: %s.' % (index_name, e)
self._logging(logging.ERROR, log_msg)
return
log_msg = 'Index %s has been dropped successfully.' % index_name
self._logging(logging.INFO, log_msg)
def index():
"""
URL : /index
Index a new URL in search engine.
Method : POST
Form data :
- url : the url to index [string, required]
Return a success message.
"""
# get POST data
data = dict((key, request.form.get(key)) for key in request.form.keys())
if "url" not in data :
raise InvalidUsage('No url specified in POST data')
# launch exploration job
index_job.delay(data["url"])
return "Indexing started"
def _create_index(self):
dt = datetime.utcnow()
dt = dt.strftime('%Y.%m')
es = connections.get_connection()
if not es.indices.exists('indicators-{}'.format(dt)):
index = Index('indicators-{}'.format(dt))
index.aliases(live={})
index.doc_type(Indicator)
index.create()
m = Mapping('indicator')
m.field('indicator_ipv4', 'ip')
m.field('indicator_ipv4_mask', 'integer')
m.save('indicators-{}'.format(dt))
return 'indicators-{}'.format(dt)
def mitm_request(self, data):
# Initialize ES connection and index
res = connections.create_connection(hosts=[args.elasticsearch])
idx = Index(args.index)
idx.doc_type(DocHTTPRequestResponse)
try:
DocHTTPRequestResponse.init()
idx.create()
except Exception:
# ignore failures such as the index already existing
pass
r = HTTPRequest(data)
# determine url
if self.is_connect:
scheme = "https"
else:
scheme = "http"
url = scheme + "://" + self.hostname
if scheme == "http" and int(self.port) != 80 or scheme == "https" and int(self.port) != 443:
url += ":" + str(self.port)
url += self.path
if args.verbose:
print(url)
self.doc = DocHTTPRequestResponse(host=self.hostname, port=int(self.port), protocol=scheme)
self.doc.meta.index = args.index
self.doc.request.url = url
self.doc.request.requestline = r.requestline
self.doc.request.method = r.command
self.doc.host = self.hostname
self.doc.port = int(self.port)
self.doc.protocol = scheme
return data
def createMenuItems(self, invocation):
menuItems = list()
selectedMsgs = invocation.getSelectedMessages()
if selectedMsgs is not None and len(selectedMsgs) >= 1:
menuItems.append(JMenuItem("Add to ElasticSearch Index", actionPerformed=self.genAddToES(selectedMsgs, invocation.getInputEvent().getComponent())))
return menuItems
def migrate():
hidden_services = Index('hiddenservices')
hidden_services.delete(ignore=404)
hidden_services = Index('hiddenservices')
hidden_services.doc_type(DomainDocType)
hidden_services.doc_type(PageDocType)
hidden_services.settings(number_of_shards=8, number_of_replicas=1)
hidden_services.create()
def tearDown(self):
index = Index(settings.ELASTICSEARCH_INDEX)
index.delete(ignore=404)
def _index_img(self, img):
"""Index a single img and ensure that it's been propagated to the search engine"""
image = search.db_image_to_index(img)
image.save()
index = Index(name=settings.ELASTICSEARCH_INDEX)
index.flush(force=True)
index.refresh()
def index_all_images(self, chunk_size=DEFAULT_CHUNK_SIZE, num_iterations=DEFAULT_NUM_ITERATIONS,
num_threads=DEFAULT_NUM_THREADS):
"""Index every record in the database with a server-side cursor"""
index = Index(settings.ELASTICSEARCH_INDEX)
if not index.exists():
log.info("Creating new index %s", settings.ELASTICSEARCH_INDEX)
search.Image.init()
mapping = search.Image._doc_type.mapping
mapping.save(settings.ELASTICSEARCH_INDEX)
log.info("Done creating new index")
with Pool(num_threads) as pool:
starts = [i * chunk_size for i in range(0, num_iterations)]
pool.starmap(do_index, zip(starts, itertools.repeat(chunk_size, len(starts))))
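# do_index is the worker handed to pool.starmap above and is defined elsewhere; a
# minimal sketch of what such a worker might do, assuming a Django model
# models.Image and the search.db_image_to_index helper used in _index_img
# (both come from the surrounding project and are not defined here):
def do_index(start, chunk_size):
    from elasticsearch.helpers import bulk
    from elasticsearch_dsl.connections import connections

    es = connections.get_connection()
    # models and search are modules of the surrounding project (assumption)
    qs = models.Image.objects.all()[start:start + chunk_size]
    actions = (search.db_image_to_index(img).to_dict(include_meta=True) for img in qs)
    bulk(es, actions)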
def test_aggregation_without_events(app, es_with_templates):
"""Check that the aggregation doesn't crash if there are no events.
This scenario happens when celery starts aggregating but no events
have been created yet.
"""
# Aggregate events
StatAggregator(event='file-download',
aggregation_field='file_id',
aggregation_interval='day',
query_modifiers=[]).run()
assert not Index(
'stats-file-download', using=current_search_client
).exists()
# Create the index but without any event. This happens when the events
# have been indexed but are not yet searchable (before index refresh).
Index('events-stats-file-download-2017',
using=current_search_client).create()
# Wait for the index to be available
time.sleep(1)
# Aggregate events
StatAggregator(event='file-download',
aggregation_field='file_id',
aggregation_interval='day',
query_modifiers=[]).run()
assert not Index(
'stats-file-download', using=current_search_client
).exists()
def get_bookmark(self):
"""Get last aggregation date."""
if not Index(self.aggregation_alias,
using=self.client).exists():
if not Index(self.event_index,
using=self.client).exists():
return datetime.date.today()
return self._get_oldest_event_timestamp()
# retrieve the oldest bookmark
query_bookmark = Search(
using=self.client,
index=self.aggregation_alias,
doc_type='{0}-bookmark'.format(self.event)
)[0:1].sort(
{'date': {'order': 'desc'}}
)
bookmarks = query_bookmark.execute()
# if no bookmark is found but the index exists, the bookmark was somehow
# lost or never written, so restart from the beginning
if len(bookmarks) == 0:
return self._get_oldest_event_timestamp()
# parse the stored bookmark date using the doc_id_suffix format
bookmark = datetime.datetime.strptime(bookmarks[0].date,
self.doc_id_suffix)
return bookmark
def run(self):
"""Calculate statistics aggregations."""
# If no events have been indexed there is nothing to aggregate
if not Index(self.event_index, using=self.client).exists():
return
lower_limit = self.get_bookmark()
# Stop here if no bookmark could be estimated.
if lower_limit is None:
return
upper_limit = min(
datetime.datetime.utcnow().
replace(microsecond=0),
datetime.datetime.combine(lower_limit +
datetime.timedelta(self.batch_size),
datetime.datetime.min.time())
)
while upper_limit <= datetime.datetime.utcnow():
self.indices = set()
self.new_bookmark = upper_limit.strftime(self.doc_id_suffix)
bulk(self.client,
self.agg_iter(lower_limit, upper_limit),
stats_only=True,
chunk_size=50)
# Flush all indices which have been modified
current_search_client.indices.flush(
index=','.join(self.indices),
wait_if_ongoing=True
)
self.set_bookmark()
self.indices = set()
lower_limit = lower_limit + datetime.timedelta(self.batch_size)
upper_limit = min(datetime.datetime.utcnow().
replace(microsecond=0),
lower_limit +
datetime.timedelta(self.batch_size))
if lower_limit > upper_limit:
break
def execute(self):
"""
Index data of specified queryset
"""
client = elasticsearch.Elasticsearch(
hosts=settings.ELASTIC_SEARCH_HOSTS,
# sniff_on_start=True,
retry_on_timeout=True,
refresh=True
)
start_time = time.time()
duration = time.time()
loop_time = elapsed = duration - start_time
for batch_i, total_batches, start, end, total, qs in self.batch_qs():
loop_start = time.time()
total_left = ((total_batches - batch_i) * loop_time)
progress_msg = \
'%s of %s : %8s %8s %8s duration: %.2f left: %.2f' % (
batch_i, total_batches, start, end, total, elapsed,
total_left
)
log.debug(progress_msg)
helpers.bulk(
client, (self.convert(obj).to_dict(include_meta=True)
for obj in qs),
raise_on_error=True,
refresh=True
)
now = time.time()
elapsed = now - start_time
loop_time = now - loop_start
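# batch_qs is a helper on this command class that yields
# (batch_number, total_batches, start, end, total, queryset_slice) tuples; a minimal
# standalone sketch over a Django-style queryset with a fixed batch size
# (hypothetical, shown only to make the loop above easier to follow):
def batch_qs(qs, batch_size=1000):
    total = qs.count()
    total_batches = (total + batch_size - 1) // batch_size
    for i in range(total_batches):
        start = i * batch_size
        end = min(start + batch_size, total)
        yield i + 1, total_batches, start, end, total, qs[start:end]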
def _perform_index_sync(self, sql_table_cls, es_doc_cls, id_logger):
es_doc = es_doc_cls()
elasticsearch_conn = connections.get_connection()
sync_timestamp = current_server_timestamp()
pending_insertions = self._compute_dirty_documents(
sql_table_cls, es_doc.doc_type)
bulk_op = self._synchronisation_op(es_doc, pending_insertions)
self._logging(logging.INFO, 'Performing synchronization.')
for ok, info in parallel_bulk(elasticsearch_conn, bulk_op):
obj_id = info['index']['_id'] \
if 'index' in info else info['update']['_id']
if ok:
# Mark the task as handled so we don't re-process it next time
self._logging(logging.INFO,
'Document %s has been synced successfully.'
% obj_id)
sql_table_cls.update_last_sync(obj_id, sync_timestamp)
else:
id_logger(obj_id, logging.ERROR,
'Error while syncing document %s index.' % obj_id)
# Refresh indices to increase research speed
elasticsearch_dsl.Index(es_doc.index).refresh()
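# _synchronisation_op builds the action stream consumed by parallel_bulk above; a
# minimal sketch that turns the pending rows into plain bulk index actions
# (the row attributes id and to_document_dict() are hypothetical):
def _synchronisation_op(self, es_doc, pending_insertions):
    for row in pending_insertions:
        yield {
            '_op_type': 'index',
            '_index': es_doc.index,
            '_type': es_doc.doc_type,
            '_id': row.id,
            '_source': row.to_document_dict(),
        }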
def create_indices(endpoint):
"""
Creates the 'pic' index with Constituent and Address doc types
"""
connections.connections.create_connection(hosts=[endpoint], timeout=360, max_retries=10, retry_on_timeout=True)
pic_index = Index('pic')
pic_index.doc_type(Constituent)
pic_index.doc_type(Address)
pic_index.delete(ignore=404)
pic_index.settings(
number_of_shards=5,
number_of_replicas=2
)
pic_index.create()
def __init__(self, config_file='config.cfg'):
super(Elastic, self).__init__()
self.percentage = 10.0
self.minimum_occurrences = 250
# The ConfigParser documentation points out that there is no way to force
# default values for options outside the "DEFAULT" section.
config = ConfigParser()
config.read(config_file)
if not config.has_section('elastic'):
config.add_section('elastic')
for option, value in {'use_ssl': 'True', 'host': '127.0.0.1', 'version': '2', 'index': 'nxapi', 'doc_type': 'events'}.items():
if not config.has_option('elastic', option):
config.set('elastic', option, value)
self.version = config.getint('elastic', 'version')
self.index = config.get('elastic', 'index')
use_ssl = config.getboolean('elastic', 'use_ssl')
host = config.get('elastic', 'host')
self.doc_type = config.get('elastic', 'doc_type')
self.client = connections.create_connection(hosts=[host], use_ssl=use_ssl, index=self.index, version=self.version, doc_type=self.doc_type, timeout=30, retry_on_timeout=True )
Event.init(index=self.index)
index = Index(self.index, using=self.client)
index.doc_type(Event)
self.initialize_search()
def handle(self, *args, **options):
if options['verbose']:
log.setLevel(logging.DEBUG)
es = search.init_es(timeout=2000)
oldindex = Index(options['oldindex'])
client = elasticsearch.client.IndicesClient(es)
# Create the new index
newindex = Index(options['newindex'])
newindex.doc_type(search.Image)
try:
newindex.create()
except elasticsearch.exceptions.RequestError as e:
if options['force']:
log.warn("Trying to delete previously-created new index %s", options['newindex'])
newindex.delete()
newindex.create()
else:
raise e
log.info("Done creating new index %s", options['newindex'])
log.info("Copying data on %s to %s", options['oldindex'], options['newindex'])
# Would love to use ES native reindex() but AWS's service doesn't support it :(
elasticsearch.helpers.reindex(es, options['oldindex'], options['newindex'])
# Wait for it to be happy
if not settings.DEBUG:
es.cluster.health(wait_for_status='green', request_timeout=2000)
# Is the value of 'oldindex' an alias or a real index?
if client.exists_alias(name=settings.ELASTICSEARCH_INDEX):
log.info("Confirmed that value of %s is an alias and not a real index" % options['oldindex'])
alias_move = """{
"actions" : [
{ "remove" : { "index" : "%s", "alias" : "%s" } },
{ "add" : { "index" : "%s", "alias" : "%s" } }
]
}""" % (options['oldindex'], settings.ELASTICSEARCH_INDEX, options['newindex'], settings.ELASTICSEARCH_INDEX)
client.update_aliases(alias_move)
elif client.exists(options['oldindex']):
log.info("%s is a real index and not an alias, fixing" % options['oldindex'])
# Delete the old index
log.info("Deleting %s -- this will cause some downtime", options['oldindex'])
oldindex.delete()
client.put_alias(options['newindex'], settings.ELASTICSEARCH_INDEX)
# Confirm number of documents in current settings
s = Search()
response = s.execute()
log.info("%d results available in %s" % (response.hits.total, settings.ELASTICSEARCH_INDEX))