@gen.coroutine
def get_links_from_url(url):
    """Download the page at `url` and parse it for links.

    Returned links have had the fragment after `#` removed, and have been
    made absolute so, e.g. the URL 'gen.html#tornado.gen.coroutine' becomes
    'http://www.tornadoweb.org/en/stable/gen.html'.
    """
    try:
        response = yield httpclient.AsyncHTTPClient().fetch(url)
        print('fetched %s' % url)
        html = response.body if isinstance(response.body, str) \
            else response.body.decode()
        urls = [urljoin(url, remove_fragment(new_url))
                for new_url in get_links(html)]
    except Exception as e:
        print('Exception: %s %s' % (e, url))
        raise gen.Return([])

    raise gen.Return(urls)
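This snippet and the variants below call a `remove_fragment` helper that the excerpt never defines. A minimal sketch of it (following the shape of Tornado's webspider demo) and of driving the coroutine, with the seed URL as an illustration only:

    # Sketch of the undefined remove_fragment helper plus a simple driver;
    # assumes Python 3's urllib and the imports used by the snippet above.
    from urllib.parse import urldefrag

    from tornado.ioloop import IOLoop


    def remove_fragment(url):
        # urldefrag('gen.html#x') -> ('gen.html', 'x'); keep only the URL part.
        pure_url, fragment = urldefrag(url)
        return pure_url


    if __name__ == '__main__':
        links = IOLoop.current().run_sync(
            lambda: get_links_from_url('http://www.tornadoweb.org/en/stable/'))
        print('found %d links' % len(links))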
# Python 2 variant: `urljoin` lives in the `urlparse` module and
# `response.body` is already a `str`, so no decode step is needed.
@gen.coroutine
def get_links_from_url(url):
    """Download the page at `url` and parse it for links.

    Returned links have had the fragment after `#` removed, and have been
    made absolute so, e.g. the URL 'gen.html#tornado.gen.coroutine' becomes
    'http://www.tornadoweb.org/en/stable/gen.html'.
    """
    try:
        response = yield httpclient.AsyncHTTPClient().fetch(url)
        print('fetched %s' % url)
        urls = [urlparse.urljoin(url, remove_fragment(new_url))
                for new_url in get_links(response.body)]
    except Exception as e:
        print('Exception: %s %s' % (e, url))
        raise gen.Return([])

    raise gen.Return(urls)
@gen.coroutine
def get_links_from_url(url):  # collect all links found on the page at `url`
    """Download the page at `url` and parse it for links.

    Returned links have had the fragment after `#` removed, and have been
    made absolute so, e.g. the URL 'gen.html#tornado.gen.coroutine' becomes
    'http://www.tornadoweb.org/en/stable/gen.html'.
    """
    try:
        response = yield httpclient.AsyncHTTPClient().fetch(url)  # fetch the page asynchronously
        print('fetched %s' % url)
        html = response.body if isinstance(response.body, str) \
            else response.body.decode()  # decode the body to str if it arrived as bytes
        urls = [urljoin(url, remove_fragment(new_url))
                for new_url in get_links(html)]  # strip fragments and make each link absolute
    except Exception as e:
        print('Exception: %s %s' % (e, url))
        raise gen.Return([])  # Special exception to return a value from a coroutine.
    raise gen.Return(urls)  # If this exception is raised, its value argument is used as the result of the coroutine.
def get_links(html):  # parse the page's HTML for anchor links
    class URLSeeker(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)  # note: equivalent to super().__init__() here
            self.urls = []

        def handle_starttag(self, tag, attrs):
            href = dict(attrs).get('href')
            if href and tag == 'a':
                self.urls.append(href)

    url_seeker = URLSeeker()
    url_seeker.feed(html)
    print('@@' * 20)
    print(url_seeker.urls)
    print('@@' * 20)
    return url_seeker.urls  # every href collected from <a> start tags
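A quick way to exercise `get_links` on an inline document, assuming `HTMLParser` was imported as `from html.parser import HTMLParser` (Python 3):

    # Minimal check against a hand-written snippet; the HTML is illustrative.
    sample = ('<p><a href="gen.html#tornado.gen.coroutine">gen</a>'
              '<a href="web.html">web</a></p>')
    links = get_links(sample)
    print(links)  # ['gen.html#tornado.gen.coroutine', 'web.html']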
@gen.coroutine
def handle_stream(self, stream, address):
    """
    Handle one telnet connection.

    http://www.tornadoweb.org/en/stable/gen.html#tornado-gen-simplify-asynchronous-code
    """
    stream.write(TELNET_PROMPT_PREFIX)
    while True:
        try:
            command = yield stream.read_until(b'\n')
            result = self.handle_command(command.decode().strip())
            yield stream.write(result.encode() + TELNET_PROMPT_PREFIX)
        except StreamClosedError:
            break
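In the original project this method would live on a `tornado.tcpserver.TCPServer` subclass. A minimal sketch of such a host; `TelnetServer`, `handle_command`, and the prompt bytes are assumptions, not from the snippet:

    # Hypothetical host server for the coroutine above.
    from tornado import gen
    from tornado.ioloop import IOLoop
    from tornado.iostream import StreamClosedError
    from tornado.tcpserver import TCPServer

    TELNET_PROMPT_PREFIX = b'\r\n> '


    class TelnetServer(TCPServer):
        handle_stream = handle_stream  # the @gen.coroutine method shown above

        def handle_command(self, command):
            # Placeholder dispatch: echo the command back to the client.
            return 'echo: %s' % command


    if __name__ == '__main__':
        TelnetServer().listen(8023)
        IOLoop.current().start()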
def get_links(html):
    class URLSeeker(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self.urls = []

        def handle_starttag(self, tag, attrs):
            href = dict(attrs).get('href')
            if href and tag == 'a':
                self.urls.append(href)

    url_seeker = URLSeeker()
    url_seeker.feed(html)
    return url_seeker.urls
# Python 2 variant: HTMLParser is accessed as a module (``import HTMLParser``).
def get_links(html):
    class URLSeeker(HTMLParser.HTMLParser):
        def __init__(self):
            HTMLParser.HTMLParser.__init__(self)
            self.urls = []

        def handle_starttag(self, tag, attrs):
            href = dict(attrs).get('href')
            if href and tag == 'a':
                self.urls.append(href)

    url_seeker = URLSeeker()
    url_seeker.feed(html)
    return url_seeker.urls
def fetch_next(self):
    """A Future used with `gen.coroutine`_ to asynchronously retrieve the
    next document in the result set, fetching a batch of documents from the
    server if necessary. Resolves to ``False`` if there are no more
    documents, otherwise :meth:`next_object` is guaranteed to return a
    document.

    .. _`gen.coroutine`: http://tornadoweb.org/en/stable/gen.html

    .. testsetup:: fetch_next

      MongoClient().test.test_collection.remove()
      collection = MotorClient().test.test_collection

    .. doctest:: fetch_next

      >>> @gen.coroutine
      ... def f():
      ...     yield collection.insert([{'_id': i} for i in range(5)])
      ...     cursor = collection.find().sort([('_id', 1)])
      ...     while (yield cursor.fetch_next):
      ...         doc = cursor.next_object()
      ...         sys.stdout.write(str(doc['_id']) + ', ')
      ...     print 'done'
      ...
      >>> IOLoop.current().run_sync(f)
      0, 1, 2, 3, 4, done

    .. note:: While it appears that fetch_next retrieves each document from
      the server individually, the cursor actually fetches documents
      efficiently in `large batches`_.

    .. _`large batches`: http://docs.mongodb.org/manual/core/read-operations/#cursor-behaviors
    """
    future = Future()

    if not self._buffer_size() and self.alive:
        if self._empty():
            # Special case, limit of 0
            future.set_result(False)
            return future

        def cb(batch_size, error):
            if error:
                future.set_exception(error)
            else:
                future.set_result(bool(batch_size))

        self._get_more(cb)
        return future
    elif self._buffer_size():
        future.set_result(True)
        return future
    else:
        # Dead
        future.set_result(False)
        return future
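The doctest in the docstring is Python 2. A minimal Python 3 sketch of the same consumption loop, assuming a MongoDB server on localhost and the `motor` package; the database and collection names are illustrative:

    # Drives the Future-based fetch_next API from a gen.coroutine.
    import motor
    from tornado import gen
    from tornado.ioloop import IOLoop

    @gen.coroutine
    def print_ids():
        collection = motor.MotorClient().test.test_collection
        cursor = collection.find().sort([('_id', 1)])
        while (yield cursor.fetch_next):
            doc = cursor.next_object()
            print(doc['_id'])

    IOLoop.current().run_sync(print_ids)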
def stream_to_handler(self, request_handler):
    """Write the contents of this file to a
    :class:`tornado.web.RequestHandler`. This method calls `flush` on
    the RequestHandler, so ensure all headers have already been set.
    For a more complete example see the implementation of
    :class:`~motor.web.GridFSHandler`.

    Takes an optional callback, or returns a Future.

    :Parameters:
     - `callback`: Optional function taking parameters (self, error)

    .. code-block:: python

        class FileHandler(tornado.web.RequestHandler):
            @tornado.web.asynchronous
            @gen.coroutine
            def get(self, filename):
                db = self.settings['db']
                fs = yield motor.MotorGridFS(db()).open()
                try:
                    gridout = yield fs.get_last_version(filename)
                except gridfs.NoFile:
                    raise tornado.web.HTTPError(404)

                self.set_header("Content-Type", gridout.content_type)
                self.set_header("Content-Length", gridout.length)
                yield gridout.stream_to_handler(self)
                self.finish()

    .. seealso:: Tornado `RequestHandler <http://tornadoweb.org/en/stable/web.html#request-handlers>`_
    """
    written = 0
    while written < self.length:
        # Reading chunk_size at a time minimizes buffering
        chunk = yield self.read(self.chunk_size)

        # write() simply appends the output to a list; flush() sends it
        # over the network and minimizes buffering in the handler.
        request_handler.write(chunk)
        request_handler.flush()
        written += len(chunk)
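To serve the `FileHandler` from the docstring example, it would be registered on a Tornado application. A hedged sketch of that wiring; the route, port, and database name are assumptions, and the callable passed as `db` matches the `self.settings['db']` followed by `db()` pattern in the example:

    # Hypothetical application setup for the FileHandler docstring example.
    import motor
    import tornado.web
    from tornado.ioloop import IOLoop

    database = motor.MotorClient().test

    application = tornado.web.Application(
        [(r'/files/(.*)', FileHandler)],
        db=lambda: database,  # the handler calls self.settings['db']()
    )

    if __name__ == '__main__':
        application.listen(8888)
        IOLoop.current().start()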