def _get_content_as_unicode(self, url):
'''
Get remote content as unicode.
We let requests handle the conversion [1] , which will use the
content-type header first or chardet if the header is missing
(requests uses its own embedded chardet version).
As we will be storing and serving the contents as unicode, we actually
replace the original XML encoding declaration with an UTF-8 one.
[1] http://github.com/kennethreitz/requests/blob/63243b1e3b435c7736acf1e51c0f6fa6666d861d/requests/models.py#L811
'''
url = url.replace(' ', '%20')
response = requests.get(url, timeout=10)
content = response.text
# Remove original XML declaration
content = re.sub('<\?xml(.*)\?>', '', content)
# Get rid of the BOM and other rubbish at the beginning of the file
content = re.sub('.*?<', '<', content, 1)
content = content[content.index('<'):]
return content
评论列表
文章目录