def __extract_metadata(self, doc, payload):
    """Ask the Tika server for the metadata of a document.

    The payload is sent as the body of a ``PUT /meta`` request over the
    connection obtained for the URL configured under ``helper.TIKA_META``,
    and the JSON reply is decoded.

    Args:
        doc: Document descriptor. ``doc.path`` supplies the file name for
            the Content-Disposition header (and for log messages);
            ``doc.meta['Content-Type']``, when truthy, is forwarded as the
            request Content-Type.
        payload: Seekable file-like object holding the document body. It
            is read from offset 0 and rewound to offset 0 afterwards.

    Returns:
        The decoded JSON reply, or an empty dict when the server answers
        with a status >= 400 or the reply is not valid UTF-8 encoded JSON.
    """
    filename = os.path.basename(doc.path)
    headers = {
        'Accept': 'application/json',
        'Content-Disposition':
            'attachment; filename=%s' % url_quote(filename),
    }
    content_type = doc.meta['Content-Type']
    if content_type:
        headers['Content-Type'] = content_type

    tika_url = self.config.get(helper.TIKA_META)
    connection = self.config[helper.INJECTOR].get_http_connection(tika_url)

    payload.seek(0)
    try:
        connection.request('PUT', '/meta', payload.read(), headers)
    finally:
        # Rewind even when the request fails, so the payload is never left
        # positioned at EOF for whoever reads it next.
        payload.seek(0)

    response = connection.getresponse()
    try:
        if response.status >= 400:
            logging.error('tika error %d (%s): %s', response.status,
                          response.reason, doc.path)
            # The whole response has to be read before the connection can
            # safely carry another request; presumably the injector hands
            # out reusable connections (verify against get_http_connection).
            # Draining is best-effort: a failure here must not turn the
            # "return {}" error path into an exception.
            try:
                response.read()
            except Exception:
                logging.debug('could not drain tika error response for %s',
                              doc.path, exc_info=True)
            return {}
        response_data = response.read()
    finally:
        response.close()

    try:
        return json.loads(response_data.decode('utf-8'))
    except (ValueError, UnicodeDecodeError):
        logging.error('invalid response from tika for %s', doc.path)
        return {}
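# NOTE(review): stray web-page navigation labels ("评论列表" = comment list, "文章目录" = table of contents); not part of the code.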