def get_website_languages(self,json_data):
url_language_dictionary = {}
url_count = 0
for article in json_data:
for url in json_data[article]:
url_count += 1
# print url_count
if url in url_language_dictionary:
continue
# start a timeout counter
signal.alarm(10)
try:
html = urllib.urlopen(url)
encoding = html.headers.getparam('charset')
if encoding is None:
encoding = chardet.detect(html.read())['encoding']
encoded_html = unicode(html.read(),encoding , errors='replace')
markup_text = html2text.html2text(encoded_html)
html_from_markup = markdown(markup_text)
text = ''.join(BeautifulSoup(html_from_markup,"lxml").findAll(text=True))
language = detect(text)
url_language_dictionary[url] = language
except TimeoutException:
print "timeout for: " + url
except Exception as exception:
print "Continue after " + exception.__class__.__name__ + " for URL: " + url
continue
return url_language_dictionary
# Stray blog-page boilerplate left by the scrape that produced this file
# ("评论列表" = comment list, "文章目录" = article table of contents);
# commented out so the module parses.
# 评论列表
# 文章目录