website_language_extraction.py 文件源码-python代码片段

website_language_extraction.py 文件源码
python
阅读 22 收藏 0 点赞 0 评论 0
def get_website_languages(self,json_data):
        """Detect the language of every distinct URL referenced in json_data.

        json_data is iterated as a mapping whose values are iterables of URL
        strings (``json_data[article]`` is looped over for each key).

        Each URL is downloaded, decoded to unicode, reduced to plain text
        (html2text -> markdown -> BeautifulSoup text nodes) and passed to
        ``detect``.  A URL that already has a result is not fetched again.

        Returns a dict mapping URL -> value returned by ``detect``.  URLs
        whose processing timed out or raised any other exception are
        reported on stdout and left out of the result; no exception from a
        single URL propagates to the caller.

        NOTE(review): the 10 second limit relies on a SIGALRM handler that
        raises TimeoutException being installed elsewhere -- confirm.
        """
        url_language_dictionary = {}

        for article in json_data:
            for url in json_data[article]:
                if url in url_language_dictionary:
                    continue

                html = None

                # start a timeout counter
                signal.alarm(10)

                try:
                    html = urllib.urlopen(url)

                    # Read the body exactly once: the response is a stream,
                    # so a second read() would return an empty string.
                    raw_html = html.read()

                    encoding = html.headers.getparam('charset')

                    if encoding is None:
                        # chardet reports None when it cannot guess; fall
                        # back to UTF-8 (undecodable bytes are replaced).
                        encoding = chardet.detect(raw_html)['encoding'] or 'utf-8'

                    encoded_html = unicode(raw_html, encoding, errors='replace')

                    markup_text = html2text.html2text(encoded_html)

                    html_from_markup = markdown(markup_text)

                    text = ''.join(BeautifulSoup(html_from_markup, "lxml").findAll(text=True))

                    url_language_dictionary[url] = detect(text)
                except TimeoutException:
                    print("timeout for: " + url)
                except Exception as exception:
                    print("Continue after " + exception.__class__.__name__ + " for URL: " + url)
                finally:
                    # Cancel the pending alarm first so it cannot fire during
                    # cleanup or after this method has returned.
                    signal.alarm(0)
                    if html is not None:
                        html.close()

        return url_language_dictionary