def extract_features(doc):
    """Build a feature dict (plain text and language) for one document.

    The markup is read from ``doc['html']``; a falsy value is treated as
    an empty string. Unless ``doc_is_extra_sampled(doc)`` is truthy, the
    value is first run through base64 decoding, gzip decompression and
    UTF-8 decoding; if any of those steps raises, the value is used as-is.

    Returns a dict with two keys:
        'text':     the result of ``html_text.extract_text`` on the markup.
        'language': the result of ``langdetect.detect`` on that text, or
                    ``None`` when detection raises ``LangDetectException``.
    """
    raw = doc['html'] or ''

    if doc_is_extra_sampled(doc):
        # These documents skip the decode/decompress attempt entirely.
        markup = raw
    else:
        try:
            packed = base64.b64decode(raw)
            markup = gzip.decompress(packed).decode('utf8')
        except Exception:
            # Deliberate best-effort: fall back to the stored value so
            # that html which was never compressed is supported too.
            markup = raw

    page_text = html_text.extract_text(markup)

    try:
        detected = langdetect.detect(page_text)
    except LangDetectException:
        detected = None

    return {'text': page_text, 'language': detected}
评论列表
文章目录