def __str__(self):
    """Return the first ten words of the text with HTML tags stripped."""
    plain_text = strip_tags(self.text)
    preview = Truncator(plain_text)
    # ' ...' is passed as the truncation text.
    return preview.words(10, truncate=' ...')
# Example source code using Python's strip_tags()
def clean_text(text):
    """Strip tags from ``text``, replace triple newlines with double ones and trim the ends."""
    without_tags = strip_tags(force_text(text))
    # One str.replace pass only; longer runs of newlines are not collapsed further.
    collapsed = without_tags.replace("\n\n\n", "\n\n")
    return collapsed.strip()
def item_description(self, item):
    """Return the item's description attribute with HTML tags stripped.

    The attribute to read is named by ``self.item_description_field``.
    """
    raw_description = getattr(item, self.item_description_field)
    return strip_tags(raw_description)
def __str__(self):
    """Return the first three words of the tag-stripped body."""
    plain_body = strip_tags(self.body)
    # "..." is passed as the truncation text.
    return Truncator(plain_body).words(3, truncate="...")
def _format_value(self, key, value):
    """Return ``value`` as a tag-stripped string.

    Returns ``None`` when ``value`` is falsy or when the conversion raises
    ``TypeError`` or ``ValueError``. ``key`` is not used here.
    """
    if not value:
        return None
    try:
        return strip_tags(str(value))
    except (TypeError, ValueError):
        # Conversion failures are swallowed; the caller receives None.
        return None
def item_title(self, item):
    """Return the item's text with HTML tags stripped."""
    title_source = item.text
    return strip_tags(title_source)
def get_info(cls, video_key):
    """
    Fetch metadata for a Vimeo video from the v2 XML API.

    Args:
        video_key: Video identifier interpolated into the API URL.

    Returns:
        A dict with the keys ``title``, ``description`` (tags stripped),
        ``preview_url`` and ``embed`` (iframe markup), or ``None`` when the
        request raises ``URLError`` or the response status is not 200.
    """
    req = request.Request('http://vimeo.com/api/v2/video/%s.xml' % video_key, method='GET')
    try:
        logger.debug('{0.method} {0.full_url}'.format(req))
        response = request.urlopen(req, timeout=3)
    except error.URLError:
        return None

    # The response holds an open connection; the context manager closes it on
    # every path, including the early return below.
    with response:
        if response.status != 200:
            return None
        payload = response.read()

    dom = minidom.parseString(payload)
    title = dom.getElementsByTagName('title').item(0)

    description_node = dom.getElementsByTagName('description').item(0)
    # An empty <description/> element has no text child to read from.
    description_text = description_node.firstChild
    description = description_text.data if description_text is not None else ''
    # Turn <br> variants into newlines before the remaining tags are stripped.
    description = re.sub(r'<br\s*/?>\s*', '\n', description)

    thumbnail_large = dom.getElementsByTagName('thumbnail_large').item(0)

    width = dom.getElementsByTagName('width').item(0)
    width = int(width.firstChild.data)

    height = dom.getElementsByTagName('height').item(0)
    height = int(height.firstChild.data)

    # Cap the player width at 640 and scale the height by the same ratio.
    embed_width = min(640, width)
    embed_height = int(embed_width * height / width)

    code = '<iframe src="//player.vimeo.com/video/{}" ' \
           'width="{}" height="{}" frameborder="0" ' \
           'webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>'
    return {
        'title': title.firstChild.data,
        'description': strip_tags(description),
        'preview_url': thumbnail_large.firstChild.data.replace('webp', 'jpg'),
        'embed': code.format(video_key, embed_width, embed_height)
    }
def clean(html, autoescape=None):
    """
    Apply a chain of filters: striptags, linebreaksbr, typograf, safe.
    """
    # NOTE(review): the result is marked safe after strip_tags; confirm that
    # callers only pass content for which that is acceptable.
    stripped = strip_tags(str(html))
    with_breaks = defaultfilters.linebreaksbr(stripped, autoescape=autoescape)
    return mark_safe(typograf(with_breaks))
def words_wo_stopwords(text):
    """
    Remove stop words from ``text``.

    The text is tag-stripped and tokenized. A token is dropped when its
    lowercase form is in the NLTK English stop word list, in the list that
    ``load_stop_words`` returns for ``aggregator/data/stop_words.txt``, or is
    one of ``'s`` / ``n't``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    nltk_stopwords_list = stopwords.words('english')
    specifics = load_stop_words(stop_word_file=join(settings.BASE_DIR, "aggregator", 'data', 'stop_words.txt'))
    # A set gives constant-time membership tests; the previous list was
    # scanned once per token.
    stopword_set = set(nltk_stopwords_list + specifics + ["'s", "n't"])
    words = word_tokenize(strip_tags(text))
    return " ".join(w for w in words if w.lower() not in stopword_set)
def item_description(self, item):
    """Return the item's summary with HTML tags stripped; a falsy summary is treated as ""."""
    summary = item.summary
    if not summary:
        summary = ""
    return strip_tags(summary)
async def make_summaries(title):
    """
    Summarize the review of the book with the given title and store the result.

    Declared ``async`` because the body awaits ``summarizer``; ``await`` inside
    a plain ``def`` is a syntax error.

    Args:
        title: Title used to look up the ``Books`` row.
    """
    book = Books.objects.get(title=title)
    try:
        book.summary = await summarizer(data=strip_tags(book.review), sentences=(settings.SUMMARIZER_SENTENCES+3))
        book.save()
        print("Amazon summary saved to database: {0}".format(book.summary))
    except Exception as e:
        # Best-effort: failures are printed rather than propagated.
        print(colored.red("[ERROR] At Amazon summary: {0}".format(e)))
async def processs_content(post):
    """
    Derive up to ten tags from the post content and save them on the post.

    Declared ``async`` because the body awaits ``save_tags``; ``await`` inside
    a plain ``def`` is a syntax error.

    Args:
        post: Object exposing ``content`` and ``save()``.
    """
    text = words_wo_stopwords(strip_tags(post.content))
    # TODO: this repeats tokenization already done in words_wo_stopwords.
    words = word_tokenize(strip_tags(text))
    tagged = pos_tag(words)
    significant = filter_insignificant(tagged)
    text = " ".join(significant)
    wc = WordCloudMod().generate(text)
    # Keep at most the first ten keys of the generated word cloud.
    result = list(wc.keys())[:10]
    if result:
        post = await save_tags(tags=result, entry=post)
    # NOTE(review): the original nesting of this call was lost with the
    # file's indentation; confirm whether it belongs inside the `if` above.
    post.save()
def get_ttachment(post, data):
    """
    Build the attachment dict for ``post``.

    The dict always has ``name``, ``link`` and ``description``; ``picture`` is
    added only when ``post.image`` is truthy.
    """
    attachment = dict(
        name=data['title'],
        link='{0}{1}/'.format(settings.DOMAIN, post.slug),
    )

    sentiment_label = post.sentiment or "N/A"
    summary_html = post.summary or ""
    attachment['description'] = strip_tags(summary_html) + " " + sentiment_label

    if post.image:
        attachment['picture'] = '{0}{1}'.format(settings.DOMAIN, post.image)

    return attachment
def save(self, *args, **kwargs):
    """
    Make the slug unique, refresh ``short_content`` and save the post.

    While another post already uses ``self.slug``, an increasing ``_<n>``
    suffix is appended. The instance's own row is excluded from the check so
    that re-saving an existing post does not alter its slug.
    """
    counter = 1
    while True:
        clashing = Post.objects.filter(slug=self.slug)
        if self.pk is not None:
            # Without this, a saved post would collide with itself.
            clashing = clashing.exclude(pk=self.pk)
        # exists() asks the database for a yes/no answer instead of loading rows.
        if not clashing.exists():
            break
        # Suffixes accumulate (slug_1, slug_1_2, ...), as they did originally.
        self.slug += '_' + str(counter)
        counter += 1
    self.short_content = strip_tags(self.content)[:100] + "..."
    super(Post, self).save(*args, **kwargs)
def send_multipart_email(subject, html_template, from_email, to_email):
    """Render ``html_template`` and send it with a tag-stripped text body plus an HTML alternative."""
    html_body = render_to_string(html_template)
    message = EmailMultiAlternatives(subject, strip_tags(html_body), from_email, to_email)
    message.attach_alternative(html_body, "text/html")
    message.send()
def html2text(htmltext):
    """
    Convert HTML to plain text wrapped at 72 columns.

    Tags are stripped, character references are unescaped, paragraph breaks
    are normalized to one blank line, and runs of three or more whitespace
    characters become a single space before wrapping.
    """
    try:
        # HTMLParser.unescape was removed in Python 3.9; html.unescape is the
        # supported replacement.
        from html import unescape
    except ImportError:
        # Interpreters without html.unescape still provide the method.
        unescape = HTMLParser().unescape

    text = unescape(strip_tags(htmltext))
    text = '\n\n'.join(re.split(r'\s*\n\s*\n\s*', text))
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    text = re.sub(r'\s\s\s+', ' ', text)
    wrapper = textwrap.TextWrapper(
        replace_whitespace=False, drop_whitespace=False, width=72)
    return '\n'.join(wrapper.wrap(text))
def plain(self):
    """Return the tag-stripped body, caching it in ``self._plain``.

    The value is recomputed whenever the cached value is falsy.
    """
    if self._plain:
        return self._plain
    self._plain = strip_tags(self.body)
    return self._plain
def __str__(self):
    """Return the first twenty words of the text with HTML tags stripped."""
    plain_text = strip_tags(self.text)
    return Truncator(plain_text).words(20)
def raw_text(self):
    """Return the note text with HTML tags stripped."""
    note_html = self.note_text
    return strip_tags(note_html)
# title to display in list (either self.title or beginning of text if no title)