作者:heartszhan
项目:famou
// convert_flowdocument rewrites the subtree rooted at frag in place, mapping
// HTML tag names onto FlowDocument element names.
//
// Text nodes are left untouched. img/object/video/audio/embed nodes are turned
// into childless, attribute-free comment nodes. "a" becomes "Hyperlink" with
// the attributes returned by extract_ahref_attr, "article" becomes
// "FlowDocument" carrying a single xmlns attribute, and every other node
// (including "p") becomes an attribute-free "Paragraph". The first node
// converted to a Paragraph is remembered in this.first_paragraph.
func (this *flowdocument_maker) convert_flowdocument(frag *html.Node) {
	if frag.Type == html.TextNode {
		return
	}

	switch frag.Data {
	case "img", "object", "video", "audio", "embed":
		// Media cannot be represented; keep a placeholder comment instead.
		frag.Type = html.CommentNode
		node_clear_children(frag)
		frag.Attr = nil
	case "a":
		frag.Data = "Hyperlink"
		frag.Attr = extract_ahref_attr(frag.Attr)
	case "article":
		frag.Data = "FlowDocument"
		// Setting Namespace on the node does not work, so the namespace is
		// emitted as a plain xmlns attribute.
		frag.Attr = []html.Attribute{{Key: "xmlns", Val: fdocns}}
	default:
		// "p" and every tag not handled above.
		frag.Data = "Paragraph"
		frag.Attr = nil
		if this.first_paragraph == nil {
			this.first_paragraph = frag
		}
	}

	for kid := frag.FirstChild; kid != nil; kid = kid.NextSibling {
		this.convert_flowdocument(kid)
	}
}
作者:ng
项目:GoOs
// getSiblingsContent collects paragraph selections from a sibling of the top
// node.
//
// If currentSibling is itself a non-empty <p>, it is returned as the only
// result. Otherwise every non-empty <p> below it is scored by its stop-word
// count; a paragraph is kept when that count exceeds 30% of
// baselinescoreSiblingsPara and isHighLinkDensity reports false for it. Kept
// paragraphs are returned as new single-node selections that hold only the
// paragraph's text.
//
// The result is never nil. A nil or empty currentSibling yields an empty
// slice rather than panicking in Get(0).
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection {
	ps := make([]*goquery.Selection, 0)

	// Selection.Get indexes the node slice directly, so an empty selection
	// must be rejected before calling Get(0).
	if currentSibling == nil || len(currentSibling.Nodes) == 0 {
		return ps
	}

	if currentSibling.Get(0).DataAtom == atom.P && len(currentSibling.Text()) > 0 {
		return append(ps, currentSibling)
	}

	// The threshold does not depend on the paragraph, so compute it once.
	const siblingBaselineScore = 0.30
	threshold := siblingBaselineScore * baselinescoreSiblingsPara

	currentSibling.Find("p").Each(func(i int, s *goquery.Selection) {
		text := s.Text()
		if len(text) == 0 {
			return
		}

		ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text)
		paragraphScore := ws.stopWordCount
		highLinkDensity := this.isHighLinkDensity(s)
		if threshold >= float64(paragraphScore) || highLinkDensity {
			return
		}

		// Wrap the paragraph text in a synthetic node tagged as <p> so that
		// callers can treat it like the direct-paragraph case above.
		node := &html.Node{
			Type:     html.TextNode,
			Data:     text,
			DataAtom: atom.P,
		}
		ps = append(ps, &goquery.Selection{Nodes: []*html.Node{node}})
	})
	return ps
}
作者:heartszhan
项目:famou
// node_clear_children detaches every child of frag, leaving it childless.
func node_clear_children(frag *html.Node) {
	// RemoveChild advances FirstChild, so keep removing the head until none is left.
	for frag.FirstChild != nil {
		frag.RemoveChild(frag.FirstChild)
	}
}
作者:heartszhan
项目:gextrac
// clean_element_before_header removes the leading element children of body
// until it meets a child that is either not an element node or is an element
// whose tag equals name. That child and everything after it are kept.
func clean_element_before_header(body *html.Node, name string) {
	for {
		head := body.FirstChild
		if head == nil || head.Type != html.ElementNode || head.Data == name {
			return
		}
		body.RemoveChild(head)
	}
}
作者:heartszhan
项目:gextrac
// remove_decentant removes every element named tag from the subtree below n.
// A matching element is detached together with its own subtree; all other
// children are searched recursively.
func remove_decentant(n *html.Node, tag string) {
	var following *html.Node
	for cur := n.FirstChild; cur != nil; cur = following {
		// Remember the sibling first: RemoveChild clears cur.NextSibling.
		following = cur.NextSibling
		if cur.Type == html.ElementNode && cur.Data == tag {
			n.RemoveChild(cur)
			continue
		}
		remove_decentant(cur, tag)
	}
}
作者:heartszhan
项目:famou
// html_clean_root cleans the parsed document under root and returns the node
// chosen as the article together with the iframes collected by the cleaner.
//
// uribase is parsed into the cleaner's current URL; a parse error is ignored.
// The article is picked from the headers the cleaner recorded: a unique h1
// first, then a unique h2 when there is no h1, then a unique h3, then a unique
// h4. If nothing was selected, an empty <body> element is appended to root and
// used as the article.
func html_clean_root(root *html.Node, uribase string) (*html.Node, []*html.Node) {
	cleaner := &html_cleaner{}
	cleaner.current_url, _ = url.Parse(uribase)
	cleaner.html_drop_unprintable(root)
	cleaner.remove_head()

	// Header counts are sampled once, before any article selection happens.
	h1Count := len(cleaner.header1s)
	h2Count := len(cleaner.header2s)
	h3Count := len(cleaner.header3s)
	h4Count := len(cleaner.header4s)

	found := false

	// A document with exactly one h1 usually keeps its content in the div
	// that contains that h1.
	if h1Count == 1 {
		candidate := find_article_via_header_i(cleaner.header1s[0])
		found = cleaner.try_update_article(candidate)
		if !found && cleaner.title_similar(cleaner.header1s[0].Data) {
			found = true
			cleaner.article = candidate
		}
	}

	// With no h1 and exactly one h2, the h2 is the title and its div is the
	// document content.
	if h1Count == 0 && h2Count == 1 {
		if cleaner.try_update_article(find_article_via_header_i(cleaner.header2s[0])) {
			found = true
		}
	}

	if !found && h3Count == 1 {
		if cleaner.try_update_article(find_article_via_header_i(cleaner.header3s[0])) {
			found = true
		}
	}

	if !found && h4Count == 1 {
		if cleaner.try_update_article(find_article_via_header_i(cleaner.header4s[0])) {
			found = true
		}
	}

	if cleaner.article == nil {
		fallback := &html.Node{
			Type:     html.ElementNode,
			DataAtom: atom.Body,
			Data:     "body",
		}
		cleaner.article = fallback
		root.AppendChild(fallback)
	}

	// fix_forms may turn a form into a div, so it has to run before
	// try_catch_phpwnd.
	cleaner.fix_forms()
	cleaner.try_catch_phpwnd()
	cleaner.clean_body()
	cleaner.clean_empty_nodes(cleaner.article)
	cleaner.clean_attributes(cleaner.article)

	return cleaner.article, cleaner.iframes
}
作者:heartszhan
项目:famou
// node_append_children copies a flattened view of src's children into target.
//
// Text nodes are copied as new text nodes. "a" elements and nodes for which
// node_is_object reports true are shallow-cloned, filled recursively with
// their own flattened children, and appended. Any other node is not copied
// itself; its children are flattened directly into target.
func node_append_children(src *html.Node, target *html.Node) {
	foreach_child(src, func(child *html.Node) {
		if child.Type == html.TextNode {
			target.AppendChild(create_text(child.Data))
			return
		}
		if child.Data != "a" && !node_is_object(child) {
			// Wrapper element: drop the wrapper, keep what is inside it.
			node_append_children(child, target)
			return
		}
		clone := shallow_clone_element(child)
		node_append_children(child, clone)
		target.AppendChild(clone)
	})
}
作者:heartszhan
项目:famou
// trim_small_image decides whether img should be dropped and, if so, renames
// the node to "input" and returns true.
//
// An image is dropped when it is 1x1, or when it has positive dimensions, an
// area below small_image_t squared, and an "a" parent. A node without a
// parent is never dropped.
// NOTE(review): the rename to "input" presumably lets a later pass discard
// the node - confirm against the callers.
func trim_small_image(img *html.Node) (drop bool) {
	width, height, _ := media_get_dim(img)
	if img.Parent == nil {
		return false
	}

	smallLinked := width > 0 && height > 0 &&
		width*height < small_image_t*small_image_t &&
		img.Parent.Data == "a"
	singlePixel := width == 1 && height == 1

	if !smallLinked && !singlePixel {
		return false
	}
	img.Data = "input"
	return true
}
作者:heartszhan
项目:famou
// clean_inline_node replaces the children of n with the flat list of nodes
// returned by flatten_inline_node(n).
//
// n is expected to be an element node; an inline node may still contain div
// elements. All current children are detached first, then each flattened node
// is detached from whatever parent it still has and appended to n in order.
func (this *html_cleaner) clean_inline_node(n *html.Node) {
	flattened := this.flatten_inline_node(n)

	for n.FirstChild != nil {
		n.RemoveChild(n.FirstChild)
	}

	for _, item := range flattened {
		// AppendChild panics on a node that still has a parent.
		if owner := item.Parent; owner != nil {
			owner.RemoveChild(item)
		}
		n.AppendChild(item)
	}
}
作者:heartszhan
项目:famou
// trim_empty_spaces_func strips empty content from the front of n.
//
// Starting at the first child, text nodes are rewritten with trim and other
// nodes are processed recursively. A child that is still empty according to
// node_is_not_empty is removed and the next one is examined; the walk stops at
// the first child that is not empty.
func (this *html_cleaner) trim_empty_spaces_func(n *html.Node, trim func(string) string) {
	for {
		// Removed children always come off the front, so the node to examine
		// is always the current first child.
		head := n.FirstChild
		if head == nil {
			return
		}

		if head.Type != html.TextNode {
			this.trim_empty_spaces_func(head, trim)
		} else {
			head.Data = trim(head.Data)
		}

		if node_is_not_empty(head) {
			return
		}
		n.RemoveChild(head)
	}
}
作者:rygorou
项目:wp2bloc
// cleanupTree removes the empty text nodes that the splitting process leaves
// behind. Those nodes are kept during processing because they make the data
// manipulation simpler. The whole subtree below node is visited; node types
// other than text and element are left alone.
func cleanupTree(node *html.Node) {
	child := node.FirstChild
	for child != nil {
		// Grab the sibling before a possible removal clears the link.
		following := child.NextSibling
		if child.Type == html.ElementNode {
			cleanupTree(child)
		} else if child.Type == html.TextNode && child.Data == "" {
			node.RemoveChild(child)
		}
		child = following
	}
}
作者:heartszhan
项目:gextrac
// trim_display_none renames n to "input" when its inline style attribute
// declares display:none.
//
// The style is parsed declaration by declaration, so a style such as
// "display:block; border:none" no longer counts as hidden.
// NOTE(review): the rename to "input" presumably lets a later pass discard
// the node - confirm against the callers.
func trim_display_none(n *html.Node) {
	if style_declares_display_none(get_attribute(n, "style")) {
		n.Data = "input"
	}
}

// style_declares_display_none reports whether the inline CSS in style sets the
// display property to none. Property and value are compared case-insensitively
// and a trailing "!important" is ignored. When display is declared more than
// once, the last declaration decides.
func style_declares_display_none(style string) bool {
	hidden := false
	for _, decl := range strings.Split(style, ";") {
		parts := strings.SplitN(decl, ":", 2)
		if len(parts) != 2 {
			continue
		}
		if strings.ToLower(strings.TrimSpace(parts[0])) != "display" {
			continue
		}
		value := parts[1]
		if bang := strings.Index(value, "!"); bang >= 0 {
			value = value[:bang]
		}
		hidden = strings.ToLower(strings.TrimSpace(value)) == "none"
	}
	return hidden
}
作者:rygorou
项目:wp2bloc
// processTextNode scans the text of node for shortcodes introduced by '['.
//
// An escape code (parseShortcode reports a non-zero size with an empty tag) is
// replaced in place by its inner text and scanning continues after it. The
// first real shortcode hands control to handleShortcode, whose results are
// returned as is. When no shortcode is found, tags is returned unchanged with
// the node's next sibling and a nil error.
func processTextNode(node *html.Node, tags []openTag) (outTags []openTag, next *html.Node, err error) {
	for pos := 0; pos < len(node.Data); {
		r, width := utf8.DecodeRuneInString(node.Data[pos:])
		if r != '[' {
			pos += width
			continue
		}

		size, openClose, tag, rest := parseShortcode(node.Data[pos+1:])
		switch {
		case size == 0:
			// Just a literal bracket.
			pos += width
		case tag == "":
			// Escape code: drop the outer [] and carry on after the inner text.
			node.Data = node.Data[:pos] + rest + node.Data[pos+1+size:]
			pos += len(rest)
		default:
			return handleShortcode(node, tags, pos, pos+1+size, openClose, tag, rest)
		}
	}

	// No shortcode found.
	return tags, node.NextSibling, nil
}
作者:heartszhan
项目:gextrac
// try_update_class_attr appends a class attribute with value class to b. An
// empty class leaves b untouched. The attributes are copied into a fresh
// slice, so the previous backing array of b.Attr is not modified.
func try_update_class_attr(b *html.Node, class string) {
	if class == "" {
		return
	}
	attrs := make([]html.Attribute, 0, len(b.Attr)+1)
	attrs = append(attrs, b.Attr...)
	b.Attr = append(attrs, html.Attribute{Key: "class", Val: class})
}
作者:gd
项目:Stou
// addFiles appends one element per entry of files to parent.
//
// For SCRIPT each file becomes <script src=file>; for STYLE each file becomes
// <link rel="stylesheet" href=file>. Any other form panics as soon as the
// first file is processed.
func addFiles(form uint8, parent *html.Node, files []string) {
	for idx := range files {
		path := files[idx]

		var (
			tag   string
			attrs []html.Attribute
		)
		switch form {
		case SCRIPT:
			tag = "script"
			attrs = []html.Attribute{{Key: "src", Val: path}}
		case STYLE:
			tag = "link"
			attrs = []html.Attribute{
				{Key: "rel", Val: "stylesheet"},
				{Key: "href", Val: path},
			}
		default:
			panic("Type not understood")
		}

		parent.AppendChild(&html.Node{
			Type: html.ElementNode,
			Data: tag,
			Attr: attrs,
		})
	}
}
作者:rygorou
项目:wp2bloc
// splitTextNode cuts the text node "node" in two. node keeps
// Data[:splitBefore]; a new text node holding Data[splitAfter:] is inserted
// directly after it under the same parent and returned. Any text between the
// two offsets is discarded. node must have a parent.
func splitTextNode(node *html.Node, splitBefore, splitAfter int) *html.Node {
	// Take the tail before the original data is truncated.
	tail := node.Data[splitAfter:]
	node.Data = node.Data[:splitBefore]

	tailNode := &html.Node{Type: html.TextNode, Data: tail}
	node.Parent.InsertBefore(tailNode, node.NextSibling)
	return tailNode
}
作者:heartszhan
项目:gextrac
// flatten_block_node flattens the block b into children of article.
//
//   - form, inputbox and textarea nodes produce no output.
//   - When flatt is set and is_unflatten_node(b) holds, a new element with
//     b's tag is created, b is flattened into it with flatt disabled, and the
//     element is appended to article.
//   - A block for which hasInlineNodes reports true is turned into one
//     paragraph via create_p.
//   - Otherwise each child is flattened into article in turn, receiving the
//     class computed by cat_class(b, class).
func flatten_block_node(b *html.Node, article *html.Node, flatt bool, class string) {
	childClass := cat_class(b, class)

	if b.Data == "form" || b.Data == "inputbox" || b.Data == "textarea" {
		return
	}

	if flatt && is_unflatten_node(b) {
		kept := create_element(b.Data)
		flatten_block_node(b, kept, false, class)
		article.AppendChild(kept)
		return
	}

	if hasInlineNodes(b) {
		article.AppendChild(create_p(b))
		return
	}

	foreach_child(b, func(child *html.Node) {
		flatten_block_node(child, article, true, childClass)
	})
}
作者:rygorou
项目:wp2bloc
// processNode walks the children of node, expanding shortcodes in text nodes
// and recursing into element nodes.
//
// While a shortcode tag is open, each visited child is moved under the node of
// the innermost open tag, as it stood before that child was processed. An
// error is returned when a child fails to process or when tags are still open
// after the last child.
func processNode(node *html.Node) (err error) {
	var backing [16]openTag
	open := backing[:0]

	var following *html.Node
	for cur := node.FirstChild; cur != nil; cur = following {
		following = cur.NextSibling

		// The reparenting target is fixed before cur is processed, so a tag
		// opened by cur itself does not capture cur.
		var target *html.Node
		if depth := len(open); depth > 0 {
			target = open[depth-1].node
		}

		switch cur.Type {
		case html.TextNode:
			// Processing a text node may change which node comes next.
			open, following, err = processTextNode(cur, open)
			if err != nil {
				return err
			}
		case html.ElementNode:
			if err = processNode(cur); err != nil {
				return err
			}
		}

		if target != nil {
			node.RemoveChild(cur)
			target.AppendChild(cur)
		}
	}

	if len(open) != 0 {
		return fmt.Errorf("shortcodes still open at end of surrounding HTML tag: %+v", open)
	}
	return nil
}
作者:heartszhan
项目:gextrac
// CleanHtml prunes the parsed document under root and selects the node that
// holds the article content, storing it in cleaner.Article.
//
// Steps, in order:
//   - detach every node that clean_unprintable_element collects for dropping
//     (per the original notes: link/style/css elements; tag names are also
//     lower-cased - presumably inside that helper, which is not shown here)
//   - detach the recorded <head> node, if any
//   - with exactly one h1, or no h1 and exactly one h2, offer the block found
//     from that header as the article
//   - fall back to a new, empty <body> element appended to root
//   - run the phpwind, form, body, empty-node and attribute clean-up passes
//
// Nodes that are already detached are skipped rather than dereferencing a nil
// parent.
func (cleaner *HtmlCleaner) CleanHtml(root *html.Node) {
	dropping := []*html.Node{}
	cleaner.clean_unprintable_element(&dropping, root)
	for _, drop := range dropping {
		// A node listed twice, or one that was never attached, has no parent
		// to be removed from.
		if p := drop.Parent; p != nil {
			p.RemoveChild(drop)
		}
	}

	if cleaner.head != nil && cleaner.head.Parent != nil {
		cleaner.head.Parent.RemoveChild(cleaner.head)
	}

	// A document with exactly one h1 usually keeps its content in the div
	// that contains that h1.
	if len(cleaner.header1s) == 1 {
		ab := find_article_via_header_i(cleaner.header1s[0])
		cleaner.try_update_article(ab)
	}

	// With no h1 and exactly one h2, the h2 is the title and its div is the
	// document content.
	if len(cleaner.header1s) == 0 && len(cleaner.header2s) == 1 {
		ab := find_article_via_header_i(cleaner.header2s[0])
		cleaner.try_update_article(ab)
	}

	if cleaner.Article == nil {
		cleaner.Article = &html.Node{
			Type:     html.ElementNode,
			DataAtom: atom.Body,
			Data:     "body",
		}
		root.AppendChild(cleaner.Article)
	}

	cleaner.try_catch_phpwnd()
	cleaner.fix_forms()
	cleaner.clean_body()
	cleaner.clean_empty_nodes(cleaner.Article)
	cleaner.clean_attributes(cleaner.Article)
}
作者:maxwell
项目:sanitiz
// sanitizeAttributes replaces n.Attr with only those attributes that the
// whitelist allows for n's element name, keeping their original order. The
// filtered list lives in a new slice; the old one is not modified.
func (w *Whitelist) sanitizeAttributes(n *html.Node) {
	allowed := make([]html.Attribute, 0, len(n.Attr))
	for idx := range n.Attr {
		if attr := n.Attr[idx]; w.HasAttributeForElement(n.Data, attr.Key) {
			allowed = append(allowed, attr)
		}
	}
	n.Attr = allowed
}