作者:vvil
项目:go_test_exampl
func main() {
	// Sample markup with a self-closing anchor and unclosed <li> elements.
	s := `<p>Links:<a href="a1" class="test"/></p><ul><li><a href="foo">Foo</a><li><a href="/bar/baz">BarBaz</a></ul>`

	// Walk the fully parsed DOM tree.
	doc, _ := html.Parse(strings.NewReader(s))
	traverse_html_node(doc, 0)

	// Walk the same markup with the streaming tokenizer.
	traverse_html_tokenizer(html.NewTokenizer(strings.NewReader(s)))

	// And once more, consuming full Token values.
	traverse_html_token(html.NewTokenizer(strings.NewReader(s)))
}
作者:bonnefo
项目:gobo
// TokenizePage scans an HTML document, collecting word tokens from every
// text node and capturing the (cleaned) contents of the <title> element.
// It returns the word list and the page title.
func TokenizePage(r io.Reader) ([]string, string) {
	words := []string{}
	title := ""
	inTitle := false
	z := html.NewTokenizer(r)
	for {
		switch z.Next() {
		case html.ErrorToken:
			// End of input (or malformed HTML): return what we have.
			return words, title
		case html.TextToken:
			text := string(z.Text())
			if inTitle {
				title = cleanTitle(text)
				continue
			}
			words = append(words, bstrings.TokenizeWords(text)...)
		case html.StartTagToken:
			if name, _ := z.TagName(); string(name) == "title" {
				inTitle = true
			}
		case html.EndTagToken:
			if name, _ := z.TagName(); string(name) == "title" {
				inTitle = false
			}
		}
	}
}
作者:
项目:smf-mirro
// FindLinks streams every href found in <a> tags on the returned channel.
// A zero-value link{"", ""} is sent as an end-of-stream sentinel.
func FindLinks(body io.Reader) chan link {
	out := make(chan link)
	go func() {
		tok := html.NewTokenizer(body)
		for {
			switch tok.Next() {
			case html.ErrorToken:
				// Parse error or EOF: signal completion and stop.
				out <- link{"", ""}
				return
			case html.StartTagToken:
				name, _ := tok.TagName()
				if string(name) != "a" {
					continue
				}
				// Walk the attribute list; TagAttr reports whether more remain.
				for more := true; more; {
					var key, value []byte
					key, value, more = tok.TagAttr()
					if string(key) == "href" {
						href := string(value)
						out <- link{href, href}
					}
				}
			}
		}
	}()
	return out
}
作者:pombredann
项目:walker-
// getLinks parses the response for links, doing it's best with bad HTML.
func getLinks(contents []byte) ([]*URL, error) {
utf8Reader, err := charset.NewReader(bytes.NewReader(contents), "text/html")
if err != nil {
return nil, err
}
tokenizer := html.NewTokenizer(utf8Reader)
var links []*URL
tags := getIncludedTags()
for {
tokenType := tokenizer.Next()
switch tokenType {
case html.ErrorToken:
//TODO: should use tokenizer.Err() to see if this is io.EOF
// (meaning success) or an actual error
return links, nil
case html.StartTagToken:
tagName, hasAttrs := tokenizer.TagName()
if hasAttrs && tags[string(tagName)] {
links = parseAnchorAttrs(tokenizer, links)
}
}
}
return links, nil
}
作者:JamesDunn
项目:go-openi
// Search for
// <head>
// <meta http-equiv="X-XRDS-Location" content="....">
// findMetaXrdsLocation scans an HTML document for
//   <head><meta http-equiv="X-XRDS-Location" content="..."></head>
// and returns the content attribute, or an error when the tag is absent or
// the tokenizer fails before finding it.
func findMetaXrdsLocation(input io.Reader) (location string, err error) {
tokenizer := html.NewTokenizer(input)
// Only <meta> tags inside <head> are considered.
inHead := false
for {
tt := tokenizer.Next()
switch tt {
case html.ErrorToken:
// EOF or malformed input before the meta tag was found.
return "", tokenizer.Err()
case html.StartTagToken, html.EndTagToken:
tk := tokenizer.Token()
if tk.Data == "head" {
if tt == html.StartTagToken {
inHead = true
} else {
// Reached </head> without finding the tag; give up, since
// X-XRDS-Location is only meaningful inside the head.
return "", errors.New("Meta X-XRDS-Location not found")
}
} else if inHead && tk.Data == "meta" {
// Require http-equiv="X-XRDS-Location" plus a non-empty content attr.
ok := false
content := ""
for _, attr := range tk.Attr {
if attr.Key == "http-equiv" &&
attr.Val == "X-XRDS-Location" {
ok = true
} else if attr.Key == "content" {
content = attr.Val
}
}
if ok && len(content) > 0 {
return content, nil
}
}
}
}
// NOTE(review): unreachable — every exit path returns inside the loop above.
return "", errors.New("Meta X-XRDS-Location not found")
}
作者:postfi
项目:spamdefende
// ExtractText collects the text content found inside the <body> element and
// passes the accumulated string through the supplied remover function.
// A tokenizer error other than io.EOF is returned as-is.
func ExtractText(reader io.Reader, remover func(string) (string, error)) (string, error) {
	tok := html.NewTokenizer(reader)
	var text bytes.Buffer
	inBody := false
	for {
		switch tok.Next() {
		case html.StartTagToken:
			if tok.Token().DataAtom == atom.Body {
				inBody = true
			}
		case html.EndTagToken:
			if tok.Token().DataAtom == atom.Body {
				inBody = false
			}
		case html.TextToken:
			if inBody {
				text.Write(tok.Text())
			}
		case html.ErrorToken:
			// io.EOF marks a clean end of input; anything else is a real error.
			if err := tok.Err(); err != io.EOF {
				return "", err
			}
			return remover(text.String())
		}
	}
}
作者:heartszhan
项目:famou
// html_detect_content_type sniffs the charset/content type declared by the
// <meta> tags in the head of an HTML fragment. It returns the detected
// content type, or "" when none is declared or the input is not HTML.
func html_detect_content_type(head []byte) string {
	z := html.NewTokenizer(bytes.NewReader(head))
	// The very first tag must be <html>; anything else means "not an HTML file".
	expect_html_root := true
FORBEGIN:
	for tt := z.Next(); tt != html.ErrorToken; tt = z.Next() {
		t := z.Token()
		switch {
		case t.Data == "meta" && (tt == html.StartTagToken || tt == html.SelfClosingTagToken):
			if ct, ok := detect_charset_by_token(t.Attr); ok {
				return ct
			}
		case t.Data == "head" && tt == html.EndTagToken:
			// Charset declarations only appear inside <head>, so stop
			// scanning once it closes. The original used a bare `break`,
			// which only exited the switch and made this case a no-op.
			break FORBEGIN
		// un-html file
		case expect_html_root && (tt == html.StartTagToken || tt == html.SelfClosingTagToken):
			if t.Data == "html" {
				expect_html_root = false
			} else {
				break FORBEGIN
			}
		}
	}
	return ""
}
作者:Renzo
项目:gorea
// Returns the href attribute of a <link rel="shortcut icon"> tag or error if not found.
func FindIcon(b []byte) (string, error) {
r := bytes.NewReader(b)
z := html.NewTokenizer(r)
for {
if z.Next() == html.ErrorToken {
if err := z.Err(); err == io.EOF {
break
} else {
return "", ErrNoIcon
}
}
t := z.Token()
switch t.DataAtom {
case atom.Link:
if t.Type == html.StartTagToken || t.Type == html.SelfClosingTagToken {
attrs := make(map[string]string)
for _, a := range t.Attr {
attrs[a.Key] = a.Val
}
if attrs["rel"] == "shortcut icon" && attrs["href"] != "" {
return attrs["href"], nil
}
}
}
}
return "", ErrNoIcon
}
作者:rodsenr
项目:go_exercise
// linkParser consumes pages from page_chan and emits every href value found
// in <a> tags on the returned channel, which is closed once page_chan drains.
func linkParser(page_chan chan string) <-chan string {
	link_chan := make(chan string)
	go func() {
		for page := range page_chan {
			tokens := html.NewTokenizer(bytes.NewBufferString(page))
			for {
				tt := tokens.Next()
				if tt == html.ErrorToken {
					fmt.Println("\nFinished to parse page")
					break
				}
				token := tokens.Token()
				// Only anchor start tags are of interest (case-insensitive).
				if tt != html.StartTagToken || !strings.EqualFold(token.Data, "A") {
					continue
				}
				for _, attr := range token.Attr {
					if strings.EqualFold(attr.Key, "HREF") {
						link_chan <- attr.Val
					}
				}
			}
		}
		close(link_chan)
	}()
	return link_chan
}
作者:jimrobinso
项目:xm
// TestPushHTML feeds tokenized XHTML samples through XmlNamespace.PushHTML/Pop
// and checks the expected prefix/URI state after every push and pop.
// NOTE(review): the loop ranges over xmlNsSamples but indexes xhtmlNsSamples —
// confirm the two slices are parallel (same length and order), or this will
// panic / test the wrong data.
func TestPushHTML(t *testing.T) {
xmlns := NewXmlNamespace()
for i := range xmlNsSamples {
// j tracks tag-nesting depth and indexes the expected prefix/uri tables.
j := 0
z := html.NewTokenizer(strings.NewReader(xhtmlNsSamples[i].sample))
for {
tt := z.Next()
if tt == html.ErrorToken {
err := z.Err()
if err == io.EOF {
// Clean end of this sample; move on to the next one.
err = nil
break
}
t.Fatal(err)
}
switch tt {
case html.StartTagToken, html.SelfClosingTagToken:
xmlns.PushHTML(z.Token())
checkState("push", j, xmlns, xhtmlNsSamples[i].prefix[j], xhtmlNsSamples[i].uri[j], t)
j++
case html.EndTagToken:
j--
checkState("pop", j, xmlns, xhtmlNsSamples[i].prefix[j], xhtmlNsSamples[i].uri[j], t)
xmlns.Pop()
}
}
}
}
作者:baiju
项目:gorea
// Sanitize strips <script> elements from s, returning the sanitized markup
// and a plain-text snippet built from the remaining text nodes.
// On a tokenizer error other than EOF the input is returned unchanged.
func Sanitize(s string) (string, string) {
	z := html.NewTokenizer(bytes.NewReader([]byte(s)))
	var clean, text bytes.Buffer
	depth := 0 // current <script> nesting depth
	for {
		if z.Next() == html.ErrorToken {
			if z.Err() != io.EOF {
				return s, snipper(s)
			}
			break
		}
		t := z.Token()
		if t.DataAtom == atom.Script {
			switch t.Type {
			case html.StartTagToken:
				depth++
			case html.EndTagToken:
				depth--
			}
			continue
		}
		// Emit tokens only when we are outside every <script> element.
		if depth == 0 {
			clean.WriteString(t.String())
			if t.Type == html.TextToken {
				text.WriteString(t.String())
			}
		}
	}
	return clean.String(), snipper(text.String())
}
作者:johnvilsac
项目:golang-stuf
// Autodiscover scans an HTML page for a <link rel="alternate"> inside
// <html><head> that points at an RSS or Atom feed, returning its href.
// ErrNoRssLink is returned when no such link exists or tokenizing fails.
func Autodiscover(b []byte) (string, error) {
	z := html.NewTokenizer(bytes.NewReader(b))
	var inHtml, inHead bool
	for {
		if z.Next() == html.ErrorToken {
			if z.Err() != io.EOF {
				return "", ErrNoRssLink
			}
			break
		}
		t := z.Token()
		switch t.DataAtom {
		case atom.Html:
			// Toggle on both the opening and closing tag.
			inHtml = !inHtml
		case atom.Head:
			inHead = !inHead
		case atom.Link:
			if !inHead || !inHtml {
				continue
			}
			if t.Type != html.StartTagToken && t.Type != html.SelfClosingTagToken {
				continue
			}
			attrs := map[string]string{}
			for _, a := range t.Attr {
				attrs[a.Key] = a.Val
			}
			isFeed := attrs["type"] == "application/rss+xml" || attrs["type"] == "application/atom+xml"
			if attrs["rel"] == "alternate" && attrs["href"] != "" && isFeed {
				return attrs["href"], nil
			}
		}
	}
	return "", ErrNoRssLink
}
作者:uovob
项目:multige
// GetAllLinks walks an HTML stream and returns the href values of <a> tags
// whose target ends in one of the comma-separated *fileType extensions.
// Protocol-relative ("//...") targets are prefixed with "http:".
func GetAllLinks(data io.ReadCloser) (links []string, err error) {
	tokenizer := html.NewTokenizer(data)
	for {
		tokenizer.Next()
		token := tokenizer.Token()
		if token.Type == html.ErrorToken {
			// EOF or malformed input: return whatever was collected.
			return
		}
		if token.Type != html.StartTagToken && token.Type != html.SelfClosingTagToken {
			continue
		}
		if *debug {
			log.Print("type ", token.Type)
			log.Print("data ", token.Data)
		}
		if token.Data != "a" {
			continue
		}
		for _, a := range token.Attr {
			if a.Key != "href" {
				continue
			}
			for _, ext := range strings.Split(*fileType, ",") {
				if !strings.HasSuffix(a.Val, ext) {
					continue
				}
				if strings.HasPrefix(a.Val, "//") {
					links = append(links, "http:"+a.Val)
				} else {
					links = append(links, a.Val)
				}
			}
		}
	}
}
作者:robertseato
项目:cree
// Given the HTML of a Goodreads bookshelf, returns the books.
func bookshelfToBooks(body io.ReadCloser) (books []Book) {
z := html.NewTokenizer(body)
books = make([]Book, 100)
for i := 0; i < 1000; {
book := new(Book)
tok := z.Next()
// fmt.Println(tok)
if tok == html.ErrorToken {
// ...
return books
}
_, atr, _ := z.TagAttr()
if strings.Contains(string(atr), "/book/show") {
_, atr, _ := z.TagAttr()
book.title = string(string(atr))
// fmt.Println("Got book:", book.title)
} else if strings.Contains(string(atr), "staticStars") {
_, atr, _ := z.TagAttr()
book.rating = getRating(string(atr))
}
if book.title != "" {
books[i] = *book
i++
}
}
return books
}
作者:postfi
项目:spamdefende
// Parse tokenizes an HTML e-mail and runs each body text node through a
// fixed pipeline of part parsers (receiver, sender, subject, post date,
// content), while feeding every <a href> to a LinkParser.
// Parsing stops at the first parser error, an explicit post.Stop, or end of
// input; an error is returned if the pipeline never reached the final
// (content) parser.
func Parse(reader io.Reader) (newPost *post.Post, err error) {
	newPost = &post.Post{}
	currentIdx := 0
	parsers := []post.PartParser{&ReceiverParser{}, &SenderParser{}, &SubjectParser{}, &PostDateParser{}, &ContentParser{}}
	linkParser := &LinkParser{}
	bodyBlock := false
	z := html.NewTokenizer(reader)
loop:
	for {
		tokenType := z.Next()
		switch tokenType {
		case html.StartTagToken:
			tk := z.Token()
			if tk.DataAtom == atom.Body {
				bodyBlock = true
			} else if tk.DataAtom == atom.A {
				for _, attr := range tk.Attr {
					if attr.Key == "href" {
						linkParser.Parse(newPost, []byte(attr.Val))
					}
				}
			}
		case html.EndTagToken:
			if z.Token().DataAtom == atom.Body {
				bodyBlock = false
			}
		case html.TextToken:
			if bodyBlock {
				flow := parsers[currentIdx].Parse(newPost, z.Text())
				switch flow {
				case post.Next:
					// Advance to the next parser, but never past the last one.
					// The original bound (currentIdx < len(parsers)) let
					// currentIdx reach len(parsers), so the next text token
					// indexed past the end of the slice and panicked.
					if currentIdx < len(parsers)-1 {
						currentIdx += 1
					}
				case post.Error:
					err = parsers[currentIdx].Err()
					break loop
				case post.Stop:
					break loop
				}
			}
		case html.ErrorToken:
			// io.EOF is the normal termination; keep any other error.
			if z.Err() != io.EOF {
				err = z.Err()
			}
			break loop
		}
	}
	// Success requires having reached the final (content) parser.
	if currentIdx != len(parsers)-1 {
		err = errors.New("malformed Post format")
	}
	return
}
作者:JamesDunn
项目:go-openi
// findProviderFromHeadLink scans an HTML document's <head> for OpenID 2
// discovery links:
//   <link rel="openid2.provider" href="..."> -> opEndpoint
//   <link rel="openid2.local_id" href="..."> -> opLocalId
// A provider endpoint is required; the local id is optional.
func findProviderFromHeadLink(input io.Reader) (opEndpoint, opLocalId string, err error) {
tokenizer := html.NewTokenizer(input)
// Only <link> tags between <head> and </head> are considered.
inHead := false
for {
tt := tokenizer.Next()
switch tt {
case html.ErrorToken:
// Even if the document is malformed after we found a
// valid <link> tag, ignore and let's be happy with our
// openid2.provider and potentially openid2.local_id as well.
if len(opEndpoint) > 0 {
return
}
return "", "", tokenizer.Err()
case html.StartTagToken, html.EndTagToken:
tk := tokenizer.Token()
if tk.Data == "head" {
if tt == html.StartTagToken {
inHead = true
} else {
// </head> reached: succeed if a provider was found, else fail.
if len(opEndpoint) > 0 {
return
}
return "", "", errors.New(
"LINK with rel=openid2.provider not found")
}
} else if inHead && tk.Data == "link" {
// Classify this <link> by its rel attribute and capture href.
provider := false
localId := false
href := ""
for _, attr := range tk.Attr {
if attr.Key == "rel" {
if attr.Val == "openid2.provider" {
provider = true
} else if attr.Val == "openid2.local_id" {
localId = true
}
} else if attr.Key == "href" {
href = attr.Val
}
}
// A single link carrying both rels is ignored on purpose.
if provider && !localId && len(href) > 0 {
opEndpoint = href
} else if !provider && localId && len(href) > 0 {
opLocalId = href
}
}
}
}
// At this point we should probably have returned either from
// a closing </head> or a tokenizer error (no </head> found).
// But just in case.
if len(opEndpoint) > 0 {
return
}
return "", "", errors.New("LINK rel=openid2.provider not found")
}
作者:sha0code
项目:dirsca
// Scan fetches surl, extracts every href/action/src attribute from the page,
// and probes same-host resources: for each candidate it also requests a
// mangled variant (extension prefixed with "test1337") and, when the status
// codes differ, records the resource as existing.
func (c *Crawl) Scan(surl string) {
//fmt.Printf("scanning %s\n",surl)
resp := c.R.LaunchNoRead("GET", surl, "")
if resp == nil || resp.Body == nil {
//fmt.Println("nil response: "+surl)
return
}
defer resp.Body.Close()
page := html.NewTokenizer(resp.Body)
for {
tokenType := page.Next()
if tokenType == html.ErrorToken {
// End of document (or parse error): mark the URL as crawled.
c.Crawled = append(c.Crawled, surl)
return
}
token := page.Token()
//if tokenType == html.StartTagToken { //&& token.DataAtom.String() == "a" {
// Note: attributes are inspected on EVERY token type, not just start tags.
for _, attr := range token.Attr {
if attr.Key == "href" || attr.Key == "action" || attr.Key == "src" {
// Normalize the URL and skip ones we have already seen.
res := c.FixUrl(attr.Val)
if res != "" && !c.IsRepeated(res) {
oUrl, err := url.Parse(res)
if err == nil {
// Only probe resources on the same host we are crawling.
if oUrl.Host == c.Host {
var test string
idx := strings.LastIndex(oUrl.Path, ".")
if idx >= 0 {
// Build a deliberately-bogus variant to learn the host's
// "not found" status code for this kind of path.
oUrl.Path = oUrl.Path[0:idx] + "test1337" + oUrl.Path[idx+1:] //TODO: if the url ends in a dot, this crashes with out-of-index
test = oUrl.String()
} else {
test = res
}
//fmt.Printf("test:%s\n",test)
_, code_not_found, _ := R.Get(test)
html, code, _ := R.Get(res)
// A different status than the bogus probe suggests the
// resource really exists.
if code != code_not_found {
P.Show("c", code, len(html), res)
c.Resources = append(c.Resources, res)
c.NewResources = append(c.NewResources, res)
}
}
}
}
}
}
}
}
作者:hobinj
项目:licentiou
// main scrapes the OSI alphabetical license index and fetches every license
// whose slug under /licenses/ contains at least one uppercase letter.
func main() {
	urls := make([]string, 0, 75)
	resp, err := http.Get("http://opensource.org/licenses/alphabetical")
	if err != nil {
		fmt.Println(err)
		return
	}
	z := html.NewTokenizer(resp.Body)
	for {
		tok := z.Next()
		if tok == html.ErrorToken {
			break
		}
		if tok != html.StartTagToken {
			continue
		}
		name, hasAttr := z.TagName()
		if string(name) != "a" || !hasAttr {
			continue
		}
		// Pull the href attribute, if present.
		href := ""
		for more := true; more; {
			var key, val []byte
			key, val, more = z.TagAttr()
			if string(key) == "href" {
				href = string(val)
			}
		}
		if !strings.HasPrefix(href, "/licenses/") {
			continue
		}
		href = strings.Replace(href, "/licenses/", "", 1)
		// All-lowercase slugs are category pages, not licenses.
		if href == strings.ToLower(href) {
			continue
		}
		urls = append(urls, href)
	}
	for _, license := range urls {
		getLicense(license)
	}
}
作者:anthonyfo
项目:gohtm
// parse parses a stirng and converts it into an html.
func parse(s string) *htmlDocument {
htmlDoc := &htmlDocument{}
tokenizer := html.NewTokenizer(strings.NewReader(s))
for {
if errorToken, _, _ := parseToken(tokenizer, htmlDoc, nil); errorToken {
break
}
}
return htmlDoc
}
作者:jimrobinso
项目:xm
// TestXMLBasePushHTML tokenizes each xmlBaseTests sample, pushing start tags
// onto the XmlBase stack, resolving expected attributes against the current
// base IRI, and popping on end tags.
//
// Fix: on io.EOF the original `return`ed from the test function, which
// silently skipped every remaining sample after the first one (contrast
// TestPushHTML, which breaks its inner loop). A labeled break now moves on
// to the next test case instead.
func TestXMLBasePushHTML(t *testing.T) {
	for i, v := range xmlBaseTests {
		xmlbase, err := NewXmlBase("")
		if err != nil {
			t.Fatal(i, err)
		}
		if verbose {
			fmt.Println(i, "created", xmlbase.baseUri, xmlbase.depth)
		}
		z := html.NewTokenizer(strings.NewReader(v.example))
		// r indexes the expected attribute resolutions for this sample.
		r := 0
	tokens:
		for {
			tt := z.Next()
			switch tt {
			case html.ErrorToken:
				err = z.Err()
				if err == io.EOF {
					// Finished this sample; continue with the next test case.
					break tokens
				}
				t.Fatal(i, err)
			case html.StartTagToken:
				node := z.Token()
				xmlbase.PushHTML(node)
				if verbose {
					fmt.Println(i, "pushed", xmlbase.baseUri, xmlbase.depth)
				}
				for _, attr := range node.Attr {
					if attr.Key == v.resolve[r].html.Key {
						if verbose {
							fmt.Println(i, "verify", attr, v.resolve[r].iri)
						}
						iri, err := xmlbase.Resolve(attr.Val)
						if err != nil {
							t.Fatal(i, r, err)
						}
						if iri != v.resolve[r].iri {
							t.Fatalf("%d %d expected '%s', got '%s'", i, r, v.resolve[r].iri, iri)
						}
						r++
					}
				}
			case html.EndTagToken:
				xmlbase.Pop()
				if verbose {
					fmt.Println(i, "popped", xmlbase.baseUri, xmlbase.depth)
				}
			}
		}
	}
}