作者:vvil
项目:go_test_exampl
func main() {
	// Sample markup with a self-closing anchor and unclosed <li> elements.
	s := `<p>Links:<a href="a1" class="test"/></p><ul><li><a href="foo">Foo</a><li><a href="/bar/baz">BarBaz</a></ul>`

	// Walk the fully parsed DOM tree.
	doc, _ := html.Parse(strings.NewReader(s))
	traverse_html_node(doc, 0)

	// Walk the same markup with the streaming tokenizer.
	traverse_html_tokenizer(html.NewTokenizer(strings.NewReader(s)))

	// And once more, consuming full Token values.
	traverse_html_token(html.NewTokenizer(strings.NewReader(s)))
}
作者:bonnefo
项目:gobo
// TokenizePage scans an HTML document, collecting word tokens from every
// text node and capturing the (cleaned) contents of the <title> element.
// It returns the word list and the page title.
func TokenizePage(r io.Reader) ([]string, string) {
	words := []string{}
	title := ""
	inTitle := false
	z := html.NewTokenizer(r)
	for {
		switch z.Next() {
		case html.ErrorToken:
			// End of input (or malformed HTML): return what we have.
			return words, title
		case html.TextToken:
			text := string(z.Text())
			if inTitle {
				title = cleanTitle(text)
				continue
			}
			words = append(words, bstrings.TokenizeWords(text)...)
		case html.StartTagToken:
			if name, _ := z.TagName(); string(name) == "title" {
				inTitle = true
			}
		case html.EndTagToken:
			if name, _ := z.TagName(); string(name) == "title" {
				inTitle = false
			}
		}
	}
}
作者:
项目:smf-mirro
// FindLinks streams every href found in <a> tags on the returned channel.
// A zero-value link{"", ""} is sent as an end-of-stream sentinel.
func FindLinks(body io.Reader) chan link {
	out := make(chan link)
	go func() {
		tok := html.NewTokenizer(body)
		for {
			switch tok.Next() {
			case html.ErrorToken:
				// Parse error or EOF: signal completion and stop.
				out <- link{"", ""}
				return
			case html.StartTagToken:
				name, _ := tok.TagName()
				if string(name) != "a" {
					continue
				}
				// Walk the attribute list; TagAttr reports whether more remain.
				for more := true; more; {
					var key, value []byte
					key, value, more = tok.TagAttr()
					if string(key) == "href" {
						href := string(value)
						out <- link{href, href}
					}
				}
			}
		}
	}()
	return out
}
作者:pombredann
项目:walker-
// getLinks parses the response for links, doing it's best with bad HTML.
func getLinks(contents []byte) ([]*URL, error) {
utf8Reader, err := charset.NewReader(bytes.NewReader(contents), "text/html")
if err != nil {
return nil, err
}
tokenizer := html.NewTokenizer(utf8Reader)
var links []*URL
tags := getIncludedTags()
for {
tokenType := tokenizer.Next()
switch tokenType {
case html.ErrorToken:
//TODO: should use tokenizer.Err() to see if this is io.EOF
// (meaning success) or an actual error
return links, nil
case html.StartTagToken:
tagName, hasAttrs := tokenizer.TagName()
if hasAttrs && tags[string(tagName)] {
links = parseAnchorAttrs(tokenizer, links)
}
}
}
return links, nil
}
作者:JamesDunn
项目:go-openi
// Search for
// <head>
// <meta http-equiv="X-XRDS-Location" content="....">
// findMetaXrdsLocation scans an HTML document for
//   <head><meta http-equiv="X-XRDS-Location" content="..."></head>
// and returns the content attribute, or an error when the tag is absent or
// the tokenizer fails before finding it.
func findMetaXrdsLocation(input io.Reader) (location string, err error) {
tokenizer := html.NewTokenizer(input)
// Only <meta> tags inside <head> are considered.
inHead := false
for {
tt := tokenizer.Next()
switch tt {
case html.ErrorToken:
// EOF or malformed input before the meta tag was found.
return "", tokenizer.Err()
case html.StartTagToken, html.EndTagToken:
tk := tokenizer.Token()
if tk.Data == "head" {
if tt == html.StartTagToken {
inHead = true
} else {
// Reached </head> without finding the tag; give up, since
// X-XRDS-Location is only meaningful inside the head.
return "", errors.New("Meta X-XRDS-Location not found")
}
} else if inHead && tk.Data == "meta" {
// Require http-equiv="X-XRDS-Location" plus a non-empty content attr.
ok := false
content := ""
for _, attr := range tk.Attr {
if attr.Key == "http-equiv" &&
attr.Val == "X-XRDS-Location" {
ok = true
} else if attr.Key == "content" {
content = attr.Val
}
}
if ok && len(content) > 0 {
return content, nil
}
}
}
}
// NOTE(review): unreachable — every exit path returns inside the loop above.
return "", errors.New("Meta X-XRDS-Location not found")
}
作者:postfi
项目:spamdefende
// ExtractText collects the text content found inside the <body> element and
// passes the accumulated string through the supplied remover function.
// A tokenizer error other than io.EOF is returned as-is.
func ExtractText(reader io.Reader, remover func(string) (string, error)) (string, error) {
	tok := html.NewTokenizer(reader)
	var text bytes.Buffer
	inBody := false
	for {
		switch tok.Next() {
		case html.StartTagToken:
			if tok.Token().DataAtom == atom.Body {
				inBody = true
			}
		case html.EndTagToken:
			if tok.Token().DataAtom == atom.Body {
				inBody = false
			}
		case html.TextToken:
			if inBody {
				text.Write(tok.Text())
			}
		case html.ErrorToken:
			// io.EOF marks a clean end of input; anything else is a real error.
			if err := tok.Err(); err != io.EOF {
				return "", err
			}
			return remover(text.String())
		}
	}
}
作者:heartszhan
项目:famou
// html_detect_content_type sniffs the charset/content type declared by the
// <meta> tags in the head of an HTML fragment. It returns the detected
// content type, or "" when none is declared or the input is not HTML.
func html_detect_content_type(head []byte) string {
	z := html.NewTokenizer(bytes.NewReader(head))
	// The very first tag must be <html>; anything else means "not an HTML file".
	expect_html_root := true
FORBEGIN:
	for tt := z.Next(); tt != html.ErrorToken; tt = z.Next() {
		t := z.Token()
		switch {
		case t.Data == "meta" && (tt == html.StartTagToken || tt == html.SelfClosingTagToken):
			if ct, ok := detect_charset_by_token(t.Attr); ok {
				return ct
			}
		case t.Data == "head" && tt == html.EndTagToken:
			// Charset declarations only appear inside <head>, so stop
			// scanning once it closes. The original used a bare `break`,
			// which only exited the switch and made this case a no-op.
			break FORBEGIN
		// un-html file
		case expect_html_root && (tt == html.StartTagToken || tt == html.SelfClosingTagToken):
			if t.Data == "html" {
				expect_html_root = false
			} else {
				break FORBEGIN
			}
		}
	}
	return ""
}
作者:Renzo
项目:gorea
// Returns the href attribute of a <link rel="shortcut icon"> tag or error if not found.
func FindIcon(b []byte) (string, error) {
r := bytes.NewReader(b)
z := html.NewTokenizer(r)
for {
if z.Next() == html.ErrorToken {
if err := z.Err(); err == io.EOF {
break
} else {
return "", ErrNoIcon
}
}
t := z.Token()
switch t.DataAtom {
case atom.Link:
if t.Type == html.StartTagToken || t.Type == html.SelfClosingTagToken {
attrs := make(map[string]string)
for _, a := range t.Attr {
attrs[a.Key] = a.Val
}
if attrs["rel"] == "shortcut icon" && attrs["href"] != "" {
return attrs["href"], nil
}
}
}
}
return "", ErrNoIcon
}
作者:rodsenr
项目:go_exercise
// linkParser consumes pages from page_chan and emits every href value found
// in <a> tags on the returned channel, which is closed once page_chan drains.
func linkParser(page_chan chan string) <-chan string {
	link_chan := make(chan string)
	go func() {
		for page := range page_chan {
			tokens := html.NewTokenizer(bytes.NewBufferString(page))
			for {
				tt := tokens.Next()
				if tt == html.ErrorToken {
					fmt.Println("\nFinished to parse page")
					break
				}
				token := tokens.Token()
				// Only anchor start tags are of interest (case-insensitive).
				if tt != html.StartTagToken || !strings.EqualFold(token.Data, "A") {
					continue
				}
				for _, attr := range token.Attr {
					if strings.EqualFold(attr.Key, "HREF") {
						link_chan <- attr.Val
					}
				}
			}
		}
		close(link_chan)
	}()
	return link_chan
}
作者:jimrobinso
项目:xm
// TestPushHTML feeds tokenized XHTML samples through XmlNamespace.PushHTML/Pop
// and checks the expected prefix/URI state after every push and pop.
// NOTE(review): the loop ranges over xmlNsSamples but indexes xhtmlNsSamples —
// confirm the two slices are parallel (same length and order), or this will
// panic / test the wrong data.
func TestPushHTML(t *testing.T) {
xmlns := NewXmlNamespace()
for i := range xmlNsSamples {
// j tracks tag-nesting depth and indexes the expected prefix/uri tables.
j := 0
z := html.NewTokenizer(strings.NewReader(xhtmlNsSamples[i].sample))
for {
tt := z.Next()
if tt == html.ErrorToken {
err := z.Err()
if err == io.EOF {
// Clean end of this sample; move on to the next one.
err = nil
break
}
t.Fatal(err)
}
switch tt {
case html.StartTagToken, html.SelfClosingTagToken:
xmlns.PushHTML(z.Token())
checkState("push", j, xmlns, xhtmlNsSamples[i].prefix[j], xhtmlNsSamples[i].uri[j], t)
j++
case html.EndTagToken:
j--
checkState("pop", j, xmlns, xhtmlNsSamples[i].prefix[j], xhtmlNsSamples[i].uri[j], t)
xmlns.Pop()
}
}
}
}
作者:baiju
项目:gorea
// Sanitize strips <script> elements from s, returning the sanitized markup
// and a plain-text snippet built from the remaining text nodes.
// On a tokenizer error other than EOF the input is returned unchanged.
func Sanitize(s string) (string, string) {
	z := html.NewTokenizer(bytes.NewReader([]byte(s)))
	var clean, text bytes.Buffer
	depth := 0 // current <script> nesting depth
	for {
		if z.Next() == html.ErrorToken {
			if z.Err() != io.EOF {
				return s, snipper(s)
			}
			break
		}
		t := z.Token()
		if t.DataAtom == atom.Script {
			switch t.Type {
			case html.StartTagToken:
				depth++
			case html.EndTagToken:
				depth--
			}
			continue
		}
		// Emit tokens only when we are outside every <script> element.
		if depth == 0 {
			clean.WriteString(t.String())
			if t.Type == html.TextToken {
				text.WriteString(t.String())
			}
		}
	}
	return clean.String(), snipper(text.String())
}
作者:johnvilsac
项目:golang-stuf
// Autodiscover scans an HTML page for a <link rel="alternate"> inside
// <html><head> that points at an RSS or Atom feed, returning its href.
// ErrNoRssLink is returned when no such link exists or tokenizing fails.
func Autodiscover(b []byte) (string, error) {
	z := html.NewTokenizer(bytes.NewReader(b))
	var inHtml, inHead bool
	for {
		if z.Next() == html.ErrorToken {
			if z.Err() != io.EOF {
				return "", ErrNoRssLink
			}
			break
		}
		t := z.Token()
		switch t.DataAtom {
		case atom.Html:
			// Toggle on both the opening and closing tag.
			inHtml = !inHtml
		case atom.Head:
			inHead = !inHead
		case atom.Link:
			if !inHead || !inHtml {
				continue
			}
			if t.Type != html.StartTagToken && t.Type != html.SelfClosingTagToken {
				continue
			}
			attrs := map[string]string{}
			for _, a := range t.Attr {
				attrs[a.Key] = a.Val
			}
			isFeed := attrs["type"] == "application/rss+xml" || attrs["type"] == "application/atom+xml"
			if attrs["rel"] == "alternate" && attrs["href"] != "" && isFeed {
				return attrs["href"], nil
			}
		}
	}
	return "", ErrNoRssLink
}
作者:uovob
项目:multige
// GetAllLinks walks an HTML stream and returns the href values of <a> tags
// whose target ends in one of the comma-separated *fileType extensions.
// Protocol-relative ("//...") targets are prefixed with "http:".
func GetAllLinks(data io.ReadCloser) (links []string, err error) {
	tokenizer := html.NewTokenizer(data)
	for {
		tokenizer.Next()
		token := tokenizer.Token()
		if token.Type == html.ErrorToken {
			// EOF or malformed input: return whatever was collected.
			return
		}
		if token.Type != html.StartTagToken && token.Type != html.SelfClosingTagToken {
			continue
		}
		if *debug {
			log.Print("type ", token.Type)
			log.Print("data ", token.Data)
		}
		if token.Data != "a" {
			continue
		}
		for _, a := range token.Attr {
			if a.Key != "href" {
				continue
			}
			for _, ext := range strings.Split(*fileType, ",") {
				if !strings.HasSuffix(a.Val, ext) {
					continue
				}
				if strings.HasPrefix(a.Val, "//") {
					links = append(links, "http:"+a.Val)
				} else {
					links = append(links, a.Val)
				}
			}
		}
	}
}
作者:robertseato
项目:cree
// Given the HTML of a Goodreads bookshelf, returns the books.
func bookshelfToBooks(body io.ReadCloser) (books []Book) {
z := html.NewTokenizer(body)
books = make([]Book, 100)
for i := 0; i < 1000; {
book := new(Book)
tok := z.Next()
// fmt.Println(tok)
if tok == html.ErrorToken {
// ...
return books
}
_, atr, _ := z.TagAttr()
if strings.Contains(string(atr), "/book/show") {
_, atr, _ := z.TagAttr()
book.title = string(string(atr))
// fmt.Println("Got book:", book.title)
} else if strings.Contains(string(atr), "staticStars") {
_, atr, _ := z.TagAttr()
book.rating = getRating(string(atr))
}
if book.title != "" {
books[i] = *book
i++
}
}
return books
}
作者:postfi
项目:spamdefende
// Parse tokenizes an HTML e-mail and runs each body text node through a
// fixed pipeline of part parsers (receiver, sender, subject, post date,
// content), while feeding every <a href> to a LinkParser.
// Parsing stops at the first parser error, an explicit post.Stop, or end of
// input; an error is returned if the pipeline never reached the final
// (content) parser.
func Parse(reader io.Reader) (newPost *post.Post, err error) {
	newPost = &post.Post{}
	currentIdx := 0
	parsers := []post.PartParser{&ReceiverParser{}, &SenderParser{}, &SubjectParser{}, &PostDateParser{}, &ContentParser{}}
	linkParser := &LinkParser{}
	bodyBlock := false
	z := html.NewTokenizer(reader)
loop:
	for {
		tokenType := z.Next()
		switch tokenType {
		case html.StartTagToken:
			tk := z.Token()
			if tk.DataAtom == atom.Body {
				bodyBlock = true
			} else if tk.DataAtom == atom.A {
				for _, attr := range tk.Attr {
					if attr.Key == "href" {
						linkParser.Parse(newPost, []byte(attr.Val))
					}
				}
			}
		case html.EndTagToken:
			if z.Token().DataAtom == atom.Body {
				bodyBlock = false
			}
		case html.TextToken:
			if bodyBlock {
				flow := parsers[currentIdx].Parse(newPost, z.Text())
				switch flow {
				case post.Next:
					// Advance to the next parser, but never past the last one.
					// The original bound (currentIdx < len(parsers)) let
					// currentIdx reach len(parsers), so the next text token
					// indexed past the end of the slice and panicked.
					if currentIdx < len(parsers)-1 {
						currentIdx += 1
					}
				case post.Error:
					err = parsers[currentIdx].Err()
					break loop
				case post.Stop:
					break loop
				}
			}
		case html.ErrorToken:
			// io.EOF is the normal termination; keep any other error.
			if z.Err() != io.EOF {
				err = z.Err()
			}
			break loop
		}
	}
	// Success requires having reached the final (content) parser.
	if currentIdx != len(parsers)-1 {
		err = errors.New("malformed Post format")
	}
	return
}
作者:JamesDunn
项目:go-openi
// findProviderFromHeadLink scans an HTML document's <head> for OpenID 2
// discovery links:
//   <link rel="openid2.provider" href="..."> -> opEndpoint
//   <link rel="openid2.local_id" href="..."> -> opLocalId
// A provider endpoint is required; the local id is optional.
func findProviderFromHeadLink(input io.Reader) (opEndpoint, opLocalId string, err error) {
tokenizer := html.NewTokenizer(input)
// Only <link> tags between <head> and </head> are considered.
inHead := false
for {
tt := tokenizer.Next()
switch tt {
case html.ErrorToken:
// Even if the document is malformed after we found a
// valid <link> tag, ignore and let's be happy with our
// openid2.provider and potentially openid2.local_id as well.
if len(opEndpoint) > 0 {
return
}
return "", "", tokenizer.Err()
case html.StartTagToken, html.EndTagToken:
tk := tokenizer.Token()
if tk.Data == "head" {
if tt == html.StartTagToken {
inHead = true
} else {
// </head> reached: succeed if a provider was found, else fail.
if len(opEndpoint) > 0 {
return
}
return "", "", errors.New(
"LINK with rel=openid2.provider not found")
}
} else if inHead && tk.Data == "link" {
// Classify this <link> by its rel attribute and capture href.
provider := false
localId := false
href := ""
for _, attr := range tk.Attr {
if attr.Key == "rel" {
if attr.Val == "openid2.provider" {
provider = true
} else if attr.Val == "openid2.local_id" {
localId = true
}
} else if attr.Key == "href" {
href = attr.Val
}
}
// A single link carrying both rels is ignored on purpose.
if provider && !localId && len(href) > 0 {
opEndpoint = href
} else if !provider && localId && len(href) > 0 {
opLocalId = href
}
}
}
}
// At this point we should probably have returned either from
// a closing </head> or a tokenizer error (no </head> found).
// But just in case.
if len(opEndpoint) > 0 {
return
}
return "", "", errors.New("LINK rel=openid2.provider not found")
}
作者:sha0code
项目:dirsca
// Scan fetches surl, extracts every href/action/src attribute from the page,
// and probes same-host resources: for each candidate it also requests a
// mangled variant (extension prefixed with "test1337") and, when the status
// codes differ, records the resource as existing.
func (c *Crawl) Scan(surl string) {
//fmt.Printf("scanning %s\n",surl)
resp := c.R.LaunchNoRead("GET", surl, "")
if resp == nil || resp.Body == nil {
//fmt.Println("nil response: "+surl)
return
}
defer resp.Body.Close()
page := html.NewTokenizer(resp.Body)
for {
tokenType := page.Next()
if tokenType == html.ErrorToken {
// End of document (or parse error): mark the URL as crawled.
c.Crawled = append(c.Crawled, surl)
return
}
token := page.Token()
//if tokenType == html.StartTagToken { //&& token.DataAtom.String() == "a" {
// Note: attributes are inspected on EVERY token type, not just start tags.
for _, attr := range token.Attr {
if attr.Key == "href" || attr.Key == "action" || attr.Key == "src" {
// Normalize the URL and skip ones we have already seen.
res := c.FixUrl(attr.Val)
if res != "" && !c.IsRepeated(res) {
oUrl, err := url.Parse(res)
if err == nil {
// Only probe resources on the same host we are crawling.
if oUrl.Host == c.Host {
var test string
idx := strings.LastIndex(oUrl.Path, ".")
if idx >= 0 {
// Build a deliberately-bogus variant to learn the host's
// "not found" status code for this kind of path.
oUrl.Path = oUrl.Path[0:idx] + "test1337" + oUrl.Path[idx+1:] //TODO: if the url ends in a dot, this crashes with out-of-index
test = oUrl.String()
} else {
test = res
}
//fmt.Printf("test:%s\n",test)
_, code_not_found, _ := R.Get(test)
html, code, _ := R.Get(res)
// A different status than the bogus probe suggests the
// resource really exists.
if code != code_not_found {
P.Show("c", code, len(html), res)
c.Resources = append(c.Resources, res)
c.NewResources = append(c.NewResources, res)
}
}
}
}
}
}
}
}
作者:hobinj
项目:licentiou
// main scrapes the OSI alphabetical license index and fetches every license
// whose slug under /licenses/ contains at least one uppercase letter.
func main() {
	urls := make([]string, 0, 75)
	resp, err := http.Get("http://opensource.org/licenses/alphabetical")
	if err != nil {
		fmt.Println(err)
		return
	}
	z := html.NewTokenizer(resp.Body)
	for {
		tok := z.Next()
		if tok == html.ErrorToken {
			break
		}
		if tok != html.StartTagToken {
			continue
		}
		name, hasAttr := z.TagName()
		if string(name) != "a" || !hasAttr {
			continue
		}
		// Pull the href attribute, if present.
		href := ""
		for more := true; more; {
			var key, val []byte
			key, val, more = z.TagAttr()
			if string(key) == "href" {
				href = string(val)
			}
		}
		if !strings.HasPrefix(href, "/licenses/") {
			continue
		}
		href = strings.Replace(href, "/licenses/", "", 1)
		// All-lowercase slugs are category pages, not licenses.
		if href == strings.ToLower(href) {
			continue
		}
		urls = append(urls, href)
	}
	for _, license := range urls {
		getLicense(license)
	}
}
作者:anthonyfo
项目:gohtm
// parse parses a stirng and converts it into an html.
func parse(s string) *htmlDocument {
htmlDoc := &htmlDocument{}
tokenizer := html.NewTokenizer(strings.NewReader(s))
for {
if errorToken, _, _ := parseToken(tokenizer, htmlDoc, nil); errorToken {
break
}
}
return htmlDoc
}
作者:jimrobinso
项目:xm
// TestXMLBasePushHTML tokenizes each xmlBaseTests sample, pushing start tags
// onto the XmlBase stack, resolving expected attributes against the current
// base IRI, and popping on end tags.
//
// Fix: on io.EOF the original `return`ed from the test function, which
// silently skipped every remaining sample after the first one (contrast
// TestPushHTML, which breaks its inner loop). A labeled break now moves on
// to the next test case instead.
func TestXMLBasePushHTML(t *testing.T) {
	for i, v := range xmlBaseTests {
		xmlbase, err := NewXmlBase("")
		if err != nil {
			t.Fatal(i, err)
		}
		if verbose {
			fmt.Println(i, "created", xmlbase.baseUri, xmlbase.depth)
		}
		z := html.NewTokenizer(strings.NewReader(v.example))
		// r indexes the expected attribute resolutions for this sample.
		r := 0
	tokens:
		for {
			tt := z.Next()
			switch tt {
			case html.ErrorToken:
				err = z.Err()
				if err == io.EOF {
					// Finished this sample; continue with the next test case.
					break tokens
				}
				t.Fatal(i, err)
			case html.StartTagToken:
				node := z.Token()
				xmlbase.PushHTML(node)
				if verbose {
					fmt.Println(i, "pushed", xmlbase.baseUri, xmlbase.depth)
				}
				for _, attr := range node.Attr {
					if attr.Key == v.resolve[r].html.Key {
						if verbose {
							fmt.Println(i, "verify", attr, v.resolve[r].iri)
						}
						iri, err := xmlbase.Resolve(attr.Val)
						if err != nil {
							t.Fatal(i, r, err)
						}
						if iri != v.resolve[r].iri {
							t.Fatalf("%d %d expected '%s', got '%s'", i, r, v.resolve[r].iri, iri)
						}
						r++
					}
				}
			case html.EndTagToken:
				xmlbase.Pop()
				if verbose {
					fmt.Println(i, "popped", xmlbase.baseUri, xmlbase.depth)
				}
			}
		}
	}
}