作者:john-griffi
项目:bcoo
// FetchFullDescription downloads the page at link and returns the rendered
// HTML of the first <section class="entry-content cf"> element, or "" when
// no such element is found.
//
// NOTE(review): network/read/parse failures terminate the process via
// log.Fatal, matching the original contract; callers cannot recover.
func FetchFullDescription(link string) string {
	res, err := http.Get(link)
	if err != nil {
		log.Fatal(err)
	}
	body, err := ioutil.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		log.Fatal(err)
	}
	doc, err := html.Parse(strings.NewReader(string(body)))
	if err != nil {
		// Bug fix: this error was previously discarded, which would have
		// allowed a walk over a nil document tree.
		log.Fatal(err)
	}
	content := ""
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "section" {
			for _, a := range n.Attr {
				if a.Key == "class" && a.Val == "entry-content cf" {
					// Render the whole matching section back to HTML.
					var buf bytes.Buffer
					html.Render(&buf, n)
					content = buf.String()
					break
				}
			}
		}
		// Depth-first traversal; later matches overwrite earlier ones,
		// so the deepest/last matching section wins (original behavior).
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
	return content
}
作者:fjj59689437
项目:blog
// GetEntries walks root and loads every file with a ".txt" extension as an
// *Entry. When useSummary is true each entry's Body is reduced to its plain
// text, truncated to 500 bytes with a "..." suffix. Each entry's Id is the
// path relative to root with the ".txt" extension rewritten to ".html".
//
// Bug fix: the error returned by filepath.Walk (and the per-file walk error)
// was previously discarded, so the named err result was always nil.
func GetEntries(root string, useSummary bool) (entries []*Entry, err error) {
	err = filepath.Walk(root, func(path string, info os.FileInfo, walkErr error) error {
		if walkErr != nil {
			// Propagate filesystem errors instead of silently continuing.
			return walkErr
		}
		if strings.ToLower(filepath.Ext(path)) != ".txt" {
			return nil
		}
		// A file that fails to load is skipped (best-effort, as before).
		entry, _ := GetEntry(path)
		if entry == nil {
			return nil
		}
		entries = append(entries, entry)
		if useSummary {
			// Summarize: strip HTML to text and truncate. Parse/convert
			// failures leave the full body in place (best-effort).
			if doc, perr := html.Parse(strings.NewReader(entry.Body)); perr == nil {
				if text, terr := toText(doc); terr == nil {
					if len(text) > 500 {
						text = text[0:500] + "..."
					}
					entry.Body = text
				}
			}
		}
		// "file.txt" -> "file.html" relative to root
		// (slices off the trailing "txt" and appends "html").
		entry.Id = entry.Filename[len(root):len(entry.Filename)-3] + "html"
		return nil
	})
	return
}
作者:heyL
项目:l
// GetFeedUrl resolves the feed URL for u. If u itself serves an XML
// Content-Type it is returned directly; otherwise the page is parsed and the
// first <link rel="alternate" type="*xml*"> href is resolved against the
// final request URL and returned.
func GetFeedUrl(u string) (string, error) {
	resp, err := http.Get(u)
	if err != nil {
		return "", err
	}
	// Bug fix: the response body was never closed, leaking the underlying
	// connection on every call (including the early XML return below).
	defer resp.Body.Close()
	if strings.Contains(resp.Header.Get("Content-Type"), "xml") {
		return u, nil
	}
	tree, err := html.Parse(resp.Body)
	if err != nil {
		return "", err
	}
	sel := cascadia.MustCompile("link[rel=alternate][type*=xml]")
	alt := sel.MatchFirst(tree)
	if alt == nil {
		return "", errors.New("no feed link found")
	}
	altUrl, found := FindAttr("href", alt.Attr)
	if !found {
		return "", errors.New("missing link in alternate")
	}
	// resp.Request.URL reflects any redirects, so relative hrefs resolve
	// against the page actually served.
	return ToAbsolute(resp.Request.URL, altUrl.Val), nil
}
作者:eaburn
项目:feedm
// FixHtml parses bytes as HTML and returns well-formed HTML if the parse
// was successful, or escaped HTML, if not.
//
// The escaped-fallback is also used when rendering fails, when the rendered
// output is missing <body>...</body> markers, or when the output buffer
// grows past bytes' internal limit (see the recover below). On success only
// the content between <body> and </body> is returned.
func fixHtml(linkUrl string, wild []byte) (well []byte) {
n, err := html.Parse(bytes.NewReader(wild))
if err != nil {
return []byte(html.EscapeString(string(wild)))
}
// Rewrites <img> srcs relative to linkUrl in place (helper defined elsewhere).
fixImgs(linkUrl, n)
// bytes.Buffer panics with bytes.ErrTooLarge if it cannot grow; this defer
// converts that specific panic into the escaped fallback by assigning the
// named result. Any other panic value is re-raised. Note the defer is
// registered before the Render call below so it covers buffer growth.
defer func() {
if err := recover(); err == bytes.ErrTooLarge {
well = []byte(html.EscapeString(string(wild)))
} else if err != nil {
panic(err)
}
}()
// Pre-size to 2x the input as a heuristic for render expansion.
buf := bytes.NewBuffer(make([]byte, 0, len(wild)*2))
if err := html.Render(buf, n); err != nil {
return []byte(html.EscapeString(string(wild)))
}
well = buf.Bytes()
// html.Render emits a full document; strip everything outside <body>.
openBody := []byte("<body>")
i := bytes.Index(well, openBody)
if i < 0 {
return []byte(html.EscapeString(string(wild)))
}
well = well[i+len(openBody):]
closeBody := []byte("</body>")
i = bytes.Index(well, closeBody)
if i < 0 {
return []byte(html.EscapeString(string(wild)))
}
return well[:i]
}
作者:jShi-gi
项目:goquery_sampl
// TestSelectors compiles each selector in the shared selectorTests table,
// parses its HTML fixture, and verifies that MatchAll returns exactly the
// expected nodes in order.
func TestSelectors(t *testing.T) {
	for _, tc := range selectorTests {
		sel, err := Compile(tc.selector)
		if err != nil {
			t.Errorf("error compiling %q: %s", tc.selector, err)
			continue
		}
		root, err := html.Parse(strings.NewReader(tc.HTML))
		if err != nil {
			t.Errorf("error parsing %q: %s", tc.HTML, err)
			continue
		}
		found := sel.MatchAll(root)
		if len(found) != len(tc.results) {
			t.Errorf("wanted %d elements, got %d instead", len(tc.results), len(found))
			continue
		}
		for i, node := range found {
			if rendered := nodeString(node); rendered != tc.results[i] {
				t.Errorf("wanted %s, got %s instead", tc.results[i], rendered)
			}
		}
	}
}
作者:98p
项目:docke
// ExampleParse demonstrates parsing an HTML fragment and printing the href
// of every anchor element via a depth-first traversal.
func ExampleParse() {
	const fragment = `<p>Links:</p><ul><li><a href="foo">Foo</a><li><a href="/bar/baz">BarBaz</a></ul>`
	doc, err := html.Parse(strings.NewReader(fragment))
	if err != nil {
		log.Fatal(err)
	}
	var visit func(*html.Node)
	visit = func(node *html.Node) {
		if node.Type == html.ElementNode && node.Data == "a" {
			// Print only the first href attribute of each anchor.
			for _, attr := range node.Attr {
				if attr.Key == "href" {
					fmt.Println(attr.Val)
					break
				}
			}
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			visit(child)
		}
	}
	visit(doc)
	// Output:
	// foo
	// /bar/baz
}
作者:hdonna
项目:dereddi
// parseStub parses a reddit HTML stub and extracts the post link, comments
// link, and posting user from its anchor elements into a redditStub.
//
// Bug fix: anchors with no child text node or no attributes previously
// caused a nil-pointer / index-out-of-range panic; they are now skipped.
func parseStub(stub string) (r redditStub, err error) {
	var extract func(*html.Node)
	var doc *html.Node
	doc, err = html.Parse(strings.NewReader(stub))
	if err != nil {
		return
	}
	extract = func(n *html.Node) {
		// Only anchors that have both text content and at least one
		// attribute can match any of the patterns below.
		if n.Type == html.ElementNode && n.Data == "a" &&
			n.FirstChild != nil && len(n.Attr) > 0 {
			text := n.FirstChild.Data
			href := n.Attr[0].Val
			switch {
			case text == "[link]":
				r.Link = href
			case strings.HasSuffix(text, " comments]"):
				r.Comments = href
			case strings.HasPrefix(href, "http://www.reddit.com/user/"):
				r.User = strings.TrimSpace(text)
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			extract(c)
		}
	}
	extract(doc)
	return
}
作者:kyleconro
项目:frantic-searc
// TestFind checks Find against a small fixture: a missing id selector, a
// tag selector, a descendant selector, and a descendant class selector.
func TestFind(t *testing.T) {
	const fixture = `<p>Links:</p><ul><li><a href="foo">Foo</a><li><a class="goo" href="/bar/baz">BarBaz</a></ul>`
	doc, _ := html.Parse(strings.NewReader(fixture))

	if _, ok := Find(doc, "#foo"); ok {
		t.Errorf("There is no node with id 'foo'")
	}

	para, ok := Find(doc, "p")
	if !ok || para.Data != "p" {
		t.Errorf("Couldn't find p")
	}

	anchor, ok := Find(doc, "ul a")
	if !ok || anchor.Data != "a" || Flatten(anchor) != "Foo" {
		t.Errorf("Couldn't find a")
	}

	classed, ok := Find(doc, "ul .goo")
	if !ok || classed.Data != "a" || Flatten(classed) != "BarBaz" {
		t.Errorf("Couldn't find a with class goo")
	}
}
作者:kyleconro
项目:frantic-searc
// TestFlatten verifies that Flatten concatenates all text content of the
// parsed document in order.
func TestFlatten(t *testing.T) {
	const fixture = `<p>Links:</p><ul><li><a href="foo">Foo</a><li><a href="/bar/baz">BarBaz</a></ul>`
	doc, _ := html.Parse(strings.NewReader(fixture))
	if got := Flatten(doc); got != "Links:FooBarBaz" {
		t.Fatalf("%s was wrong", got)
	}
}
作者:sejohar
项目:kickerstat
// GenerateDocument converts rawData to UTF-8, parses it as HTML, and wraps
// the resulting tree in a goquery document. A parse failure is fatal
// (delegated to helper.HandleFatalError).
func GenerateDocument(rawData []byte) *goquery.Document {
	decoded := []byte(toUtf8(rawData))
	root, err := html.Parse(bytes.NewReader(decoded))
	helper.HandleFatalError("document generation failed:", err)
	return goquery.NewDocumentFromNode(root)
}
作者:yeah-righ
项目:webho
// ExtractData makes a GET request to the given URL, parses its HTML, runs
// ParseHTML concurrently to populate entity, and finalizes the entity once
// parsing signals completion.
//
// Bug fix: the original waited on the done channel in a for/select loop
// with an empty default case, which busy-spun a CPU core and never
// returned (so the deferred Body.Close never ran either). It now blocks
// on the channel and returns after finalization starts.
func ExtractData(entity *Entity, url string) {
	// Parsing completion channel.
	done := make(chan bool, 1)
	res, err := http.Get(url)
	if err != nil {
		log.Panicln("Error requesting URL data: ", err)
	}
	defer res.Body.Close()
	doc, err := html.Parse(res.Body)
	if err != nil {
		// Best-effort: log and continue, matching the original contract.
		log.Println("Error parsing URL body: ", err)
	}
	go ParseHTML(doc, entity, done)
	// Block until parsing completes, then finalize synchronously so the
	// work is not orphaned when this function returns.
	<-done
	finalizeEntity(entity, doc, EntityDir)
}
作者:shurcooL-legac
项目:gophurl
// lookupTitle fetches url and returns the text of its <title> element.
// Placeholder strings are returned when the connection, the HTML parse,
// or the title lookup fails.
func lookupTitle(url string) (title string) {
	resp, err := http.Get(url)
	if err != nil {
		return "<Couldn't connect.>"
	}
	defer resp.Body.Close()
	title = "<Untitled page.>"
	root, err := html.Parse(resp.Body)
	if err != nil {
		return "<Failed to parse HTML.>"
	}
	// Depth-first search for the first <title> element.
	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode && node.DataAtom == atom.Title {
			title = extract(node)
			return
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}
	walk(root)
	return
}
作者:Tanne
项目:isgtwifidown.co
// GetStatus fetches the OIT status page, parses it, and extracts the
// current status and its reason into a Status value. Failures are logged
// and returned as errors with a zero Status.
func GetStatus() (Status, error) {
	resp, err := http.Get(STATUS_URL)
	if err != nil {
		log.Println(err)
		return Status{}, errors.New("Could not access OIT status page")
	}
	defer resp.Body.Close()
	doc, err := html.Parse(resp.Body)
	if err != nil {
		// Bug fix: this parse error was previously shadowed by the next
		// assignment and never checked.
		log.Println(err)
		return Status{}, errors.New("Could not parse OIT status page")
	}
	statusNode, err := FindStatusBlock(doc)
	if err != nil {
		log.Println(err)
		return Status{}, err
	}
	status, err := ExtractStatus(statusNode)
	if err != nil {
		log.Println(err)
		return Status{}, err
	}
	reason, err := ExtractReason(statusNode)
	if err != nil {
		log.Println(err)
		return Status{}, err
	}
	return Status{status, reason}, nil
}
作者:pavel
项目:gors
// embedRedditSelf builds an EmbedInfo for a reddit self-post URL: it loads
// the page, and inside each expando/usertext body replaces every anchor
// whose href can be embedded as an image with the embedded HTML, then
// captures the container's HTML into rv.Html.
//
// NOTE(review): rv.Html and err are named results assigned from inside the
// outer Each closure; if several containers match, the last one wins —
// presumably reddit pages have at most one .expando .usertext-body. The
// `err :=` declarations in the inner closure shadow the named err, so
// per-anchor embed/parse failures only skip that anchor. TODO confirm both
// behaviors are intended.
func (e *Embedder) embedRedditSelf(url string) (rv EmbedInfo, err error) {
matched, err := regexp.MatchString("reddit.com/r/", url)
if err != nil {
return
}
if !matched {
// Not a reddit URL: signal that this embed strategy does not apply.
err = strategyWhiffError
return
}
rv.URL = url
doc, err := goquery.NewDocument(url)
if err != nil {
return
}
doc.Find(".expando .usertext-body").Each(func(i int, s *goquery.Selection) {
s.Find("a").Each(func(i int, s *goquery.Selection) {
if href, ok := s.Attr("href"); ok {
// Shadowed err: a failed image embed skips this anchor only.
embedInfo, err := e.embedImage(href)
if err != nil {
return
}
// NOTE(review): html.Parse returns a full document node, which is
// appended wholesale in place of the anchor — verify goquery/render
// tolerates this nesting.
node, err := html.Parse(strings.NewReader(embedInfo.Html))
if err != nil {
return
}
parent := s.Parent().Get(0)
parent.RemoveChild(s.Get(0))
parent.AppendChild(node)
}
})
// Assigns the named results; the `return` exits this closure iteration.
rv.Html, err = s.Html()
return
})
return
}
作者:gitvo
项目:traffic_contro
// GetParameters requests the all-profiles parameters page from site, parses
// the response as HTML, and validates it against PARAMETERS_PAGE_TITLE.
// A non-nil error is returned on any request, parse, or validation failure.
func GetParameters(client *http.Client, site string) error {
	u, err := url.ParseRequestURI(site)
	if err != nil {
		return err
	}
	u.Path = "/parameters/profile/all"

	respBody, err := DoRequest(client, u, "GET", nil, nil)
	if err != nil {
		return err
	}
	defer respBody.Close()

	doc, err := html.Parse(respBody)
	if err != nil {
		return err
	}
	if verbose {
		fmt.Println("HTML doc parsed ok", "type:", doc.Type, "data:", doc.Data)
	}
	// Sanity-check that we got the expected page rather than an error page.
	return CheckHtml(doc, PARAMETERS_PAGE_TITLE)
}
作者:ksre
项目:gofinanc
// getQuote scrapes the Bloomberg quote page for symbol and maps the parsed
// values into a generic fquery.Quote stamped with the current time.
func getQuote(symbol string) (*fquery.Quote, error) {
	resp, err := http.Get("http://www.bloomberg.com/quote/" + symbol)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return nil, err
	}

	/* TODO: detect if fund or plain stock, different layouts... */
	scraped := &bloomQuote{}
	walk(doc, scraped)

	result := fquery.Quote{
		Name:             scraped.Name,
		Symbol:           symbol,
		Updated:          time.Now(),
		Volume:           scraped.Volume,
		Open:             scraped.Open,
		PreviousClose:    scraped.PrevClose,
		DayLow:           scraped.DayLow,
		DayHigh:          scraped.DayHigh,
		YearLow:          scraped.YearLow,
		YearHigh:         scraped.YearHigh,
		LastTradePrice:   scraped.LastTradePrice,
		DividendYield:    scraped.DividendYield,
		EarningsPerShare: scraped.EarningsPerShare,
		DividendExDate:   scraped.DividendExDate,
	}
	return &result, nil
}
作者:kwm
项目:gosearc
// ParseGoogleImageSearch parses a Google image-search results page from r
// and writes an HTML list item linking each extracted image URL to w.
// (Unused; based on the example at
// http://godoc.org/code.google.com/p/go.net/html with the extraction added.)
//
// Bug fix: the original indexed the "imgurl=..." split without checking its
// length, panicking on a parameter with no "=".
func ParseGoogleImageSearch(w http.ResponseWriter, r io.Reader) {
	doc, err := html.Parse(r)
	if err != nil {
		log.Fatal(err)
	}
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key == "href" {
					// Result links look like "...?imgurl=<url>&...": take the
					// first query parameter and its value.
					str := a.Val
					if strings.Contains(str, "imgurl") {
						strs := strings.Split(str, "&")
						imageurl := strings.Split(strs[0], "=")
						if len(imageurl) > 1 {
							img := imageurl[1]
							fmt.Fprintf(w, "<html><body><ul><li><a href=%v><img src=%v></a></li></ul></body></html>", img, img)
						}
					}
					break
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
}
作者:GDX
项目:syncthin
// main loads a translation map (JSON, os.Args[1]), walks an HTML template
// (os.Args[2]) to collect/apply translatable strings via generalNode, and
// writes the updated translation map as indented JSON to stdout.
func main() {
	// Bug fix: missing arguments previously caused an index-out-of-range
	// panic; fail with a usage message instead.
	if len(os.Args) < 3 {
		log.Fatalf("usage: %s <translations.json> <template.html>", os.Args[0])
	}
	fd, err := os.Open(os.Args[1])
	if err != nil {
		log.Fatal(err)
	}
	err = json.NewDecoder(fd).Decode(&trans)
	if err != nil {
		log.Fatal(err)
	}
	fd.Close()
	fd, err = os.Open(os.Args[2])
	if err != nil {
		log.Fatal(err)
	}
	doc, err := html.Parse(fd)
	if err != nil {
		log.Fatal(err)
	}
	fd.Close()
	// Collect/update translations from the parsed document tree.
	generalNode(doc)
	bs, err := json.MarshalIndent(trans, "", "  ")
	if err != nil {
		log.Fatal(err)
	}
	os.Stdout.Write(bs)
	os.Stdout.WriteString("\n")
}
作者:kck32
项目:gotool
// ParseAndPrint fetches a hard-coded craigslist apartment search, extracts
// every listing anchor (href starting with "/pen/apa"), and returns a map
// from absolute listing URL to the listing's title text.
func ParseAndPrint() map[string]string {
	//TODO : Take this url as parameter
	res, err := http.Get("http://sfbay.craigslist.org/search/apa/pen?query=&zoomToPosting=&srchType=A&minAsk=&maxAsk=2500&bedrooms=2&housing_type=&nh=77&nh=79&nh=81&nh=83&nh=84&nh=87")
	if err != nil {
		log.Fatal(err)
	}
	body, err := ioutil.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		// Bug fix: the read error was previously overwritten by the parse
		// call below and never checked.
		log.Fatal(err)
	}
	doc, err := html.Parse(strings.NewReader(string(body)))
	if err != nil {
		log.Fatal(err)
	}
	returnUrl := make(map[string]string)
	var checkForListings func(*html.Node)
	checkForListings = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key == "href" && strings.HasPrefix(a.Val, "/pen/apa") {
					// Anchor text (first child) is the listing title.
					if n.FirstChild != nil {
						returnUrl["http://sfbay.craigslist.org"+a.Val] = n.FirstChild.Data
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			checkForListings(c)
		}
	}
	checkForListings(doc)
	return returnUrl
}
作者:vigneshsarm
项目:crawle
// crawl consumes URLs from urlQue, fetches and parses each page, and pushes
// the parsed tree onto the result channel unless it is nearly full.
//
// Bug fixes: (1) resp.Body.Close was deferred BEFORE the error check, so a
// failed Get dereferenced a nil response; the body was also still read after
// the error. (2) Both defers lived directly in the loop, so they only ran at
// function exit — the recover never protected later iterations and every
// response body stayed open until crawl returned. Each iteration now runs in
// its own function so its defers fire per-URL.
func crawl() {
	for seed := range urlQue {
		func() {
			defer func() {
				if r := recover(); r != nil {
					log.Println("Recovered in crawl", r, len(urlQue), len(result), seed)
				}
			}()
			// log.Println(seed,seed.Scheme,seed.Host,seed.Path)
			resp, err := http.Get(seed.String())
			if err != nil {
				log.Printf("some error occured %s\n", err)
				return
			}
			defer resp.Body.Close()
			if resp.StatusCode == 200 {
				z, err := html.Parse(resp.Body)
				if err != nil {
					log.Fatal(err)
				}
				// Back off when the result queue is close to capacity to
				// avoid blocking the crawler on a full channel.
				if len(result) < maxResult-2 {
					result <- &CrawledResult{seed, z}
				} else {
					log.Println("result queue almost at max")
				}
			} else {
				log.Printf("Respones %s\n", resp)
			}
		}()
	}
}