作者:john-griffi
项目:bcoo
// FetchFullDescription downloads the page at link and returns the rendered
// HTML of the first <section class="entry-content cf"> element, or "" when
// no such element is found.
//
// NOTE(review): network/read/parse failures terminate the process via
// log.Fatal, matching the original contract; callers cannot recover.
func FetchFullDescription(link string) string {
	res, err := http.Get(link)
	if err != nil {
		log.Fatal(err)
	}
	body, err := ioutil.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		log.Fatal(err)
	}
	doc, err := html.Parse(strings.NewReader(string(body)))
	if err != nil {
		// Bug fix: this error was previously discarded, which would have
		// allowed a walk over a nil document tree.
		log.Fatal(err)
	}
	content := ""
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "section" {
			for _, a := range n.Attr {
				if a.Key == "class" && a.Val == "entry-content cf" {
					// Render the whole matching section back to HTML.
					var buf bytes.Buffer
					html.Render(&buf, n)
					content = buf.String()
					break
				}
			}
		}
		// Depth-first traversal; later matches overwrite earlier ones,
		// so the deepest/last matching section wins (original behavior).
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
	return content
}
作者:fjj59689437
项目:blog
// GetEntries walks root and loads every file with a ".txt" extension as an
// *Entry. When useSummary is true each entry's Body is reduced to its plain
// text, truncated to 500 bytes with a "..." suffix. Each entry's Id is the
// path relative to root with the ".txt" extension rewritten to ".html".
//
// Bug fix: the error returned by filepath.Walk (and the per-file walk error)
// was previously discarded, so the named err result was always nil.
func GetEntries(root string, useSummary bool) (entries []*Entry, err error) {
	err = filepath.Walk(root, func(path string, info os.FileInfo, walkErr error) error {
		if walkErr != nil {
			// Propagate filesystem errors instead of silently continuing.
			return walkErr
		}
		if strings.ToLower(filepath.Ext(path)) != ".txt" {
			return nil
		}
		// A file that fails to load is skipped (best-effort, as before).
		entry, _ := GetEntry(path)
		if entry == nil {
			return nil
		}
		entries = append(entries, entry)
		if useSummary {
			// Summarize: strip HTML to text and truncate. Parse/convert
			// failures leave the full body in place (best-effort).
			if doc, perr := html.Parse(strings.NewReader(entry.Body)); perr == nil {
				if text, terr := toText(doc); terr == nil {
					if len(text) > 500 {
						text = text[0:500] + "..."
					}
					entry.Body = text
				}
			}
		}
		// "file.txt" -> "file.html" relative to root
		// (slices off the trailing "txt" and appends "html").
		entry.Id = entry.Filename[len(root):len(entry.Filename)-3] + "html"
		return nil
	})
	return
}
作者:heyL
项目:l
// GetFeedUrl resolves the feed URL for u. If u itself serves an XML
// Content-Type it is returned directly; otherwise the page is parsed and the
// first <link rel="alternate" type="*xml*"> href is resolved against the
// final request URL and returned.
func GetFeedUrl(u string) (string, error) {
	resp, err := http.Get(u)
	if err != nil {
		return "", err
	}
	// Bug fix: the response body was never closed, leaking the underlying
	// connection on every call (including the early XML return below).
	defer resp.Body.Close()
	if strings.Contains(resp.Header.Get("Content-Type"), "xml") {
		return u, nil
	}
	tree, err := html.Parse(resp.Body)
	if err != nil {
		return "", err
	}
	sel := cascadia.MustCompile("link[rel=alternate][type*=xml]")
	alt := sel.MatchFirst(tree)
	if alt == nil {
		return "", errors.New("no feed link found")
	}
	altUrl, found := FindAttr("href", alt.Attr)
	if !found {
		return "", errors.New("missing link in alternate")
	}
	// resp.Request.URL reflects any redirects, so relative hrefs resolve
	// against the page actually served.
	return ToAbsolute(resp.Request.URL, altUrl.Val), nil
}
作者:eaburn
项目:feedm
// FixHtml parses bytes as HTML and returns well-formed HTML if the parse
// was successful, or escaped HTML, if not.
//
// The escaped-fallback is also used when rendering fails, when the rendered
// output is missing <body>...</body> markers, or when the output buffer
// grows past bytes' internal limit (see the recover below). On success only
// the content between <body> and </body> is returned.
func fixHtml(linkUrl string, wild []byte) (well []byte) {
n, err := html.Parse(bytes.NewReader(wild))
if err != nil {
return []byte(html.EscapeString(string(wild)))
}
// Rewrites <img> srcs relative to linkUrl in place (helper defined elsewhere).
fixImgs(linkUrl, n)
// bytes.Buffer panics with bytes.ErrTooLarge if it cannot grow; this defer
// converts that specific panic into the escaped fallback by assigning the
// named result. Any other panic value is re-raised. Note the defer is
// registered before the Render call below so it covers buffer growth.
defer func() {
if err := recover(); err == bytes.ErrTooLarge {
well = []byte(html.EscapeString(string(wild)))
} else if err != nil {
panic(err)
}
}()
// Pre-size to 2x the input as a heuristic for render expansion.
buf := bytes.NewBuffer(make([]byte, 0, len(wild)*2))
if err := html.Render(buf, n); err != nil {
return []byte(html.EscapeString(string(wild)))
}
well = buf.Bytes()
// html.Render emits a full document; strip everything outside <body>.
openBody := []byte("<body>")
i := bytes.Index(well, openBody)
if i < 0 {
return []byte(html.EscapeString(string(wild)))
}
well = well[i+len(openBody):]
closeBody := []byte("</body>")
i = bytes.Index(well, closeBody)
if i < 0 {
return []byte(html.EscapeString(string(wild)))
}
return well[:i]
}
作者:jShi-gi
项目:goquery_sampl
// TestSelectors compiles each selector in the shared selectorTests table,
// parses its HTML fixture, and verifies that MatchAll returns exactly the
// expected nodes in order.
func TestSelectors(t *testing.T) {
	for _, tc := range selectorTests {
		sel, err := Compile(tc.selector)
		if err != nil {
			t.Errorf("error compiling %q: %s", tc.selector, err)
			continue
		}
		root, err := html.Parse(strings.NewReader(tc.HTML))
		if err != nil {
			t.Errorf("error parsing %q: %s", tc.HTML, err)
			continue
		}
		found := sel.MatchAll(root)
		if len(found) != len(tc.results) {
			t.Errorf("wanted %d elements, got %d instead", len(tc.results), len(found))
			continue
		}
		for i, node := range found {
			if rendered := nodeString(node); rendered != tc.results[i] {
				t.Errorf("wanted %s, got %s instead", tc.results[i], rendered)
			}
		}
	}
}
作者:98p
项目:docke
// ExampleParse demonstrates parsing an HTML fragment and printing the href
// of every anchor element via a depth-first traversal.
func ExampleParse() {
	const fragment = `<p>Links:</p><ul><li><a href="foo">Foo</a><li><a href="/bar/baz">BarBaz</a></ul>`
	doc, err := html.Parse(strings.NewReader(fragment))
	if err != nil {
		log.Fatal(err)
	}
	var visit func(*html.Node)
	visit = func(node *html.Node) {
		if node.Type == html.ElementNode && node.Data == "a" {
			// Print only the first href attribute of each anchor.
			for _, attr := range node.Attr {
				if attr.Key == "href" {
					fmt.Println(attr.Val)
					break
				}
			}
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			visit(child)
		}
	}
	visit(doc)
	// Output:
	// foo
	// /bar/baz
}
作者:hdonna
项目:dereddi
// parseStub parses a reddit HTML stub and extracts the post link, comments
// link, and posting user from its anchor elements into a redditStub.
//
// Bug fix: anchors with no child text node or no attributes previously
// caused a nil-pointer / index-out-of-range panic; they are now skipped.
func parseStub(stub string) (r redditStub, err error) {
	var extract func(*html.Node)
	var doc *html.Node
	doc, err = html.Parse(strings.NewReader(stub))
	if err != nil {
		return
	}
	extract = func(n *html.Node) {
		// Only anchors that have both text content and at least one
		// attribute can match any of the patterns below.
		if n.Type == html.ElementNode && n.Data == "a" &&
			n.FirstChild != nil && len(n.Attr) > 0 {
			text := n.FirstChild.Data
			href := n.Attr[0].Val
			switch {
			case text == "[link]":
				r.Link = href
			case strings.HasSuffix(text, " comments]"):
				r.Comments = href
			case strings.HasPrefix(href, "http://www.reddit.com/user/"):
				r.User = strings.TrimSpace(text)
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			extract(c)
		}
	}
	extract(doc)
	return
}
作者:kyleconro
项目:frantic-searc
// TestFind checks Find against a small fixture: a missing id selector, a
// tag selector, a descendant selector, and a descendant class selector.
func TestFind(t *testing.T) {
	const fixture = `<p>Links:</p><ul><li><a href="foo">Foo</a><li><a class="goo" href="/bar/baz">BarBaz</a></ul>`
	doc, _ := html.Parse(strings.NewReader(fixture))

	if _, ok := Find(doc, "#foo"); ok {
		t.Errorf("There is no node with id 'foo'")
	}

	para, ok := Find(doc, "p")
	if !ok || para.Data != "p" {
		t.Errorf("Couldn't find p")
	}

	anchor, ok := Find(doc, "ul a")
	if !ok || anchor.Data != "a" || Flatten(anchor) != "Foo" {
		t.Errorf("Couldn't find a")
	}

	classed, ok := Find(doc, "ul .goo")
	if !ok || classed.Data != "a" || Flatten(classed) != "BarBaz" {
		t.Errorf("Couldn't find a with class goo")
	}
}
作者:kyleconro
项目:frantic-searc
// TestFlatten verifies that Flatten concatenates all text content of the
// parsed document in order.
func TestFlatten(t *testing.T) {
	const fixture = `<p>Links:</p><ul><li><a href="foo">Foo</a><li><a href="/bar/baz">BarBaz</a></ul>`
	doc, _ := html.Parse(strings.NewReader(fixture))
	if got := Flatten(doc); got != "Links:FooBarBaz" {
		t.Fatalf("%s was wrong", got)
	}
}
作者:sejohar
项目:kickerstat
// GenerateDocument converts rawData to UTF-8, parses it as HTML, and wraps
// the resulting tree in a goquery document. A parse failure is fatal
// (delegated to helper.HandleFatalError).
func GenerateDocument(rawData []byte) *goquery.Document {
	decoded := []byte(toUtf8(rawData))
	root, err := html.Parse(bytes.NewReader(decoded))
	helper.HandleFatalError("document generation failed:", err)
	return goquery.NewDocumentFromNode(root)
}
作者:yeah-righ
项目:webho
// ExtractData makes a GET request to the given URL, parses its HTML, runs
// ParseHTML concurrently to populate entity, and finalizes the entity once
// parsing signals completion.
//
// Bug fix: the original waited on the done channel in a for/select loop
// with an empty default case, which busy-spun a CPU core and never
// returned (so the deferred Body.Close never ran either). It now blocks
// on the channel and returns after finalization starts.
func ExtractData(entity *Entity, url string) {
	// Parsing completion channel.
	done := make(chan bool, 1)
	res, err := http.Get(url)
	if err != nil {
		log.Panicln("Error requesting URL data: ", err)
	}
	defer res.Body.Close()
	doc, err := html.Parse(res.Body)
	if err != nil {
		// Best-effort: log and continue, matching the original contract.
		log.Println("Error parsing URL body: ", err)
	}
	go ParseHTML(doc, entity, done)
	// Block until parsing completes, then finalize synchronously so the
	// work is not orphaned when this function returns.
	<-done
	finalizeEntity(entity, doc, EntityDir)
}
作者:shurcooL-legac
项目:gophurl
// lookupTitle fetches url and returns the text of its <title> element.
// Placeholder strings are returned when the connection, the HTML parse,
// or the title lookup fails.
func lookupTitle(url string) (title string) {
	resp, err := http.Get(url)
	if err != nil {
		return "<Couldn't connect.>"
	}
	defer resp.Body.Close()
	title = "<Untitled page.>"
	root, err := html.Parse(resp.Body)
	if err != nil {
		return "<Failed to parse HTML.>"
	}
	// Depth-first search for the first <title> element.
	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode && node.DataAtom == atom.Title {
			title = extract(node)
			return
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}
	walk(root)
	return
}
作者:Tanne
项目:isgtwifidown.co
// GetStatus fetches the OIT status page, parses it, and extracts the
// current status and its reason into a Status value. Failures are logged
// and returned as errors with a zero Status.
func GetStatus() (Status, error) {
	resp, err := http.Get(STATUS_URL)
	if err != nil {
		log.Println(err)
		return Status{}, errors.New("Could not access OIT status page")
	}
	defer resp.Body.Close()
	doc, err := html.Parse(resp.Body)
	if err != nil {
		// Bug fix: this parse error was previously shadowed by the next
		// assignment and never checked.
		log.Println(err)
		return Status{}, errors.New("Could not parse OIT status page")
	}
	statusNode, err := FindStatusBlock(doc)
	if err != nil {
		log.Println(err)
		return Status{}, err
	}
	status, err := ExtractStatus(statusNode)
	if err != nil {
		log.Println(err)
		return Status{}, err
	}
	reason, err := ExtractReason(statusNode)
	if err != nil {
		log.Println(err)
		return Status{}, err
	}
	return Status{status, reason}, nil
}
作者:pavel
项目:gors
// embedRedditSelf builds an EmbedInfo for a reddit self-post URL: it loads
// the page, and inside each expando/usertext body replaces every anchor
// whose href can be embedded as an image with the embedded HTML, then
// captures the container's HTML into rv.Html.
//
// NOTE(review): rv.Html and err are named results assigned from inside the
// outer Each closure; if several containers match, the last one wins —
// presumably reddit pages have at most one .expando .usertext-body. The
// `err :=` declarations in the inner closure shadow the named err, so
// per-anchor embed/parse failures only skip that anchor. TODO confirm both
// behaviors are intended.
func (e *Embedder) embedRedditSelf(url string) (rv EmbedInfo, err error) {
matched, err := regexp.MatchString("reddit.com/r/", url)
if err != nil {
return
}
if !matched {
// Not a reddit URL: signal that this embed strategy does not apply.
err = strategyWhiffError
return
}
rv.URL = url
doc, err := goquery.NewDocument(url)
if err != nil {
return
}
doc.Find(".expando .usertext-body").Each(func(i int, s *goquery.Selection) {
s.Find("a").Each(func(i int, s *goquery.Selection) {
if href, ok := s.Attr("href"); ok {
// Shadowed err: a failed image embed skips this anchor only.
embedInfo, err := e.embedImage(href)
if err != nil {
return
}
// NOTE(review): html.Parse returns a full document node, which is
// appended wholesale in place of the anchor — verify goquery/render
// tolerates this nesting.
node, err := html.Parse(strings.NewReader(embedInfo.Html))
if err != nil {
return
}
parent := s.Parent().Get(0)
parent.RemoveChild(s.Get(0))
parent.AppendChild(node)
}
})
// Assigns the named results; the `return` exits this closure iteration.
rv.Html, err = s.Html()
return
})
return
}
作者:gitvo
项目:traffic_contro
// GetParameters requests the all-profiles parameters page from site, parses
// the response as HTML, and validates it against PARAMETERS_PAGE_TITLE.
// A non-nil error is returned on any request, parse, or validation failure.
func GetParameters(client *http.Client, site string) error {
	u, err := url.ParseRequestURI(site)
	if err != nil {
		return err
	}
	u.Path = "/parameters/profile/all"

	respBody, err := DoRequest(client, u, "GET", nil, nil)
	if err != nil {
		return err
	}
	defer respBody.Close()

	doc, err := html.Parse(respBody)
	if err != nil {
		return err
	}
	if verbose {
		fmt.Println("HTML doc parsed ok", "type:", doc.Type, "data:", doc.Data)
	}
	// Sanity-check that we got the expected page rather than an error page.
	return CheckHtml(doc, PARAMETERS_PAGE_TITLE)
}
作者:ksre
项目:gofinanc
// getQuote scrapes the Bloomberg quote page for symbol and maps the parsed
// values into a generic fquery.Quote stamped with the current time.
func getQuote(symbol string) (*fquery.Quote, error) {
	resp, err := http.Get("http://www.bloomberg.com/quote/" + symbol)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return nil, err
	}

	/* TODO: detect if fund or plain stock, different layouts... */
	scraped := &bloomQuote{}
	walk(doc, scraped)

	result := fquery.Quote{
		Name:             scraped.Name,
		Symbol:           symbol,
		Updated:          time.Now(),
		Volume:           scraped.Volume,
		Open:             scraped.Open,
		PreviousClose:    scraped.PrevClose,
		DayLow:           scraped.DayLow,
		DayHigh:          scraped.DayHigh,
		YearLow:          scraped.YearLow,
		YearHigh:         scraped.YearHigh,
		LastTradePrice:   scraped.LastTradePrice,
		DividendYield:    scraped.DividendYield,
		EarningsPerShare: scraped.EarningsPerShare,
		DividendExDate:   scraped.DividendExDate,
	}
	return &result, nil
}
作者:kwm
项目:gosearc
// ParseGoogleImageSearch parses a Google image-search results page from r
// and writes an HTML list item linking each extracted image URL to w.
// (Unused; based on the example at
// http://godoc.org/code.google.com/p/go.net/html with the extraction added.)
//
// Bug fix: the original indexed the "imgurl=..." split without checking its
// length, panicking on a parameter with no "=".
func ParseGoogleImageSearch(w http.ResponseWriter, r io.Reader) {
	doc, err := html.Parse(r)
	if err != nil {
		log.Fatal(err)
	}
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key == "href" {
					// Result links look like "...?imgurl=<url>&...": take the
					// first query parameter and its value.
					str := a.Val
					if strings.Contains(str, "imgurl") {
						strs := strings.Split(str, "&")
						imageurl := strings.Split(strs[0], "=")
						if len(imageurl) > 1 {
							img := imageurl[1]
							fmt.Fprintf(w, "<html><body><ul><li><a href=%v><img src=%v></a></li></ul></body></html>", img, img)
						}
					}
					break
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
}
作者:GDX
项目:syncthin
// main loads a translation map (JSON, os.Args[1]), walks an HTML template
// (os.Args[2]) to collect/apply translatable strings via generalNode, and
// writes the updated translation map as indented JSON to stdout.
func main() {
	// Bug fix: missing arguments previously caused an index-out-of-range
	// panic; fail with a usage message instead.
	if len(os.Args) < 3 {
		log.Fatalf("usage: %s <translations.json> <template.html>", os.Args[0])
	}
	fd, err := os.Open(os.Args[1])
	if err != nil {
		log.Fatal(err)
	}
	err = json.NewDecoder(fd).Decode(&trans)
	if err != nil {
		log.Fatal(err)
	}
	fd.Close()
	fd, err = os.Open(os.Args[2])
	if err != nil {
		log.Fatal(err)
	}
	doc, err := html.Parse(fd)
	if err != nil {
		log.Fatal(err)
	}
	fd.Close()
	// Collect/update translations from the parsed document tree.
	generalNode(doc)
	bs, err := json.MarshalIndent(trans, "", "  ")
	if err != nil {
		log.Fatal(err)
	}
	os.Stdout.Write(bs)
	os.Stdout.WriteString("\n")
}
作者:kck32
项目:gotool
// ParseAndPrint fetches a hard-coded craigslist apartment search, extracts
// every listing anchor (href starting with "/pen/apa"), and returns a map
// from absolute listing URL to the listing's title text.
func ParseAndPrint() map[string]string {
	//TODO : Take this url as parameter
	res, err := http.Get("http://sfbay.craigslist.org/search/apa/pen?query=&zoomToPosting=&srchType=A&minAsk=&maxAsk=2500&bedrooms=2&housing_type=&nh=77&nh=79&nh=81&nh=83&nh=84&nh=87")
	if err != nil {
		log.Fatal(err)
	}
	body, err := ioutil.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		// Bug fix: the read error was previously overwritten by the parse
		// call below and never checked.
		log.Fatal(err)
	}
	doc, err := html.Parse(strings.NewReader(string(body)))
	if err != nil {
		log.Fatal(err)
	}
	returnUrl := make(map[string]string)
	var checkForListings func(*html.Node)
	checkForListings = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key == "href" && strings.HasPrefix(a.Val, "/pen/apa") {
					// Anchor text (first child) is the listing title.
					if n.FirstChild != nil {
						returnUrl["http://sfbay.craigslist.org"+a.Val] = n.FirstChild.Data
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			checkForListings(c)
		}
	}
	checkForListings(doc)
	return returnUrl
}
作者:vigneshsarm
项目:crawle
// crawl consumes URLs from urlQue, fetches and parses each page, and pushes
// the parsed tree onto the result channel unless it is nearly full.
//
// Bug fixes: (1) resp.Body.Close was deferred BEFORE the error check, so a
// failed Get dereferenced a nil response; the body was also still read after
// the error. (2) Both defers lived directly in the loop, so they only ran at
// function exit — the recover never protected later iterations and every
// response body stayed open until crawl returned. Each iteration now runs in
// its own function so its defers fire per-URL.
func crawl() {
	for seed := range urlQue {
		func() {
			defer func() {
				if r := recover(); r != nil {
					log.Println("Recovered in crawl", r, len(urlQue), len(result), seed)
				}
			}()
			// log.Println(seed,seed.Scheme,seed.Host,seed.Path)
			resp, err := http.Get(seed.String())
			if err != nil {
				log.Printf("some error occured %s\n", err)
				return
			}
			defer resp.Body.Close()
			if resp.StatusCode == 200 {
				z, err := html.Parse(resp.Body)
				if err != nil {
					log.Fatal(err)
				}
				// Back off when the result queue is close to capacity to
				// avoid blocking the crawler on a full channel.
				if len(result) < maxResult-2 {
					result <- &CrawledResult{seed, z}
				} else {
					log.Println("result queue almost at max")
				}
			} else {
				log.Printf("Respones %s\n", resp)
			}
		}()
	}
}