From 42a7d35f86ce94facce9a65fc435f96bcb0e522f Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 3 Dec 2018 14:55:44 +0800 Subject: [PATCH] Add website name and icon url for output --- README.md | 8 ++++++-- goscraper.go | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 46ee5b3..35d61e6 100644 --- a/README.md +++ b/README.md @@ -8,18 +8,22 @@ fmt.Println(err) return } + fmt.Printf("Icon : %s\n", s.Preview.Icon) + fmt.Printf("Name : %s\n", s.Preview.Name) fmt.Printf("Title : %s\n", s.Preview.Title) fmt.Printf("Description : %s\n", s.Preview.Description) fmt.Printf("Image: %s\n", s.Preview.Images[0]) fmt.Printf("Url : %s\n", s.Preview.Link) } -output: +output: +**Icon :** https://www.w3.org/favicon.ico +**Name :** www.w3.org **Title :** World Wide Web Consortium (W3C) **Description :** The World Wide Web Consortium (W3C) is an international community where Member organizations, a full-time staff, and the public work together to develop Web standards. 
**Image:** https://www.w3.org/2008/site/images/logo-w3c-mobile-lg -**Url :** https://www.w3.org/ +**Url :** https://www.w3.org/ ## License diff --git a/goscraper.go b/goscraper.go index eaa5a46..9c03bb9 100644 --- a/goscraper.go +++ b/goscraper.go @@ -30,6 +30,8 @@ type Document struct { } type DocumentPreview struct { + Icon string + Name string Title string Description string Images []string @@ -165,6 +167,10 @@ func (scraper *Scraper) parseDocument(doc *Document) error { doc.Preview.Images = []string{} // saves previews' link in case that is found after link := doc.Preview.Link + // default the site name to the URL host; an og:site_name meta tag overrides it + doc.Preview.Name = scraper.Url.Host + // default the icon to /favicon.ico at the site root; a rel="icon" link overrides it + doc.Preview.Icon = fmt.Sprintf("%s://%s%s", scraper.Url.Scheme, scraper.Url.Host, "/favicon.ico") for { tokenType := t.Next() if tokenType == html.ErrorToken { @@ -185,11 +191,15 @@ func (scraper *Scraper) parseDocument(doc *Document) error { case "link": var canonical bool + var hasIcon bool var href string for _, attr := range token.Attr { if cleanStr(attr.Key) == "rel" && cleanStr(attr.Val) == "canonical" { canonical = true } + if cleanStr(attr.Key) == "rel" && cleanStr(attr.Val) == "icon" { + hasIcon = true + } if cleanStr(attr.Key) == "href" { href = attr.Val } @@ -201,6 +211,9 @@ func (scraper *Scraper) parseDocument(doc *Document) error { return err } } + if len(href) > 0 && hasIcon { + doc.Preview.Icon = href + } } case "meta": @@ -221,6 +234,8 @@ func (scraper *Scraper) parseDocument(doc *Document) error { } } switch cleanStr(property) { + case "og:site_name": + doc.Preview.Name = content case "og:title": doc.Preview.Title = content case "og:description":