Add support for "deeper" search of optional property. (#5)
For some URLs we found that, for some strange reason, we could not get the `og:type` data.
YouTube links were among these URLs.
It turned out that YouTube keeps this metadata in the body (and not in the head, as other services normally do).
And because the previous criterion for breaking the token-processing loop was "we have title + description + og:image and we have passed the head", we could not process any of the other optional meta tags once the head was passed.

Now we can control how many tokens are processed before the loop breaks (or break early once the required optional fields have already been found).
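
As a rough illustration of how a caller could opt into this deeper search, here is a minimal sketch using the builder API touched by this change. The import path, URL, and concrete limits are assumptions for illustration only, and what the resulting ScrapeService is used for afterwards is outside this diff.

```go
package main

import (
	"log"

	goscraper "github.com/example/goscraper" // assumed import path, adjust to the real module
)

func main() {
	// Allow the parser to scan up to 100 more tokens after title, description
	// and og:image have been found, so body-level meta (e.g. YouTube's og:type)
	// can still be picked up. A value of 0 keeps the previous behaviour.
	svc, err := goscraper.NewScrapeBuilder().
		SetUrl("https://www.youtube.com/watch?v=example"). // hypothetical URL
		SetMaxRedirect(3).
		SetMaxDocumentLength(1 << 20). // assumed limit of 1 MiB
		SetMaxTokenDepth(100).
		Build()
	if err != nil {
		log.Fatal(err)
	}
	_ = svc // how the service is invoked is not part of this change
}
```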
Alexandr Filioglo authored Jan 18, 2021
1 parent 38c9e83 commit 44a43d8
Showing 1 changed file with 26 additions and 5 deletions.
31 changes: 26 additions & 5 deletions goscraper.go
@@ -24,6 +24,7 @@ type scrapeSettings struct {
maxDocumentLength int64
url string
maxRedirect int
maxTokenDepth int
}

type ScrapeBuilder interface {
@@ -32,6 +33,7 @@ type ScrapeBuilder interface {
SetUrl(string) ScrapeBuilder
SetMaxRedirect(int) ScrapeBuilder
Build() (ScrapeService, error)
SetMaxTokenDepth(int) ScrapeBuilder
}

type scrapeBuilder struct {
@@ -49,6 +51,7 @@ func (b *scrapeBuilder) Build() (ScrapeService, error) {
Options: ScraperOptions{
MaxDocumentLength: b.scrapeSettings.maxDocumentLength,
UserAgent: b.scrapeSettings.userAgent,
MaxTokenDepth: b.scrapeSettings.maxTokenDepth,
}}, nil
}

@@ -62,6 +65,11 @@ func (b *scrapeBuilder) SetMaxRedirect(i int) ScrapeBuilder {
return b
}

func (b *scrapeBuilder) SetMaxTokenDepth(i int) ScrapeBuilder {
b.scrapeSettings.maxTokenDepth = i
return b
}

func (b *scrapeBuilder) SetMaxDocumentLength(maxDocLength int64) ScrapeBuilder {
b.scrapeSettings.maxDocumentLength = maxDocLength
return b
@@ -81,6 +89,7 @@ func NewScrapeBuilder() ScrapeBuilder {
type ScraperOptions struct {
MaxDocumentLength int64
UserAgent string
MaxTokenDepth int
}

type Scraper struct {
@@ -286,7 +295,8 @@ func convertUTF8(content io.Reader, contentType string) (bytes.Buffer, error) {

func (scraper *Scraper) parseDocument(doc *Document) error {
t := html.NewTokenizer(&doc.Body)
var ogImage bool
var hasOgImage bool
var hasOgType bool
var headPassed bool
var hasFragment bool
var hasCanonical bool
@@ -298,6 +308,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
doc.Preview.Name = scraper.Url.Host
// set default icon to web root if <link rel="icon" href="/favicon.ico"> not found
doc.Preview.Icon = fmt.Sprintf("%s://%s%s", scraper.Url.Scheme, scraper.Url.Host, "/favicon.ico")
depth := 0
for {
tokenType := t.Next()
if tokenType == html.ErrorToken {
@@ -342,6 +353,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
doc.Preview.Icon = href
}
}
depth = 0

case "meta":
if len(token.Attr) != 2 {
@@ -367,6 +379,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
doc.Preview.Title = content
case "og:type":
doc.Preview.Type = content
hasOgType = true
case "og:description":
doc.Preview.Description = content
case "description":
@@ -376,7 +389,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
case "og:url":
doc.Preview.Link = content
case "og:image":
ogImage = true
hasOgImage = true
ogImgUrl, err := url.Parse(content)
if err != nil {
return err
@@ -389,6 +402,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
doc.Preview.Images = []string{ogImgUrl.String()}

}
depth = 0

case "title":
if tokenType == html.StartTagToken {
@@ -398,6 +412,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
doc.Preview.Title = token.Data
}
}
depth = 0

case "img":
for _, attr := range token.Attr {
@@ -418,6 +433,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error {

}
}
depth = 0
}

if hasCanonical && headPassed && scraper.MaxRedirect > 0 {
@@ -448,10 +464,15 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
return scraper.parseDocument(doc)
}

if len(doc.Preview.Title) > 0 && len(doc.Preview.Description) > 0 && ogImage && headPassed {
return nil
if len(doc.Preview.Title) > 0 && len(doc.Preview.Description) > 0 && hasOgImage && headPassed {
if scraper.Options.MaxTokenDepth == 0 {
return nil
}
if hasOgType || depth >= scraper.Options.MaxTokenDepth {
return nil
}
depth++
}

}

return nil
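
To make the new exit condition easier to read outside of the diff, here is a hedged restatement of just the break logic as a standalone helper; `shouldStop` and its parameters are invented for illustration and do not exist in the code, but the behaviour mirrors the hunk above: `depth` is reset to 0 whenever a `link`, `meta`, `title` or `img` token is handled, and it only grows while nothing of interest is seen.

```go
package goscraper

// shouldStop is a hypothetical helper mirroring the new break criteria in
// parseDocument. With maxTokenDepth == 0 the old behaviour is preserved:
// stop as soon as title, description and og:image are set and </head> has
// been passed. Otherwise the loop keeps consuming tokens until og:type is
// found or depth reaches maxTokenDepth.
func shouldStop(hasRequired, headPassed, hasOgType bool, depth, maxTokenDepth int) bool {
	if !hasRequired || !headPassed {
		return false // required fields still missing, or the head is not done yet
	}
	if maxTokenDepth == 0 {
		return true // no deeper search configured: stop immediately (old behaviour)
	}
	return hasOgType || depth >= maxTokenDepth
}
```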