-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadscraper.go
76 lines (60 loc) · 1.56 KB
/
adscraper.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
package adscraper
import (
	"net/http"
	"net/url"
	"strings"

	"github.com/PuerkitoBio/goquery"
)
// NewURL builds the Google search URL for the query s.
//
// The query is percent-encoded with url.QueryEscape, so spaces become "+"
// and reserved characters such as "&", "#", "+", "%" and "=" are escaped
// rather than being interpreted as URL syntax.
func NewURL(s string) string {
	return "https://www.google.com/search?q=" + url.QueryEscape(s)
}
// Scrape fetches url with a newly created crawler and returns the ads that
// extract finds in the response.
//
// A fetch error is returned as-is together with a nil slice. The response
// body is closed before Scrape returns.
func Scrape(url string) ([]*Ad, error) {
	fetcher := &crawler{}

	resp, fetchErr := fetcher.Fetch(url)
	if fetchErr != nil {
		return nil, fetchErr
	}
	defer resp.Body.Close()

	return extract(resp)
}
// extract parses the HTML in r and returns one Ad per element matching the
// ".ads-ad" selector, in document order. Positions passed to extractAd are
// 1-based.
//
// If the document cannot be built, the parse error is returned together with
// an empty (non-nil) slice. On success the slice is also non-nil, even when
// no ads were found.
func extract(r *http.Response) ([]*Ad, error) {
	found := []*Ad{}

	page, parseErr := goquery.NewDocumentFromResponse(r)
	if parseErr != nil {
		return found, parseErr
	}

	page.Find(".ads-ad").Each(func(idx int, node *goquery.Selection) {
		found = append(found, extractAd(idx+1, node))
	})

	return found, nil
}
// extractAd builds an Ad from a single ad element.
//
// pos is stored as the ad's Position. The headline is read from "h3 > a" and
// split by splitHead into H1 and H2; Path comes from ".ads-visurl cite" and
// Desc from ".ads-creative", both whitespace-trimmed. The markup collected by
// innerHTML for the description element is handed to SetRest, and the outer
// HTML of the whole ad element to SetRaw.
func extractAd(pos int, sel *goquery.Selection) *Ad {
	headline := sel.Find("h3 > a").Text()
	visibleURL := sel.Find(".ads-visurl cite").Text()
	creative := sel.Find(".ads-creative")

	result := &Ad{}
	result.Position = pos
	result.H1, result.H2 = splitHead(headline)
	result.Path = strings.TrimSpace(visibleURL)
	result.Desc = strings.TrimSpace(creative.Text())
	result.SetRest(innerHTML(creative))

	// The rendering error is ignored; whatever OuterHtml returned is stored.
	outer, _ := goquery.OuterHtml(sel)
	result.SetRaw(outer)

	return result
}
// splitHead splits an ad headline into its two parts at the first "-".
//
// Both parts have U+200E (left-to-right mark) removed and are then trimmed of
// surrounding whitespace; stripping the marks first means whitespace that
// sits next to a mark is trimmed as well. When head contains no "-", the
// whole cleaned headline is returned as h1 and h2 is empty, so a
// single-part headline is not lost.
func splitHead(head string) (h1 string, h2 string) {
	// With n == 2 and a non-empty separator SplitN always yields at least
	// one element, so parts[0] is safe even for an empty head.
	parts := strings.SplitN(head, "-", 2)

	h1 = strings.TrimSpace(normalize(parts[0]))
	if len(parts) > 1 {
		h2 = strings.TrimSpace(normalize(parts[1]))
	}
	return h1, h2
}

// normalize removes every U+200E (left-to-right mark) character from s.
func normalize(s string) string {
	return strings.Replace(s, "\u200e", "", -1)
}
// innerHTML returns the concatenated outer HTML of every sibling that follows
// sel (sel.NextAll()), each fragment whitespace-trimmed and joined with no
// separator.
//
// Note that, despite the name, this collects the markup after sel and not
// the markup inside it. The result is an empty string when sel has no
// following siblings.
func innerHTML(sel *goquery.Selection) string {
	fragments := sel.NextAll().Map(func(_ int, sibling *goquery.Selection) string {
		// The rendering error is ignored; whatever OuterHtml returned is used.
		markup, _ := goquery.OuterHtml(sibling)
		return strings.TrimSpace(markup)
	})
	return strings.Join(fragments, "")
}