-
Notifications
You must be signed in to change notification settings - Fork 1
/
puttyandpaint.rb
61 lines (51 loc) · 1.43 KB
/
puttyandpaint.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env ruby
# frozen_string_literal: true

require 'kimurai'
require 'json'
require 'ostruct' # OpenStruct is used below but is not autoloaded in modern Ruby

# Root directory for crawler output files.
# NOTE(review): the ENV key is 'CRAWLERS_DATA_DIR' (plural) while the constant
# is singular — looks intentional across the project's crawlers, but confirm.
CRAWLER_DATA_DIR = ENV['CRAWLERS_DATA_DIR']
LINK_DOMAIN = 'https://puttyandpaint.com/projects/'.freeze
FILE_NAME = 'puttyandpaint.json'.freeze
# Crawls puttyandpaint.com project listings and persists one JSON file per
# category, preserving the original scrapeDate of records seen on earlier runs.
class GoOutSpider < Kimurai::Base
  @name = 'GoOut_Spider'
  @engine = :mechanize
  @categories = %w[
    editors-choice
    all-projects
  ]
  # One start URL per category; :data carries that category's output path.
  @start_urls = @categories.map do |category|
    {
      url: "#{LINK_DOMAIN}#{category}",
      data: "#{CRAWLER_DATA_DIR}/#{category}_#{FILE_NAME}"
    }
  end

  # Builds a record for a single project card.
  # Returns an OpenStruct with :name, :link and :scrapeDate (all Strings).
  def extract_data(card)
    anchor = card.css('a')
    OpenStruct.new(
      # .to_s on both attributes: Nokogiri returns attribute *nodes*, and
      # load_data yields plain strings from JSON — keep the comparison in
      # parse like-for-like. (The original interpolated link but not name.)
      name: anchor.attribute('title').to_s,
      link: anchor.attribute('href').to_s,
      scrapeDate: Time.now.to_s
    )
  end

  # Loads previously scraped records from data_path.
  # Returns an Array of OpenStruct, or nil when the file does not exist yet.
  def load_data(data_path)
    return unless File.file?(data_path)

    JSON.parse(File.read(data_path)).map { |record| OpenStruct.new(record) }
  end

  # Creates the output directory on first use.
  # NOTE(review): Dir.mkdir does not create nested paths — assumes the parent
  # of CRAWLER_DATA_DIR already exists.
  def ensure_data_dir
    Dir.mkdir(CRAWLER_DATA_DIR) unless Dir.exist? CRAWLER_DATA_DIR
  end

  # Parses one listing page. Records whose link was already scraped keep
  # their original scrapeDate; only genuinely new records count as "updated".
  def parse(response, url)
    ensure_data_dir # hoisted out of the loop: once per page, not per card
    loaded = load_data url[:data]
    count_updated = 0
    response.css('.project-list li').each do |card|
      post = extract_data card
      # &. handles the first run, when no previous data file exists (nil).
      found = loaded&.detect { |item| item.link == post.link }
      if found
        post.scrapeDate = found.scrapeDate
      else
        count_updated += 1
      end
      save_to url[:data], post.to_h, format: :pretty_json
    end
    puts "Update #{count_updated} items."
  end
end
# Kick off the crawl only when executed as a script, so the spider class can
# be required by other code (or tests) without side effects.
GoOutSpider.crawl! if __FILE__ == $PROGRAM_NAME