-
Notifications
You must be signed in to change notification settings - Fork 1
/
goout.rb
executable file
·74 lines (64 loc) · 1.84 KB
/
goout.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env ruby
require 'fileutils'
require 'json'
require 'ostruct'

require 'kimurai'
# Directory the per-category result files are written to.
# Read from the CRAWLERS_DATA_DIR environment variable; nil when it is unset.
CRAWLER_DATA_DIR = ENV.fetch('CRAWLERS_DATA_DIR', nil)

# Scheme and host used to build listing URLs and prefixed to scraped hrefs.
LINK_DOMAIN = 'https://goout.net'.freeze

# Query-string fragment appended to every category listing URL.
SORT_METHOD = 'sort=newly_announced'.freeze

# Shared suffix of each output file name (prefixed with the category name).
FILE_NAME = 'goout_newly_announced.json'.freeze
# Kimurai spider for the goout.net Prague listings.
#
# One start URL is generated per category (sorted by "newly announced").
# Every event card found on a listing page is stored as pretty-printed JSON
# in a per-category file under CRAWLER_DATA_DIR. When an event was already
# present in the previously stored file, its original scrapeDate is kept.
class GoOutSpider < Kimurai::Base
  @name = 'GoOut_Spider'
  @engine = :mechanize
  @categories = %w[
    events
    concerts
    plays
    exhibitions
    movies
    parties
    festivals
    culinary
    for-children
    other-events
  ]
  # Each entry pairs the listing URL with the file its results are saved to.
  @start_urls = @categories.map do |category|
    {
      url: "#{LINK_DOMAIN}/en/prague/#{category}/?#{SORT_METHOD}",
      data: "#{CRAWLER_DATA_DIR}/#{category}_#{FILE_NAME}"
    }
  end

  # Builds an event record from one card node.
  #
  # Attribute values are read as plain strings (or nil when the element or
  # attribute is missing) so the record holds only serializable values rather
  # than parser node objects.
  #
  # @param card [#css, #at_css] the card node (presumably a Nokogiri node —
  #   confirm against the engine in use)
  # @return [OpenStruct] name, link, venue, venueLink, dateTime, scrapeDate
  def create_card(card)
    venue_href = attribute_value(card, '.venue span[itemprop=geo]', 'data-venue-href')

    OpenStruct.new(
      name: card.css('span[itemprop=name].name').text.squish,
      link: "#{LINK_DOMAIN}#{attribute_value(card, 'a', 'href')}",
      venue: card.css('.venue span[itemprop=name]').text.squish,
      venueLink: "#{LINK_DOMAIN}#{venue_href}",
      dateTime: attribute_value(card, 'time', 'datetime'),
      scrapeDate: Time.now.to_s
    )
  end

  # Reads previously stored records from a JSON file.
  #
  # @param data_path [String] path of the JSON file
  # @return [Array<OpenStruct>, nil] the stored records, or nil when the file
  #   does not exist
  def load_data(data_path)
    return unless File.file?(data_path)

    JSON.parse(File.read(data_path)).map { |record| OpenStruct.new(record) }
  end

  # Creates CRAWLER_DATA_DIR, including missing parent directories.
  # Does nothing when the directory already exists.
  def ensure_data_dir
    FileUtils.mkdir_p(CRAWLER_DATA_DIR)
  end

  # Handles one listing page: saves every event card and reports how many
  # of them were not present in the previously stored file.
  #
  # @param response [#css] parsed listing page
  # @param url [Hash] despite the name this is indexed with :data, so it is
  #   expected to be the start-URL hash ({ url:, data: }) — confirm against
  #   how Kimurai invokes the handler
  def parse(response, url)
    data_path = url[:data]
    # Load the old records before the first save_to call touches the file.
    known = index_by_link(load_data(data_path))
    cards = response.css('.eventCard .info')
    # The directory only has to exist once per page, and only if we save.
    ensure_data_dir unless cards.empty?

    count_updated = 0
    cards.each do |card|
      event = create_card(card)
      previous = known[event.link]
      if previous
        # Already seen: keep the date of the first scrape.
        event.scrapeDate = previous.scrapeDate
      else
        count_updated += 1
      end
      save_to data_path, event.to_h, format: :pretty_json
    end
    puts "Update #{count_updated} items."
  end

  private

  # Value of attribute +name+ on the first node matching +selector+,
  # or nil when there is no such node or attribute.
  def attribute_value(card, selector, name)
    node = card.at_css(selector)
    node && node[name]
  end

  # Maps link => record for constant-time lookup; the first record wins when
  # a link occurs more than once. Accepts nil (no stored data).
  def index_by_link(records)
    Array(records).each_with_object({}) do |record, index|
      index[record.link] ||= record
    end
  end
end
# Script entry point: start the spider via its `crawl!` class method
# (not defined in this file; presumably provided by Kimurai::Base — confirm).
GoOutSpider.crawl!