Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

For comment: Added a Cuprite/Ferrum driver for Chrome CDP support #53

Draft
wants to merge 8 commits into
base: master
Choose a base branch
from
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class GithubSpider < Kimurai::Base
}

def parse(response, url:, data: {})
response.xpath("//ul[@class='repo-list']/div//h3/a").each do |a|
response.xpath("//ul[@class='repo-list']//a[@class='v-align-middle']").each do |a|
request_to :parse_repo_page, url: absolute_url(a[:href], base: url)
end

Expand All @@ -36,7 +36,7 @@ class GithubSpider < Kimurai::Base
item[:repo_name] = response.xpath("//h1/strong[@itemprop='name']/a").text
item[:repo_url] = url
item[:description] = response.xpath("//span[@itemprop='about']").text.squish
item[:tags] = response.xpath("//div[@id='topics-list-container']/div/a").map { |a| a.text.squish }
item[:tags] = response.xpath("//div[starts-with(@class, 'list-topics-container')]/a").map { |a| a.text.squish }
item[:watch_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Watch')]/a[2]").text.squish
item[:star_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Star')]/a[2]").text.squish
item[:fork_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Fork')]/a[2]").text.squish
Expand Down Expand Up @@ -1344,7 +1344,7 @@ end # =>

So what if you don't care about stats and just want to send a request to a particular spider method and get the value returned by that method? Use `.parse!` instead:

#### `.parse!(:method_name, url:)` method
#### `.parse!(:method_name, url:, config: {})` method

`.parse!` (class method) creates a new spider instance and performs a request to the given method with the given url. The value returned by the method is passed back to the caller:

Expand All @@ -1361,6 +1361,8 @@ end

ExampleSpider.parse!(:parse, url: "https://example.com/")
# => "Example Domain"
# This is an example of overriding the config for a single call:
ExampleSpider.parse!(:parse, url: "https://example.com/", config: { before_request: { clear_and_set_cookies: true } } )
```

Like `.crawl!`, `.parse!` method takes care of a browser instance and kills it (`browser.destroy_driver!`) before returning the value. Unlike `.crawl!`, `.parse!` method can be called from different threads at the same time:
Expand Down
6 changes: 4 additions & 2 deletions kimurai.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,18 @@ Gem::Specification.new do |spec|
spec.add_dependency "capybara-mechanize"
spec.add_dependency "poltergeist"
spec.add_dependency "selenium-webdriver"
spec.add_dependency "cuprite"

spec.add_dependency "headless"
spec.add_dependency "pmap"

spec.add_dependency "addressable"
spec.add_dependency "whenever"

spec.add_dependency "rbcat", "~> 0.2"
spec.add_dependency "pry"

spec.add_development_dependency "bundler", "~> 1.16"
spec.add_development_dependency "rake", "~> 10.0"
spec.add_development_dependency "bundler", "~> 2.1"
spec.add_development_dependency "rake", "~> 13.0"
spec.add_development_dependency "minitest", "~> 5.0"
end
12 changes: 10 additions & 2 deletions lib/kimurai/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,13 @@ def self.crawl!(exception_on_fail: true)
end

def self.parse!(handler, *args, **request)
spider = self.new
if request.has_key? :config
config = request[:config]
request.delete :config
else
config = {}
end
spider = self.new config: config

if args.present?
spider.public_send(handler, *args)
Expand Down Expand Up @@ -201,7 +207,9 @@ def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
return unless visited

public_send(handler, browser.current_response(response_type), { url: url, data: data })
options = { url: url, data: data }

public_send(handler, browser.current_response(response_type), **options)
end

def console(response = nil, url: nil, data: {})
Expand Down
6 changes: 4 additions & 2 deletions lib/kimurai/base_helper.rb
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
require 'addressable/uri'

module Kimurai
module BaseHelper
private

def absolute_url(url, base:)
return unless url
URI.join(base, URI.escape(url)).to_s
URI.join(base, Addressable::URI.escape(url)).to_s
end

def escape_url(url)
uri = URI.parse(url)
rescue URI::InvalidURIError => e
URI.parse(URI.escape url).to_s rescue url
URI.parse(Addressable::URI.escape(url)).to_s rescue url
else
url
end
Expand Down
201 changes: 201 additions & 0 deletions lib/kimurai/browser_builder/cuprite_builder.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
require 'capybara'
require 'capybara/cuprite'
require_relative '../capybara_configuration'
require_relative '../capybara_ext/cuprite/driver'
require_relative '../capybara_ext/session'

module Kimurai::BrowserBuilder
  # Builds a Capybara session backed by the Cuprite (Chrome CDP) driver and
  # copies the session-level options from the spider config onto it.
  #
  # Only session-level options are applied (cookies, skip/retry request
  # errors, restart_if.requests_limit, before_request.* and encoding).
  # Driver-level options (window size, proxy, SSL, images, headers,
  # user agent, headless mode) are NOT applied yet: the driver is created
  # with `Capybara::Cuprite::Driver.new(app)` and no options.
  class CupriteBuilder
    class << self
      # Class-level slot for a shared virtual display. It is only referenced
      # by the commented-out headless code in #build.
      attr_accessor :virtual_display
    end

    attr_reader :logger, :spider

    # @param config [Hash] spider config, read with symbol keys
    # @param spider [Object] spider instance; its `logger` is used for all
    #   builder log output
    def initialize(config, spider:)
      @config = config
      @spider = spider
      @logger = spider.logger
    end

    # Registers the :cuprite Capybara driver, creates a session for it and
    # applies the supported config options.
    #
    # @return [Capybara::Session] the configured browser session
    def build
      # Register driver
      Capybara.register_driver :cuprite do |app|
        # TODO: everything commented out below is the Selenium implementation,
        # kept as a porting reference. None of it is applied to Cuprite yet.

        # Create driver options
        # opts = { args: %w[--disable-gpu --no-sandbox --disable-translate] }

        # Provide custom chrome browser path:
        # if chrome_path = Kimurai.configuration.selenium_chrome_path
        #   opts.merge!(binary: chrome_path)
        # end

        # See all options here: https://seleniumhq.github.io/selenium/docs/api/rb/Selenium/WebDriver/Chrome/Options.html
        # driver_options = Selenium::WebDriver::Chrome::Options.new(opts)

        # Window size
        # if size = @config[:window_size].presence
        #   driver_options.args << "--window-size=#{size.join(',')}"
        #   logger.debug "BrowserBuilder (cuprite): enabled window_size"
        # end

        # Proxy
        # if proxy = @config[:proxy].presence
        #   proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
        #   ip, port, type, user, password = proxy_string.split(":")
        #
        #   if %w(http socks5).include?(type)
        #     if user.nil? && password.nil?
        #       driver_options.args << "--proxy-server=#{type}://#{ip}:#{port}"
        #       logger.debug "BrowserBuilder (cuprite): enabled #{type} proxy, ip: #{ip}, port: #{port}"
        #     else
        #       logger.error "BrowserBuilder (cuprite): proxy with authentication doesn't supported by selenium, skipped"
        #     end
        #   else
        #     logger.error "BrowserBuilder (cuprite): wrong type of proxy: #{type}, skipped"
        #   end
        # end
        #
        # if proxy_bypass_list = @config[:proxy_bypass_list].presence
        #   if proxy
        #     driver_options.args << "--proxy-bypass-list=#{proxy_bypass_list.join(';')}"
        #     logger.debug "BrowserBuilder (cuprite): enabled proxy_bypass_list"
        #   else
        #     logger.error "BrowserBuilder (cuprite): provide `proxy` to set proxy_bypass_list, skipped"
        #   end
        # end

        # SSL
        # if @config[:ignore_ssl_errors].present?
        #   driver_options.args << "--ignore-certificate-errors"
        #   driver_options.args << "--allow-insecure-localhost"
        #   logger.debug "BrowserBuilder (cuprite): enabled ignore_ssl_errors"
        # end

        # Disable images
        # if @config[:disable_images].present?
        #   driver_options.prefs["profile.managed_default_content_settings.images"] = 2
        #   logger.debug "BrowserBuilder (cuprite): enabled disable_images"
        # end

        # Headers
        # if @config[:headers].present?
        #   logger.warn "BrowserBuilder: (selenium_chrome): custom headers doesn't supported by selenium, skipped"
        # end

        # if user_agent = @config[:user_agent].presence
        #   user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
        #   driver_options.args << "--user-agent='#{user_agent_string}'"
        #   logger.debug "BrowserBuilder (cuprite): enabled custom user_agent"
        # end

        # Headless mode
        # if ENV["HEADLESS"] != "false"
        #   if @config[:headless_mode] == :virtual_display
        #     if Gem::Platform.local.os == "linux"
        #       unless self.class.virtual_display
        #         require 'headless'
        #         self.class.virtual_display = Headless.new(reuse: true, destroy_at_exit: false)
        #         self.class.virtual_display.start
        #       end
        #
        #       logger.debug "BrowserBuilder (cuprite): enabled virtual_display headless_mode"
        #     else
        #       logger.error "BrowserBuilder (cuprite): virtual_display headless_mode works only " \
        #         "on Linux platform. Browser will run in normal mode. Set `native` mode instead."
        #     end
        #   else
        #     driver_options.args << "--headless"
        #     logger.debug "BrowserBuilder (cuprite): enabled native headless_mode"
        #   end
        # end

        # chromedriver_path = Kimurai.configuration.chromedriver_path || "/usr/local/bin/chromedriver"
        # service = Selenium::WebDriver::Service.chrome(path: chromedriver_path)
        # Capybara::Selenium::Driver.new(app, browser: :chrome, options: driver_options, service: service)
        # Capybara::Cuprite::Driver.new(app, window_size: window_size[1200, 800])

        # The driver is created without options, so Cuprite's own defaults apply.
        Capybara::Cuprite::Driver.new(app)
      end

      # Create browser instance (Capybara session)
      @browser = Capybara::Session.new(:cuprite)
      @browser.spider = spider
      logger.debug "BrowserBuilder (cuprite): created browser instance"

      # if @config[:extensions].present?
      #   logger.error "BrowserBuilder (cuprite): `extensions` option not supported by Selenium, skipped"
      # end

      # Cookies
      if cookies = @config[:cookies].presence
        @browser.config.cookies = cookies
        logger.debug "BrowserBuilder (cuprite): enabled custom cookies"
      end

      # Browser instance options
      # skip_request_errors
      if skip_errors = @config[:skip_request_errors].presence
        @browser.config.skip_request_errors = skip_errors
        logger.debug "BrowserBuilder (cuprite): enabled skip_request_errors"
      end

      # retry_request_errors
      if retry_errors = @config[:retry_request_errors].presence
        @browser.config.retry_request_errors = retry_errors
        logger.debug "BrowserBuilder (cuprite): enabled retry_request_errors"
      end

      # restart_if
      if requests_limit = @config.dig(:restart_if, :requests_limit).presence
        @browser.config.restart_if[:requests_limit] = requests_limit
        logger.debug "BrowserBuilder (cuprite): enabled restart_if.requests_limit >= #{requests_limit}"
      end

      # restart_if.memory_limit is left disabled for this driver.
      # if memory_limit = @config.dig(:restart_if, :memory_limit).presence
      #   @browser.config.restart_if[:memory_limit] = memory_limit
      #   logger.debug "BrowserBuilder (cuprite): enabled restart_if.memory_limit >= #{memory_limit}"
      # end

      # before_request clear_cookies
      if @config.dig(:before_request, :clear_cookies)
        @browser.config.before_request[:clear_cookies] = true
        logger.debug "BrowserBuilder (cuprite): enabled before_request.clear_cookies"
      end

      # before_request clear_and_set_cookies (requires cookies to be configured)
      if @config.dig(:before_request, :clear_and_set_cookies)
        if cookies = @config[:cookies].presence
          @browser.config.cookies = cookies
          @browser.config.before_request[:clear_and_set_cookies] = true
          logger.debug "BrowserBuilder (cuprite): enabled before_request.clear_and_set_cookies"
        else
          logger.error "BrowserBuilder (cuprite): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
        end
      end

      # before_request change_user_agent
      # The message no longer names Selenium: this builder simply does not
      # implement the option.
      if @config.dig(:before_request, :change_user_agent)
        logger.error "BrowserBuilder (cuprite): before_request.change_user_agent option is not supported by the cuprite builder yet, skipped"
      end

      # before_request change_proxy
      if @config.dig(:before_request, :change_proxy)
        logger.error "BrowserBuilder (cuprite): before_request.change_proxy option is not supported by the cuprite builder yet, skipped"
      end

      # before_request delay
      if delay = @config.dig(:before_request, :delay).presence
        @browser.config.before_request[:delay] = delay
        logger.debug "BrowserBuilder (cuprite): enabled before_request.delay"
      end

      # encoding
      if encoding = @config[:encoding]
        @browser.config.encoding = encoding
        logger.debug "BrowserBuilder (cuprite): enabled encoding: #{encoding}"
      end

      # return Capybara session instance
      @browser
    end
  end
end
9 changes: 9 additions & 0 deletions lib/kimurai/capybara_ext/cuprite/driver.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
require_relative '../driver/base'

# Reopens the Cuprite driver class to add a `current_memory` method.
# NOTE(review): presumably this mirrors a method the other Kimurai driver
# extensions provide — confirm against ../driver/base.
module Capybara::Cuprite
class Driver
# Always returns nil; no memory measurement is performed for this driver.
# NOTE(review): the cuprite builder leaves restart_if.memory_limit commented
# out, so a nil value is presumably never compared to a limit — confirm
# against the session extension.
def current_memory
nil
end
end
end
2 changes: 1 addition & 1 deletion lib/kimurai/capybara_ext/mechanize/driver.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
require_relative '../driver/base'

class Capybara::Mechanize::Driver
# Extend capybara-mechnize to support Poltergeist-like methods
# Extend capybara-mechanize to support Poltergeist-like methods
# https://www.rubydoc.info/gems/poltergeist/Capybara/Poltergeist/Driver

def set_proxy(ip, port, type, user = nil, password = nil)
Expand Down
2 changes: 1 addition & 1 deletion lib/kimurai/version.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module Kimurai
VERSION = "1.4.0"
VERSION = "1.5"
end