Significant refactoring
including extra config settings, a proper rate limit, and a logger. Fixes: hartator#307, hartator#291, hartator#281, hartator#269, and probably others too.
StrawberryMaster committed Dec 3, 2024
1 parent eaff48f commit 45fa2be
Showing 2 changed files with 153 additions and 69 deletions.
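
A rough usage sketch of the new configuration surface mentioned in the commit message. The option keys (:base_url, :threads_count, :timeout, :maximum_pages) are the ones read by the reworked initialize in the diff below; the URL and values are placeholders, not part of the commit.

require 'wayback_machine_downloader'

downloader = WaybackMachineDownloader.new(
  base_url: "https://example.com",  # required; validate_params raises ArgumentError without it
  threads_count: 4,                 # clamped to a minimum of 1 thread
  timeout: 30,                      # falls back to DEFAULT_TIMEOUT (30s) when omitted
  maximum_pages: 100                # must be positive, per validate_params
)
downloader.download_files           # runs the pooled, rate-limited download loop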
1 change: 1 addition & 0 deletions Gemfile
@@ -1,3 +1,4 @@
source "https://rubygems.org"

gemspec
gem 'concurrent-ruby'
221 changes: 152 additions & 69 deletions lib/wayback_machine_downloader.rb
@@ -7,6 +7,8 @@
require 'cgi'
require 'json'
require 'time'
require 'concurrent'
require 'logger'
require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex'
require_relative 'wayback_machine_downloader/archive_api'
@@ -16,12 +18,20 @@ class WaybackMachineDownloader
include ArchiveAPI

VERSION = "2.3.2"
DEFAULT_TIMEOUT = 30
MAX_RETRIES = 3
RETRY_DELAY = 2
RATE_LIMIT = 0.25 # Delay between requests in seconds
CONNECTION_POOL_SIZE = 10
HTTP_CACHE_SIZE = 1000
MEMORY_BUFFER_SIZE = 16384 # 16KB chunks

attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
:all, :maximum_pages, :threads_count
:all, :maximum_pages, :threads_count, :logger

def initialize params
validate_params(params)
@base_url = params[:base_url]
@exact_url = params[:exact_url]
@directory = params[:directory]
@@ -32,7 +42,11 @@ def initialize params
@exclude_filter = params[:exclude_filter]
@all = params[:all]
@maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
@threads_count = params[:threads_count].to_i
@threads_count = [params[:threads_count].to_i, 1].max # Ensure a minimum of 1 thread
@timeout = params[:timeout] || DEFAULT_TIMEOUT
@logger = setup_logger
@http_cache = Concurrent::Map.new
@failed_downloads = Concurrent::Array.new
end

def backup_name
@@ -82,28 +96,30 @@ def match_exclude_filter file_url
end

def get_all_snapshots_to_consider
http = Net::HTTP.new("web.archive.org", 443)
http.use_ssl = true

http = setup_http_client
snapshot_list_to_consider = []

http.start do
puts "Getting snapshot pages"
begin
http.start do |connection|
puts "Getting snapshot pages"

# Fetch the initial set of snapshots
snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, http)
print "."
# Fetch the initial set of snapshots
snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, connection)
print "."

# Fetch additional pages if the exact URL flag is not set
unless @exact_url
@maximum_pages.times do |page_index|
snapshot_list = get_raw_list_from_api("#{@base_url}/*", page_index, http)
break if snapshot_list.empty?
# Fetch additional pages if the exact URL flag is not set
unless @exact_url
@maximum_pages.times do |page_index|
snapshot_list = get_raw_list_from_api("#{@base_url}/*", page_index, connection)
break if snapshot_list.empty?

snapshot_list_to_consider += snapshot_list
print "."
snapshot_list_to_consider += snapshot_list
print "."
end
end
end
ensure
http.finish if http.started?
end

puts " found #{snapshot_list_to_consider.length} snapshots to consider."
@@ -199,46 +215,49 @@ def list_files
def download_files
start_time = Time.now
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
puts


if file_list_by_timestamp.empty?
puts "No files to download."
puts "Possible reasons:"
puts "\t* Site is not in Wayback Machine Archive."
puts "\t* From timestamp too much in the future." if @from_timestamp and @from_timestamp != 0
puts "\t* To timestamp too much in the past." if @to_timestamp and @to_timestamp != 0
puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
return
end

puts "#{file_list_by_timestamp.count} files to download:"

threads = []
mutex = Mutex.new
total_files = file_list_by_timestamp.count
puts "#{total_files} files to download:"

@processed_file_count = 0
@threads_count = 1 unless @threads_count != 0
@threads_count.times do
threads << Thread.new do
http = Net::HTTP.new("web.archive.org", 443)
http.use_ssl = true

@download_mutex = Mutex.new

thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
pool = Concurrent::FixedThreadPool.new(thread_count)
semaphore = Concurrent::Semaphore.new(CONNECTION_POOL_SIZE)

file_list_by_timestamp.each do |file_remote_info|
pool.post do
semaphore.acquire
http = nil
begin
until file_queue.empty?
file_remote_info = nil
mutex.synchronize { file_remote_info = file_queue.pop(true) rescue nil }
download_file(file_remote_info, http) if file_remote_info
http = setup_http_client
http.start do |connection|
result = download_file(file_remote_info, connection)
@download_mutex.synchronize do
@processed_file_count += 1
puts result if result
end
end
ensure
http.finish if http.started?
semaphore.release
http&.finish if http&.started?
sleep(RATE_LIMIT)
end
end
end

threads.each(&:join)
pool.shutdown
pool.wait_for_termination

end_time = Time.now
puts
puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
puts "\nDownload completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path}"
cleanup
end

def structure_dir_path dir_path
@@ -288,38 +307,18 @@ def download_file (file_remote_info, http)
unless File.exist? file_path
begin
structure_dir_path dir_path
open(file_path, "wb") do |file|
begin
http.get(URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}")) do |body|
file.write(body)
end
rescue OpenURI::HTTPError => e
puts "#{file_url} # #{e}"
if @all
file.write(e.io.read)
puts "#{file_path} saved anyway."
end
rescue StandardError => e
puts "#{file_url} # #{e}"
end
end
download_with_retry(file_path, file_url, file_timestamp, http)
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
rescue StandardError => e
puts "#{file_url} # #{e}"
ensure
msg = "#{file_url} # #{e}"
if not @all and File.exist?(file_path) and File.size(file_path) == 0
File.delete(file_path)
puts "#{file_path} was empty and was removed."
msg += "\n#{file_path} was empty and was removed."
end
end
semaphore.synchronize do
@processed_file_count += 1
puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
msg
end
else
semaphore.synchronize do
@processed_file_count += 1
puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
end
"#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
end
end

@@ -334,4 +333,88 @@ def file_list_by_timestamp
def semaphore
@semaphore ||= Mutex.new
end

private

def validate_params(params)
raise ArgumentError, "Base URL is required" unless params[:base_url]
raise ArgumentError, "Maximum pages must be positive" if params[:maximum_pages] && params[:maximum_pages].to_i <= 0
# threads_count validation removed; it is now forced to be at least 1
end

def setup_logger
logger = Logger.new(STDOUT)
logger.level = ENV['DEBUG'] ? Logger::DEBUG : Logger::INFO
logger.formatter = proc do |severity, datetime, progname, msg|
"#{datetime.strftime('%Y-%m-%d %H:%M:%S')} [#{severity}] #{msg}\n"
end
logger
end

def setup_http_client
cached_client = @http_cache[Thread.current.object_id]
return cached_client if cached_client&.active?

http = Net::HTTP.new("web.archive.org", 443)
http.use_ssl = true
http.read_timeout = @timeout
http.open_timeout = @timeout
http.keep_alive_timeout = 30
http.max_retries = MAX_RETRIES

@http_cache[Thread.current.object_id] = http
http
end

def download_with_retry(file_path, file_url, file_timestamp, connection)
retries = 0
begin
request = Net::HTTP::Get.new(URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"))
request["Connection"] = "keep-alive"
request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"

response = connection.request(request)

case response
when Net::HTTPSuccess
File.open(file_path, "wb") do |file|
if block_given?
yield(response, file)
else
file.write(response.body)
end
end
when Net::HTTPTooManyRequests
sleep(RATE_LIMIT * 2)
raise "Rate limited, retrying..."
else
raise "HTTP Error: #{response.code} #{response.message}"
end

rescue StandardError => e
if retries < MAX_RETRIES
retries += 1
@logger.warn("Retry #{retries}/#{MAX_RETRIES} for #{file_url}: #{e.message}")
sleep(RETRY_DELAY * retries)
retry
else
@failed_downloads << {url: file_url, error: e.message}
raise e
end
end
end

def cleanup
@http_cache.each_value do |client|
client.finish if client&.started?
end
@http_cache.clear

if @failed_downloads.any?
@logger.error("Failed downloads summary:")
@failed_downloads.each do |failure|
@logger.error(" #{failure[:url]} - #{failure[:error]}")
end
end
end
end
