Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement req_perform_spider() #584

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 149 additions & 0 deletions R/spider.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
#' @param hash_key A function with argument `req` that returns the components
#' of the request that should be used for computing equality. By default,
#' `hash_key` inspects the `url`, `body`, and `headers`, which should be
#' adequate for most needs.
#' @param progress_label A 1function with `req` that returns a string used to
#' label the progress bar. The default displays the URL which is most useful
#' for spidering HTML sites.
#' @examples
#' url <- "https://ggplot2.tidyverse.org/"
#' req <- request(url)
#' req_perform_spider(req, next_reqs = spider_descendents(url))
req_perform_spider <- function(
req,
next_reqs,
path = NULL,
on_error = c("stop", "return", "continue"),
hash_key = NULL,
progress = TRUE,
progress_label = NULL
) {

check_request(req)
check_function2(next_reqs, args = c("resp", "req"))
check_string(path, allow_empty = FALSE, allow_null = TRUE)
on_error <- match.arg(on_error)
check_function2(hash_key, args = "req", allow_null = TRUE)
check_function2(progress_label, args = "req", allow_null = TRUE)
check_bool(progress)

Check warning on line 28 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L22-L28

Added lines #L22 - L28 were not covered by tests

hash_key <- hash_key %||% function(req) req[c("url", "body", "headers")]
progress_label <- progress_label %||% function(req) req$url

Check warning on line 31 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L30-L31

Added lines #L30 - L31 were not covered by tests

get_path <- function(hash) {
if (is.null(path)) {
NULL
} else {
glue::glue(path)
}
}

Check warning on line 39 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L33-L39

Added lines #L33 - L39 were not covered by tests

todo <- fastmap::fastqueue()
done <- fastmap::fastmap()
seen <- fastmap::fastmap()

Check warning on line 43 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L41-L43

Added lines #L41 - L43 were not covered by tests

todo$add(req)

Check warning on line 45 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L45

Added line #L45 was not covered by tests

if (progress) {
cli::cli_progress_bar(
type = "custom",
total = NA,
format = "Spidering {done$size()}/{done$size() + todo$size()}: {progress_label(req)}"
)

Check warning on line 52 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L47-L52

Added lines #L47 - L52 were not covered by tests
}

while (todo$size() > 0) {
req <- todo$remove()
if (progress) cli::cli_progress_update()

Check warning on line 57 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L55-L57

Added lines #L55 - L57 were not covered by tests

req_hash <- hash(hash_key(req))
resp <- req_perform(req, path = get_path(req_hash))
done$set(req_hash, resp)
seen$set(req_hash, TRUE)

Check warning on line 62 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L59-L62

Added lines #L59 - L62 were not covered by tests

up_next <- next_reqs(req, resp)
for (req in up_next) {
req_hash <- hash(hash_key(req))
if (!seen$has(req_hash)) {
seen$set(req_hash, TRUE)
todo$add(req)
}
}
}

Check warning on line 72 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L64-L72

Added lines #L64 - L72 were not covered by tests

unname(done$as_list())

Check warning on line 74 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L74

Added line #L74 was not covered by tests
}


#' @export
#' @rdname req_perform_spider
spider_descendents <- function(home_url) {
force(home_url)
function(req, resp) {
html <- resp_body_html(resp)

Check warning on line 83 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L81-L83

Added lines #L81 - L83 were not covered by tests

a <- xml2::xml_find_all(html, "//a[@href]")
href <- xml2::xml_attr(a, "href")
href <- xml2::url_absolute(href, resp_url(resp))
href <- href[map_lgl(href, can_parse)]
href <- map_chr(href, strip_fragment)
href <- unique(href)

Check warning on line 90 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L85-L90

Added lines #L85 - L90 were not covered by tests

descendents <- href[map_lgl(href, url_is_child, home_url)]

Check warning on line 92 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L92

Added line #L92 was not covered by tests

map(descendents, function(path) req_url(req, path))
}

Check warning on line 95 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L94-L95

Added lines #L94 - L95 were not covered by tests
}

url_is_child <- function(child, parent) {
parent <- url_parse(parent)
child <- url_parse(child)

Check warning on line 100 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L99-L100

Added lines #L99 - L100 were not covered by tests

identical(child$scheme, parent$scheme) &&
identical(child$hostname, parent$hostname) &&
identical(child$port, parent$port) &&
path_is_child(child$path, parent$path)

Check warning on line 105 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L102-L105

Added lines #L102 - L105 were not covered by tests
}

# path_is_child("/foo2", "/foo")
# path_is_child("/foo/bar", "/foo")
path_is_child <- function(child, parent) {
parent <- normalize_path(parent)
child <- normalize_path(child)

Check warning on line 112 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L111-L112

Added lines #L111 - L112 were not covered by tests

if (startsWith(child, parent)) {
if (nchar(child) > nchar(parent)) {
i <- nchar(parent) + 1
substring(child, i, i) == "/"

Check warning on line 117 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L114-L117

Added lines #L114 - L117 were not covered by tests
} else {
FALSE

Check warning on line 119 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L119

Added line #L119 was not covered by tests
}
} else {
FALSE

Check warning on line 122 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L122

Added line #L122 was not covered by tests
}
}

normalize_path <- function(path) {
# strip index.html and friends
path <- sub("(index|default)\\.[a-z]+$", "", path, ignore.case = TRUE)

Check warning on line 128 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L128

Added line #L128 was not covered by tests
# strip trailing /
path <- sub("/$", "", path)

Check warning on line 130 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L130

Added line #L130 was not covered by tests
# url_parse ensures it always starts with /
path

Check warning on line 132 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L132

Added line #L132 was not covered by tests
}

strip_fragment <- function(url) {
url <- url_parse(url)
url$fragment <- NULL
url_build(url)

Check warning on line 138 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L136-L138

Added lines #L136 - L138 were not covered by tests
}

can_parse <- function(url) {
tryCatch(
{
url_parse(url)
TRUE
},
error = function(cnd) FALSE
)

Check warning on line 148 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L142-L148

Added lines #L142 - L148 were not covered by tests
}
Loading