From 0b901504c8385ea2ec80018ced2449f3665355e1 Mon Sep 17 00:00:00 2001 From: JingYu Ning Date: Wed, 1 Sep 2021 12:35:30 +0800 Subject: [PATCH 1/5] support url for kaggle --- src/kaggle.jl | 15 ++++++++++++++- test/kaggle.jl | 26 +++++++++++++++++++++++++- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/src/kaggle.jl b/src/kaggle.jl index 017da6e..b6d6206 100644 --- a/src/kaggle.jl +++ b/src/kaggle.jl @@ -1,5 +1,7 @@ export kdownload +const KAGGLE_DOMAIN = "www.kaggle.com" + struct Auth username::String key::String @@ -27,7 +29,18 @@ function gen_kaggle_url(dataset) return "https://www.kaggle.com/api/v1/datasets/download/$dataset" end -function kdownload(dataset, localdir) +is_kaggle_url(url) = contains(url, KAGGLE_DOMAIN) + +function kaggle_url2dataset(url) + user_name = match(Regex("$KAGGLE_DOMAIN/([^\\/]+)/"), url).captures[] + dataset_name = match(Regex("$user_name/([^\\/]+)"), url).captures[] + + return "$user_name/$dataset_name" +end + +function kdownload(url_or_dataset, localdir) + dataset = is_kaggle_url(url_or_dataset) ? 
kaggle_url2dataset(url_or_dataset) : url_or_dataset + url = gen_kaggle_url(dataset) filepath = joinpath(localdir, "$(replace(dataset, '/'=>'_')).zip") diff --git a/test/kaggle.jl b/test/kaggle.jl index c138b49..b2fc83e 100644 --- a/test/kaggle.jl +++ b/test/kaggle.jl @@ -1,4 +1,4 @@ -@testset "kaggle" begin +@testset "kaggle dataset" begin dataset = "ningjingyu/fetchtest" f = kdownload(dataset, pwd()) @@ -10,3 +10,27 @@ rm(joinpath(pwd(), "FetchTest"), recursive=true, force=true) end + +@testset "kaggle url" begin + urls = [ + "https://www.kaggle.com/ningjingyu/fetchtest", + "https://www.kaggle.com/ningjingyu/fetchtest/tasks", + "https://www.kaggle.com/ningjingyu/fetchtest/code", + "https://www.kaggle.com/ningjingyu/fetchtest/discussion", + "https://www.kaggle.com/ningjingyu/fetchtest/activity", + "https://www.kaggle.com/ningjingyu/fetchtest/metadata", + "https://www.kaggle.com/ningjingyu/fetchtest/settings", + ] + + for url in urls + f = kdownload(url, pwd()) + DataDeps.unpack(f) + + open(joinpath(pwd(), "FetchTest", "FetchTest.txt"), "r") do file + @test readline(file) == "Test" + end + + rm(joinpath(pwd(), "FetchTest"), recursive=true, force=true) + end +end + \ No newline at end of file From 20153a40daf3f6fc135805e7b49da48f9601f720 Mon Sep 17 00:00:00 2001 From: JingYu Ning Date: Wed, 1 Sep 2021 12:40:51 +0800 Subject: [PATCH 2/5] update readme --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 8635c8f..9c2180d 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,14 @@ dataset = "ningjingyu/fetchtest" kdownload(dataset, pwd()) ``` +And via url as well + +```julia +using Fetch +url = "https://www.kaggle.com/ningjingyu/fetchtest" +kdownload(url, pwd()) +``` + ## Intergrate with DataDeps.jl According to [DataDeps.jl](https://github.com/oxinabox/DataDeps.jl), From f89bbba4bde5d04229026c876cb326030fcd3009 Mon Sep 17 00:00:00 2001 From: JingYu Ning Date: Wed, 1 Sep 2021 13:36:57 +0800 Subject: [PATCH 3/5] refactor --- 
src/kaggle.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/kaggle.jl b/src/kaggle.jl index b6d6206..105461f 100644 --- a/src/kaggle.jl +++ b/src/kaggle.jl @@ -32,8 +32,7 @@ end is_kaggle_url(url) = contains(url, KAGGLE_DOMAIN) function kaggle_url2dataset(url) - user_name = match(Regex("$KAGGLE_DOMAIN/([^\\/]+)/"), url).captures[] - dataset_name = match(Regex("$user_name/([^\\/]+)"), url).captures[] + user_name, dataset_name = match(Regex("$KAGGLE_DOMAIN/([^/]+)/([^/]+)"), url).captures return "$user_name/$dataset_name" end From a0ed78ca240f99d02cef5ae48cdcddffbcf32fcb Mon Sep 17 00:00:00 2001 From: JingYu Ning Date: Wed, 1 Sep 2021 14:09:00 +0800 Subject: [PATCH 4/5] check ill-formed or non-kaggle url --- src/kaggle.jl | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/kaggle.jl b/src/kaggle.jl index 105461f..2043c78 100644 --- a/src/kaggle.jl +++ b/src/kaggle.jl @@ -1,6 +1,7 @@ export kdownload const KAGGLE_DOMAIN = "www.kaggle.com" +const KAGGLE_API = "https://www.kaggle.com/api/v1/datasets/download" struct Auth username::String @@ -26,19 +27,24 @@ function gen_auth_key() end function gen_kaggle_url(dataset) - return "https://www.kaggle.com/api/v1/datasets/download/$dataset" + return "$KAGGLE_API/$dataset" end -is_kaggle_url(url) = contains(url, KAGGLE_DOMAIN) +function kaggle_url2dataset(url_or_dataset) + if contains(url_or_dataset, KAGGLE_DOMAIN) + user_name, dataset_name = match(Regex("$KAGGLE_DOMAIN/([^/]+)/([^/]+)"), url_or_dataset).captures + dataset = "$user_name/$dataset_name" + else + dataset = url_or_dataset + end -function kaggle_url2dataset(url) - user_name, dataset_name = match(Regex("$KAGGLE_DOMAIN/([^/]+)/([^/]+)"), url).captures + @assert HTTP.request("HEAD", "https://$KAGGLE_DOMAIN/$dataset").status == 200 - return "$user_name/$dataset_name" + return dataset end function kdownload(url_or_dataset, localdir) - dataset = is_kaggle_url(url_or_dataset) ? 
kaggle_url2dataset(url_or_dataset) : url_or_dataset + dataset = kaggle_url2dataset(url_or_dataset) url = gen_kaggle_url(dataset) filepath = joinpath(localdir, "$(replace(dataset, '/'=>'_')).zip") From f3ee25039d9347c9d84eac912b17680acc3416dc Mon Sep 17 00:00:00 2001 From: JingYu Ning Date: Wed, 1 Sep 2021 14:10:34 +0800 Subject: [PATCH 5/5] fix style --- test/kaggle.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/kaggle.jl b/test/kaggle.jl index b2fc83e..e7571e8 100644 --- a/test/kaggle.jl +++ b/test/kaggle.jl @@ -33,4 +33,3 @@ end rm(joinpath(pwd(), "FetchTest"), recursive=true, force=true) end end - \ No newline at end of file