From 5bf323096787a152743efd4856194f9de612f00e Mon Sep 17 00:00:00 2001 From: postables Date: Fri, 5 Jun 2020 14:54:24 -0700 Subject: [PATCH 01/18] tools/replicator: adds WIP data replicator --- tools/replicator/README.md | 24 +++ .../configs/follow_peer/service.json | 138 ++++++++++++++++++ .../configs/trusted_peer/service.json | 138 ++++++++++++++++++ tools/replicator/install_cluster_linux.sh | 25 ++++ 4 files changed, 325 insertions(+) create mode 100644 tools/replicator/README.md create mode 100644 tools/replicator/configs/follow_peer/service.json create mode 100644 tools/replicator/configs/trusted_peer/service.json create mode 100755 tools/replicator/install_cluster_linux.sh diff --git a/tools/replicator/README.md b/tools/replicator/README.md new file mode 100644 index 000000000..bf4c9f8ba --- /dev/null +++ b/tools/replicator/README.md @@ -0,0 +1,24 @@ +# replicator + +> **WARNING**: If you value your privacy and anonymity, do not participate in this cluster. Participating in this cluster, **even** from behind an anonymization tool like Tor or a VPN, will likely lead to your real-life identity being revealed, as IPFS is incredibly self-doxxing. If you want to participate in this cluster while preserving your privacy and anonymity, do so from a cloud-based VPS, ideally paid for with anonymous cryptocurrency. + +The `replicator` tool allows anyone to easily mirror a public set of data on IPFS. It consists of spinning up a lightweight IPFS node, along with a lightweight IPFS Cluster client that follows a CRDT topic published to by a set of trusted peers. The trusted peers are responsible for updating the "follow list", a set of IPFS CIDs that are replicated by the cluster. Anyone following this cluster will pin the data in the follow list locally. + +# install + +If you run a 64-bit version of Linux you can use the `install_cluster_linux.sh` bash script to install the needed components. If you don't run a 64-bit version of Linux, you should update the script to work on your platform and then use it; otherwise, please see the following URLs: + +* [ipfs-cluster-ctl](https://dist.ipfs.io/#ipfs-cluster-ctl) +* [ipfs-cluster-follow](https://dist.ipfs.io/#ipfs-cluster-follow) +* [ipfs-cluster-service](https://dist.ipfs.io/#ipfs-cluster-service) + + +To install the cluster tooling on 64-bit Linux with the aforementioned script, invoke it as follows: + +```shell +$> install_cluster_linux.sh linux-64bit +``` + +# usage + +This folder contains the needed files and configurations for anyone to start a follow peer, or run their own follow cluster acting as a trusted peer. The trusted peer setup is a little more difficult, and requires running both go-ipfs and ipfs-cluster.
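To make the follow mechanics concrete, here is a minimal Go sketch of what "pinning the follow list locally" boils down to. It is illustrative only: it assumes the github.com/ipfs/go-ipfs-api client and a go-ipfs node on the default API port, and the CID is a placeholder rather than a real entry from the cluster's follow list.

```go
package main

import (
	"fmt"
	"log"

	shell "github.com/ipfs/go-ipfs-api"
)

func main() {
	// connect to the local go-ipfs HTTP API
	sh := shell.NewShell("localhost:5001")
	// placeholder follow list; the real list is maintained by the trusted peers
	followList := []string{
		"QmYwAPJzv5CZsnA625s3Xf2nemtYgPpHdWEz79ojWnPbdG",
	}
	for _, cid := range followList {
		// pin each CID so the data is replicated on, and kept by, the local node
		if err := sh.Pin(cid); err != nil {
			log.Fatalf("failed to pin %s: %v", cid, err)
		}
		fmt.Println("pinned", cid)
	}
}
```

In practice `ipfs-cluster-follow` performs this pinning automatically; the sketch only illustrates the end result on the local node.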
\ No newline at end of file diff --git a/tools/replicator/configs/follow_peer/service.json b/tools/replicator/configs/follow_peer/service.json new file mode 100644 index 000000000..e57e9a917 --- /dev/null +++ b/tools/replicator/configs/follow_peer/service.json @@ -0,0 +1,138 @@ +{ + "cluster": { + "peername": "2020pb-dataset", + "secret": "7ea8e27fabf85dda89da620d61d8701b34649fce1e83e22a72dea60e5dd262b5", + "leave_on_shutdown": false, + "listen_multiaddress": [ + "/ip4/0.0.0.0/tcp/9096", + "/ip4/0.0.0.0/udp/9096/quic" + ], + "enable_relay_hop": true, + "connection_manager": { + "high_water": 400, + "low_water": 100, + "grace_period": "2m0s" + }, + "state_sync_interval": "5m0s", + "pin_recover_interval": "12m0s", + "replication_factor_min": -1, + "replication_factor_max": -1, + "monitor_ping_interval": "15s", + "peer_watch_interval": "5s", + "mdns_interval": "10s", + "disable_repinning": false, + "peer_addresses": [] + }, + "consensus": { + "crdt": { + "cluster_name": "cord19-dataset", + "trusted_peers": [ + "12D3KooWPT1tJCyZ1zW35Zt7gMRYwTP2J3hmTpCV6a4m7SjmwNUo" + ] + } + }, + "api": { + "ipfsproxy": { + "listen_multiaddress": "/ip4/127.0.0.1/tcp/9095", + "node_multiaddress": "/ip4/127.0.0.1/tcp/5001", + "log_file": "", + "read_timeout": "0s", + "read_header_timeout": "5s", + "write_timeout": "0s", + "idle_timeout": "1m0s", + "max_header_bytes": 4096 + }, + "restapi": { + "http_listen_multiaddress": "/ip4/127.0.0.1/tcp/9094", + "read_timeout": "0s", + "read_header_timeout": "5s", + "write_timeout": "0s", + "idle_timeout": "2m0s", + "max_header_bytes": 4096, + "basic_auth_credentials": null, + "http_log_file": "", + "headers": {}, + "cors_allowed_origins": [ + "*" + ], + "cors_allowed_methods": [ + "GET" + ], + "cors_allowed_headers": [], + "cors_exposed_headers": [ + "Content-Type", + "X-Stream-Output", + "X-Chunked-Output", + "X-Content-Length" + ], + "cors_allow_credentials": true, + "cors_max_age": "0s" + } + }, + "ipfs_connector": { + "ipfshttp": { + "node_multiaddress": "/ip4/127.0.0.1/tcp/5001", + "connect_swarms_delay": "30s", + "ipfs_request_timeout": "5m0s", + "pin_timeout": "24h0m0s", + "unpin_timeout": "3h0m0s", + "repogc_timeout": "24h0m0s" + } + }, + "pin_tracker": { + "stateless": { + "concurrent_pins": 10 + } + }, + "monitor": { + "pubsubmon": { + "check_interval": "15s", + "failure_threshold": 3 + } + }, + "informer": { + "disk": { + "metric_ttl": "30s", + "metric_type": "freespace" + } + }, + "observations": { + "metrics": { + "enable_stats": false, + "prometheus_endpoint": "/ip4/127.0.0.1/tcp/8888", + "reporting_interval": "2s" + }, + "tracing": { + "enable_tracing": false, + "jaeger_agent_endpoint": "/ip4/0.0.0.0/udp/6831", + "sampling_prob": 0.3, + "service_name": "cluster-daemon" + } + }, + "datastore": { + "badger": { + "badger_options": { + "dir": "", + "value_dir": "", + "sync_writes": true, + "table_loading_mode": 2, + "value_log_loading_mode": 2, + "num_versions_to_keep": 1, + "max_table_size": 67108864, + "level_size_multiplier": 10, + "max_levels": 7, + "value_threshold": 32, + "num_memtables": 5, + "num_level_zero_tables": 5, + "num_level_zero_tables_stall": 10, + "level_one_size": 268435456, + "value_log_file_size": 1073741823, + "value_log_max_entries": 1000000, + "num_compactors": 2, + "compact_l_0_on_close": false, + "read_only": false, + "truncate": false + } + } + } + } \ No newline at end of file diff --git a/tools/replicator/configs/trusted_peer/service.json b/tools/replicator/configs/trusted_peer/service.json new file mode 100644 index 
000000000..e57e9a917 --- /dev/null +++ b/tools/replicator/configs/trusted_peer/service.json @@ -0,0 +1,138 @@ +{ + "cluster": { + "peername": "2020pb-dataset", + "secret": "7ea8e27fabf85dda89da620d61d8701b34649fce1e83e22a72dea60e5dd262b5", + "leave_on_shutdown": false, + "listen_multiaddress": [ + "/ip4/0.0.0.0/tcp/9096", + "/ip4/0.0.0.0/udp/9096/quic" + ], + "enable_relay_hop": true, + "connection_manager": { + "high_water": 400, + "low_water": 100, + "grace_period": "2m0s" + }, + "state_sync_interval": "5m0s", + "pin_recover_interval": "12m0s", + "replication_factor_min": -1, + "replication_factor_max": -1, + "monitor_ping_interval": "15s", + "peer_watch_interval": "5s", + "mdns_interval": "10s", + "disable_repinning": false, + "peer_addresses": [] + }, + "consensus": { + "crdt": { + "cluster_name": "cord19-dataset", + "trusted_peers": [ + "12D3KooWPT1tJCyZ1zW35Zt7gMRYwTP2J3hmTpCV6a4m7SjmwNUo" + ] + } + }, + "api": { + "ipfsproxy": { + "listen_multiaddress": "/ip4/127.0.0.1/tcp/9095", + "node_multiaddress": "/ip4/127.0.0.1/tcp/5001", + "log_file": "", + "read_timeout": "0s", + "read_header_timeout": "5s", + "write_timeout": "0s", + "idle_timeout": "1m0s", + "max_header_bytes": 4096 + }, + "restapi": { + "http_listen_multiaddress": "/ip4/127.0.0.1/tcp/9094", + "read_timeout": "0s", + "read_header_timeout": "5s", + "write_timeout": "0s", + "idle_timeout": "2m0s", + "max_header_bytes": 4096, + "basic_auth_credentials": null, + "http_log_file": "", + "headers": {}, + "cors_allowed_origins": [ + "*" + ], + "cors_allowed_methods": [ + "GET" + ], + "cors_allowed_headers": [], + "cors_exposed_headers": [ + "Content-Type", + "X-Stream-Output", + "X-Chunked-Output", + "X-Content-Length" + ], + "cors_allow_credentials": true, + "cors_max_age": "0s" + } + }, + "ipfs_connector": { + "ipfshttp": { + "node_multiaddress": "/ip4/127.0.0.1/tcp/5001", + "connect_swarms_delay": "30s", + "ipfs_request_timeout": "5m0s", + "pin_timeout": "24h0m0s", + "unpin_timeout": "3h0m0s", + "repogc_timeout": "24h0m0s" + } + }, + "pin_tracker": { + "stateless": { + "concurrent_pins": 10 + } + }, + "monitor": { + "pubsubmon": { + "check_interval": "15s", + "failure_threshold": 3 + } + }, + "informer": { + "disk": { + "metric_ttl": "30s", + "metric_type": "freespace" + } + }, + "observations": { + "metrics": { + "enable_stats": false, + "prometheus_endpoint": "/ip4/127.0.0.1/tcp/8888", + "reporting_interval": "2s" + }, + "tracing": { + "enable_tracing": false, + "jaeger_agent_endpoint": "/ip4/0.0.0.0/udp/6831", + "sampling_prob": 0.3, + "service_name": "cluster-daemon" + } + }, + "datastore": { + "badger": { + "badger_options": { + "dir": "", + "value_dir": "", + "sync_writes": true, + "table_loading_mode": 2, + "value_log_loading_mode": 2, + "num_versions_to_keep": 1, + "max_table_size": 67108864, + "level_size_multiplier": 10, + "max_levels": 7, + "value_threshold": 32, + "num_memtables": 5, + "num_level_zero_tables": 5, + "num_level_zero_tables_stall": 10, + "level_one_size": 268435456, + "value_log_file_size": 1073741823, + "value_log_max_entries": 1000000, + "num_compactors": 2, + "compact_l_0_on_close": false, + "read_only": false, + "truncate": false + } + } + } + } \ No newline at end of file diff --git a/tools/replicator/install_cluster_linux.sh b/tools/replicator/install_cluster_linux.sh new file mode 100755 index 000000000..c56e89c71 --- /dev/null +++ b/tools/replicator/install_cluster_linux.sh @@ -0,0 +1,25 @@ +#! 
/bin/bash + +# yoinked from https://github.com/RTradeLtd/cord19-collaborative-cluster/blob/master/scripts/install_cluster.sh +# multi-platform cluster download script, only supports linux 64-bit right now + +VERSION="v0.12.1" +OS="" +case "$1" in + linux-64bit) + OS="linux-amd64" + wget "https://dist.ipfs.io/ipfs-cluster-service/${VERSION}/ipfs-cluster-service_${VERSION}_${OS}.tar.gz" + wget "https://dist.ipfs.io/ipfs-cluster-ctl/${VERSION}/ipfs-cluster-ctl_${VERSION}_${OS}.tar.gz" + wget "https://dist.ipfs.io/ipfs-cluster-follow/${VERSION}/ipfs-cluster-follow_${VERSION}_${OS}.tar.gz" + tar zxvf "ipfs-cluster-service_${VERSION}_${OS}.tar.gz" + tar zxvf "ipfs-cluster-ctl_${VERSION}_${OS}.tar.gz" + tar zxvf "ipfs-cluster-follow_${VERSION}_${OS}.tar.gz" + (cd ipfs-cluster-service && sudo cp ipfs-cluster-service /usr/local/bin) + (cd ipfs-cluster-ctl && sudo cp ipfs-cluster-ctl /usr/local/bin) + (cd ipfs-cluster-follow && sudo cp ipfs-cluster-follow /usr/local/bin) + ;; + *) + echo "unsupported os" + exit 2 + ;; +esac \ No newline at end of file From 999573553ec19e451222f3cd24338372fd83507d Mon Sep 17 00:00:00 2001 From: postables Date: Fri, 5 Jun 2020 15:44:26 -0700 Subject: [PATCH 02/18] tools/replicator: add trusted and follower configurations --- tools/replicator/Makefile | 4 + tools/replicator/README.md | 29 ++- .../configs/follow_peer/service.json | 231 ++++++++---------- .../configs/trusted_peer/service.json | 16 +- .../{ => scripts}/install_cluster_linux.sh | 2 + 5 files changed, 140 insertions(+), 142 deletions(-) create mode 100644 tools/replicator/Makefile rename tools/replicator/{ => scripts}/install_cluster_linux.sh (92%) diff --git a/tools/replicator/Makefile b/tools/replicator/Makefile new file mode 100644 index 000000000..a36a548cc --- /dev/null +++ b/tools/replicator/Makefile @@ -0,0 +1,4 @@ +.PHONY: cluster-first-start +cluster-first-start: + ipfs-cluster-service init --consensus crdt + ipfs-cluster-service daemon \ No newline at end of file diff --git a/tools/replicator/README.md b/tools/replicator/README.md index bf4c9f8ba..033ab6871 100644 --- a/tools/replicator/README.md +++ b/tools/replicator/README.md @@ -21,4 +21,31 @@ $> install_cluster_linux.sh linux-64bit # usage -This folder contains the needed files and configurations for anyone to start a follow peer, or run their own follow cluster acting as a trusted peer. The trusted peer setup is a little more difficult, and requires running both go-ipfs and ipfs-cluster. \ No newline at end of file +This folder contains the needed files and configurations for anyone to start a follow peer, or run their own follow cluster acting as a trusted peer. The trusted peer setup is a little more difficult, and requires running both go-ipfs and ipfs-cluster. + +## trusted peer + +> [for more information click this link](https://cluster.ipfs.io/documentation/collaborative/setup/) + +If using a fresh install of IPFS Cluster, you will want to run the following two commands, which initialize the configuration and start the daemon for the first time.
This will print some important information that you should note, namely: + + * Generated cluster secret + * Peer ID to use as a trusted peer + * The multiaddress on which it will be reachable by others + +First ensure that you have a valid go-ipfs instance up and running on the machine you are going to use, and run the following two commands: + +```shell +$> ipfs-cluster-service init --consensus crdt +$> ipfs-cluster-service daemon +``` + +## follow peer + +First ensure that you have a valid go-ipfs instance up and running on the machine you are using, and run the following command: + +```shell +$> ipfs-cluster-follow 2020pb-dataset run --init 2020pb.temporal.cloud +``` + +This will start the cluster follow peer and begin replicating the cluster data. \ No newline at end of file diff --git a/tools/replicator/configs/follow_peer/service.json b/tools/replicator/configs/follow_peer/service.json index e57e9a917..9f71d7a31 100644 --- a/tools/replicator/configs/follow_peer/service.json +++ b/tools/replicator/configs/follow_peer/service.json @@ -1,138 +1,103 @@ { - "cluster": { - "peername": "2020pb-dataset", - "secret": "7ea8e27fabf85dda89da620d61d8701b34649fce1e83e22a72dea60e5dd262b5", - "leave_on_shutdown": false, - "listen_multiaddress": [ - "/ip4/0.0.0.0/tcp/9096", - "/ip4/0.0.0.0/udp/9096/quic" - ], - "enable_relay_hop": true, - "connection_manager": { - "high_water": 400, - "low_water": 100, - "grace_period": "2m0s" + "cluster": { + "follower_mode": true, + "peername": "2020pb-follower", + "secret": "91125f520ba285491b9bd1b62e26f784f8fd9577e5348171382d7c72127168f2", + "leave_on_shutdown": false, + "listen_multiaddress": [ + "/ip4/0.0.0.0/tcp/9096", + "/ip4/0.0.0.0/udp/9096/quic" + ], + "enable_relay_hop": true, + "connection_manager": { + "high_water": 400, + "low_water": 100, + "grace_period": "2m0s" }, - "state_sync_interval": "5m0s", - "pin_recover_interval": "12m0s", - "replication_factor_min": -1, - "replication_factor_max": -1, - "monitor_ping_interval": "15s", - "peer_watch_interval": "5s", - "mdns_interval": "10s", - "disable_repinning": false, - "peer_addresses": [] + "state_sync_interval": "5m0s", + "pin_recover_interval": "12m0s", + "replication_factor_min": -1, + "replication_factor_max": -1, + "monitor_ping_interval": "15s", + "peer_watch_interval": "5s", + "mdns_interval": "10s", + "disable_repinning": false, + "peer_addresses": [ + "/ip4/207.6.222.55/tcp/9097/p2p/12D3KooWLREvKqLLefpzADz6tHW1kSEUsdGpq8jJkLtCbQ5Srauh" + ] + }, + "consensus": { + "crdt": { + "cluster_name": "2020pb-dataset", + "trusted_peers": [ + "12D3KooWLREvKqLLefpzADz6tHW1kSEUsdGpq8jJkLtCbQ5Srauh" + ] + } + }, - "consensus": { - "crdt": { - "cluster_name": "cord19-dataset", - "trusted_peers": [ - "12D3KooWPT1tJCyZ1zW35Zt7gMRYwTP2J3hmTpCV6a4m7SjmwNUo" - ] - } - }, - "api": { - "ipfsproxy": { - "listen_multiaddress": "/ip4/127.0.0.1/tcp/9095", - "node_multiaddress": "/ip4/127.0.0.1/tcp/5001", - "log_file": "", - "read_timeout": "0s", - "read_header_timeout": "5s", - "write_timeout": "0s", - "idle_timeout": "1m0s", - "max_header_bytes": 4096 - }, - "restapi": { - "http_listen_multiaddress": "/ip4/127.0.0.1/tcp/9094", - "read_timeout": "0s", - "read_header_timeout": "5s", - "write_timeout": "0s", - "idle_timeout": "2m0s", - "max_header_bytes": 4096, - "basic_auth_credentials": null, - "http_log_file": "", - "headers": {}, - "cors_allowed_origins": [ - "*" - ], - "cors_allowed_methods": [ - "GET" - ], - "cors_allowed_headers": [], - "cors_exposed_headers": [ - "Content-Type", - "X-Stream-Output", - "X-Chunked-Output", - "X-Content-Length" - ], - "cors_allow_credentials": true, - "cors_max_age": "0s" - } - }, - "ipfs_connector": { - "ipfshttp": { - "node_multiaddress": "/ip4/127.0.0.1/tcp/5001", - "connect_swarms_delay": "30s", - "ipfs_request_timeout": "5m0s", - "pin_timeout": "24h0m0s", - "unpin_timeout": "3h0m0s", - "repogc_timeout": "24h0m0s" - } - }, - "pin_tracker": { - "stateless": { - "concurrent_pins": 10 - } - }, - "monitor": { - "pubsubmon": { - "check_interval": "15s", - "failure_threshold": 3 - } - }, + "ipfs_connector": { + "ipfshttp": { + "node_multiaddress": "/ip4/127.0.0.1/tcp/5001", + "connect_swarms_delay": "30s", + "ipfs_request_timeout": "5m0s", + "pin_timeout": "24h0m0s", + "unpin_timeout": "3h0m0s", + "repogc_timeout": "24h0m0s" + } + }, + "pin_tracker": { + "stateless": { + "concurrent_pins": 10 + } + }, + "monitor": { + "pubsubmon": { + "check_interval": "15s", + "failure_threshold": 3 + } + }, + 
"informer": { - "disk": { - "metric_ttl": "30s", - "metric_type": "freespace" - } - }, - "observations": { - "metrics": { - "enable_stats": false, - "prometheus_endpoint": "/ip4/127.0.0.1/tcp/8888", - "reporting_interval": "2s" - }, - "tracing": { - "enable_tracing": false, - "jaeger_agent_endpoint": "/ip4/0.0.0.0/udp/6831", - "sampling_prob": 0.3, - "service_name": "cluster-daemon" - } + "state_sync_interval": "5m0s", + "pin_recover_interval": "12m0s", + "replication_factor_min": -1, + "replication_factor_max": -1, + "monitor_ping_interval": "15s", + "peer_watch_interval": "5s", + "mdns_interval": "10s", + "disable_repinning": false, + "peer_addresses": [ + "/ip4/207.6.222.55/tcp/9097/p2p/12D3KooWLREvKqLLefpzADz6tHW1kSEUsdGpq8jJkLtCbQ5Srauh" + ] + }, + "consensus": { + "crdt": { + "cluster_name": "2020pb-dataset", + "trusted_peers": [ + "12D3KooWLREvKqLLefpzADz6tHW1kSEUsdGpq8jJkLtCbQ5Srauh" + ] + } + }, + "ipfs_connector": { + "ipfshttp": { + "node_multiaddress": "/ip4/127.0.0.1/tcp/5001", + "connect_swarms_delay": "30s", + "ipfs_request_timeout": "5m0s", + "pin_timeout": "24h0m0s", + "unpin_timeout": "3h0m0s", + "repogc_timeout": "24h0m0s" + } + }, + "pin_tracker": { + "stateless": { + "concurrent_pins": 10 + } + }, + "monitor": { + "pubsubmon": { + "check_interval": "15s", + "failure_threshold": 3 + } + }, + "informer": { + "disk": { + "metric_ttl": "30s", + "metric_type": "freespace" + } + }, + "observations": { + "metrics": { + "enable_stats": false, + "prometheus_endpoint": "/ip4/127.0.0.1/tcp/8888", + "reporting_interval": "2s" }, - "datastore": { - "badger": { - "badger_options": { - "dir": "", - "value_dir": "", - "sync_writes": true, - "table_loading_mode": 2, - "value_log_loading_mode": 2, - "num_versions_to_keep": 1, - "max_table_size": 67108864, - "level_size_multiplier": 10, - "max_levels": 7, - "value_threshold": 32, - "num_memtables": 5, - "num_level_zero_tables": 5, - "num_level_zero_tables_stall": 10, - "level_one_size": 268435456, - "value_log_file_size": 1073741823, - "value_log_max_entries": 1000000, - "num_compactors": 2, - "compact_l_0_on_close": false, - "read_only": false, - "truncate": false - } + "tracing": { + "enable_tracing": false, + "jaeger_agent_endpoint": "/ip4/0.0.0.0/udp/6831", + "sampling_prob": 0.3, + "service_name": "cluster-daemon" + } + }, + "datastore": { + "badger": { + "badger_options": { + "dir": "", + "value_dir": "", + "sync_writes": true, + "table_loading_mode": 0, + "value_log_loading_mode": 0, + "num_versions_to_keep": 1, + "max_table_size": 67108864, + "level_size_multiplier": 10, + "max_levels": 7, + "value_threshold": 32, + "num_memtables": 5, + "num_level_zero_tables": 5, + "num_level_zero_tables_stall": 10, + "level_one_size": 268435456, + "value_log_file_size": 1073741823, + "value_log_max_entries": 1000000, + "num_compactors": 2, + "compact_l_0_on_close": false, + "read_only": false, + "truncate": false } } - } \ No newline at end of file + } +} \ No newline at end of file diff --git a/tools/replicator/configs/trusted_peer/service.json b/tools/replicator/configs/trusted_peer/service.json index e57e9a917..967079859 100644 --- a/tools/replicator/configs/trusted_peer/service.json +++ b/tools/replicator/configs/trusted_peer/service.json @@ -1,11 +1,11 @@ { "cluster": { - "peername": "2020pb-dataset", - "secret": "7ea8e27fabf85dda89da620d61d8701b34649fce1e83e22a72dea60e5dd262b5", + "peername": "2020pb-trusted-1", + "secret": "91125f520ba285491b9bd1b62e26f784f8fd9577e5348171382d7c72127168f2", "leave_on_shutdown": false, 
"listen_multiaddress": [ - "/ip4/0.0.0.0/tcp/9096", - "/ip4/0.0.0.0/udp/9096/quic" + "/ip4/0.0.0.0/tcp/9097", + "/ip4/0.0.0.0/udp/9097/quic" ], "enable_relay_hop": true, "connection_manager": { @@ -25,15 +25,15 @@ }, "consensus": { "crdt": { - "cluster_name": "cord19-dataset", + "cluster_name": "2020pb-dataset", "trusted_peers": [ - "12D3KooWPT1tJCyZ1zW35Zt7gMRYwTP2J3hmTpCV6a4m7SjmwNUo" + "12D3KooWLREvKqLLefpzADz6tHW1kSEUsdGpq8jJkLtCbQ5Srauh" ] } }, "api": { "ipfsproxy": { - "listen_multiaddress": "/ip4/127.0.0.1/tcp/9095", + "listen_multiaddress": "/ip4/127.0.0.1/tcp/9099", "node_multiaddress": "/ip4/127.0.0.1/tcp/5001", "log_file": "", "read_timeout": "0s", @@ -43,7 +43,7 @@ "max_header_bytes": 4096 }, "restapi": { - "http_listen_multiaddress": "/ip4/127.0.0.1/tcp/9094", + "http_listen_multiaddress": "/ip4/127.0.0.1/tcp/9098", "read_timeout": "0s", "read_header_timeout": "5s", "write_timeout": "0s", diff --git a/tools/replicator/install_cluster_linux.sh b/tools/replicator/scripts/install_cluster_linux.sh similarity index 92% rename from tools/replicator/install_cluster_linux.sh rename to tools/replicator/scripts/install_cluster_linux.sh index c56e89c71..fbf79877d 100755 --- a/tools/replicator/install_cluster_linux.sh +++ b/tools/replicator/scripts/install_cluster_linux.sh @@ -17,6 +17,8 @@ case "$1" in (cd ipfs-cluster-service && sudo cp ipfs-cluster-service /usr/local/bin) (cd ipfs-cluster-ctl && sudo cp ipfs-cluster-ctl /usr/local/bin) (cd ipfs-cluster-follow && sudo cp ipfs-cluster-follow /usr/local/bin) + rm *.tar.gz + rm -rf ipfs-cluster-service ipfs-cluster-ctl ipfs-cluster-follow ;; *) echo "unsupported os" From 23c2edc33d23a65610e0f885ca1e04a70806dc7a Mon Sep 17 00:00:00 2001 From: postables Date: Fri, 5 Jun 2020 15:53:26 -0700 Subject: [PATCH 03/18] update readme and make file --- tools/replicator/Makefile | 6 +++++- tools/replicator/README.md | 22 +++++++--------------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/tools/replicator/Makefile b/tools/replicator/Makefile index a36a548cc..eff447beb 100644 --- a/tools/replicator/Makefile +++ b/tools/replicator/Makefile @@ -1,4 +1,8 @@ .PHONY: cluster-first-start cluster-first-start: ipfs-cluster-service init --consensus crdt - ipfs-cluster-service daemon \ No newline at end of file + ipfs-cluster-service daemon + +.PHONY: follow-public-cluster +follow-public-cluster: + ipfs-cluster-follow 2020pb-dataset run --init 2020pb.temporal.cloud \ No newline at end of file diff --git a/tools/replicator/README.md b/tools/replicator/README.md index 033ab6871..b4778541c 100644 --- a/tools/replicator/README.md +++ b/tools/replicator/README.md @@ -21,24 +21,16 @@ $> install_cluster_linux.sh linux-64bit # usage -This folder contains the needed files and configurations for anyone to start a follow peer, or run their own follow cluster acting as a trusted peer. The trusted peer setup is a little more difficult, and requires running both go-ipfs and ipfs-cluster. +This folder contains the needed files and configurations for anyone to start a follow peer, or run their own follow cluster acting as a trusted peer. The trusted peer setup is a little more difficult, and requires running both go-ipfs and ipfs-cluster. 
Before running any of these steps, make sure you have the following software installed on your local machine: + +* go-ipfs +* ipfs-cluster-follow (only if running a follow peer) +* ipfs-cluster-service (only if running a trusted peer) +* ipfs-cluster-ctl (only if running a trusted peer) + ## trusted peer - -> [for more information click this link](https://cluster.ipfs.io/documentation/collaborative/setup/) - -If using a fresh install of IPFS Cluster, you will want to run the following two commands, which initialize the configuration and start the daemon for the first time. This will print some important information that you should note, namely: - - * Generated cluster secret - * Peer ID to use as a trusted peer - * The multiaddress on which it will be reachable by others - -First ensure that you have a valid go-ipfs instance up and running on the machine you are going to use, and run the following two commands: - -```shell -$> ipfs-cluster-service init --consensus crdt -$> ipfs-cluster-service daemon -``` +Trusted peer setup is a bit of an annoying task, and only needs to be done if you are interested in running your own cluster. If so, make sure to read the instructions [provided by the ipfs cluster team](https://cluster.ipfs.io/documentation/collaborative/setup/). ## follow peer From 25ac12f180dfb7f95da8e18ca97c73a4638fb9ea Mon Sep 17 00:00:00 2001 From: postables Date: Fri, 5 Jun 2020 16:17:59 -0700 Subject: [PATCH 04/18] tools/replicator: slight wording change to readme --- tools/replicator/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/replicator/README.md b/tools/replicator/README.md index b4778541c..9d5e9931f 100644 --- a/tools/replicator/README.md +++ b/tools/replicator/README.md @@ -2,7 +2,7 @@ > **WARNING**: If you value your privacy and anonymity, do not participate in this cluster. Participating in this cluster, **even** from behind an anonymization tool like Tor or a VPN, will likely lead to your real-life identity being revealed, as IPFS is incredibly self-doxxing. If you want to participate in this cluster while preserving your privacy and anonymity, do so from a cloud-based VPS, ideally paid for with anonymous cryptocurrency. -The `replicator` tool allows anyone to easily mirror a public set of data on IPFS. It consists of spinning up a lightweight IPFS node, along with a lightweight IPFS Cluster client that follows a CRDT topic published to by a set of trusted peers. The trusted peers are responsible for updating the "follow list", a set of IPFS CIDs that are replicated by the cluster. Anyone following this cluster will pin the data in the follow list locally. +The `replicator` tool allows anyone to easily mirror a public set of data on IPFS. It consists of spinning up an IPFS node, along with a lightweight IPFS Cluster follower client that follows a CRDT topic published to by a set of trusted peers. The trusted peers are responsible for updating the "follow list", a set of IPFS CIDs that are replicated by the cluster. Anyone following this cluster will pin the data in the follow list locally.
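To give a sense of what a trusted peer does when it extends the follow list, here is a hedged Go sketch. It assumes the ipfs-cluster REST API's `POST /pins/{cid}` route and the `127.0.0.1:9098` listen address taken from the trusted peer config in this folder; the CID is a placeholder, and a production setup would also need the API's authentication in place.

```go
package main

import (
	"fmt"
	"log"
	"net/http"
)

func main() {
	// placeholder CID; a trusted peer would pin real dataset CIDs
	cid := "QmYwAPJzv5CZsnA625s3Xf2nemtYgPpHdWEz79ojWnPbdG"
	// the trusted peer config in this folder exposes the cluster REST API on 127.0.0.1:9098
	resp, err := http.Post("http://127.0.0.1:9098/pins/"+cid, "application/json", nil)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	// once pinned, the CRDT layer broadcasts the update to every follower
	fmt.Println("cluster pin status:", resp.Status)
}
```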
# install From d7f8ae7809e413d969967bb149c019a62be8abbf Mon Sep 17 00:00:00 2001 From: postables Date: Fri, 5 Jun 2020 16:35:28 -0700 Subject: [PATCH 05/18] tools/downloader/pkg: fix bug that prevented downloading anything --- tools/downloader/pkg/downloader.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/downloader/pkg/downloader.go b/tools/downloader/pkg/downloader.go index 52174c6a4..f941414be 100644 --- a/tools/downloader/pkg/downloader.go +++ b/tools/downloader/pkg/downloader.go @@ -71,7 +71,10 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error { wg = &sync.WaitGroup{} reader = csv.NewReader(resp.Body) ) - for i := 0; maxDownloads != 0 && i < maxDownloads; i++ { + for i := 0; ; i++ { + if maxDownloads != 0 && i >= maxDownloads { + break + } // read the next record record, err := reader.Read() if err != nil && err != io.EOF { From 76755c24d1413d5d9f5a48bb94967c3c5516cea8 Mon Sep 17 00:00:00 2001 From: postables Date: Fri, 5 Jun 2020 18:56:07 -0700 Subject: [PATCH 06/18] tools/downloader: add explainer about max download check --- tools/downloader/pkg/downloader.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/downloader/pkg/downloader.go b/tools/downloader/pkg/downloader.go index f941414be..10a23e596 100644 --- a/tools/downloader/pkg/downloader.go +++ b/tools/downloader/pkg/downloader.go @@ -72,7 +72,10 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error { reader = csv.NewReader(resp.Body) ) for i := 0; ; i++ { - if maxDownloads != 0 && i >= maxDownloads { + // the first read from the CSV file will be the header + // so we need to make sure that we factor that in when + // counting max downloads + if maxDownloads != 0 && i >= maxDownloads+1 { break } // read the next record From a0d816cba457110f95c19b8994653336c94da402 Mon Sep 17 00:00:00 2001 From: postables Date: Fri, 5 Jun 2020 19:00:24 -0700 Subject: [PATCH 07/18] tools/downloader: bugfix for file fragments, and add test --- tools/downloader/pkg/downloader.go | 15 +++++++++++ tools/downloader/pkg/downloader_test.go | 36 +++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 tools/downloader/pkg/downloader_test.go diff --git a/tools/downloader/pkg/downloader.go b/tools/downloader/pkg/downloader.go index 10a23e596..5ef8a26dd 100644 --- a/tools/downloader/pkg/downloader.go +++ b/tools/downloader/pkg/downloader.go @@ -5,9 +5,11 @@ import ( "encoding/csv" "fmt" "io" + "io/ioutil" "net/http" "os" "os/exec" + "strings" "sync" "time" @@ -128,6 +130,19 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error { } // wait for pending download operations to finish wg.Wait() + // read download dir to check for any file artifacts + infos, err := ioutil.ReadDir(d.path) + if err != nil { + return err + } + for _, info := range infos { + // this was an incorrectly downloaded piece of data, remove it + if strings.HasSuffix(info.Name(), ".part") { + if err := os.Remove(d.path + "/" + info.Name()); err != nil { + d.logger.Error("failed to remove file part", zap.String("file", info.Name()), zap.Error(err)) + } + } + } // open csv file to store mappings fh, err := os.Create("name_mapping.csv") if err != nil { diff --git a/tools/downloader/pkg/downloader_test.go b/tools/downloader/pkg/downloader_test.go new file mode 100644 index 000000000..e24e514fd --- /dev/null +++ b/tools/downloader/pkg/downloader_test.go @@ -0,0 +1,36 @@ +package pkg + +import ( + "io/ioutil" + "os" + "strings" + "testing" + "time" +) + 
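+// TestDownloader runs the downloader end to end with a small row budget, pre-seeding a stray +// .part fragment to verify that the cleanup pass added above removes leftover artifacts.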
+func TestDownloader(t *testing.T) { + var ( + logFile = "test.log" + path = "testdir" + ) + t.Cleanup(func() { + os.RemoveAll(path) + os.Remove(logFile) + }) + dl := New(logFile, path, 1) + if _, err := os.Create(path + "/thisisatestfilethatweareusingtotestremovaloffileswith.part"); err != nil { + t.Fatal(err) + } + if err := dl.Run(time.Minute, 2); err != nil { + t.Fatal(err) + } + infos, err := ioutil.ReadDir(path) + if err != nil { + t.Fatal(err) + } + for _, info := range infos { + if strings.HasSuffix(info.Name(), ".part") { + t.Fatal("shouldn't have found .part file") + } + } +} From cab99ed5b0e76bbfbea5ce106802e63d43dad1ad Mon Sep 17 00:00:00 2001 From: postables Date: Sat, 6 Jun 2020 19:26:49 -0700 Subject: [PATCH 08/18] tools/downloader/pkg: handle new csv column name --- tools/downloader/pkg/downloader.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/downloader/pkg/downloader.go b/tools/downloader/pkg/downloader.go index 5ef8a26dd..6641f6bf3 100644 --- a/tools/downloader/pkg/downloader.go +++ b/tools/downloader/pkg/downloader.go @@ -22,8 +22,8 @@ import ( const ( /* rows of csv file for easy reference - 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 - state,edit_at,city,name,date,date_text,Link 1,Link 2,Link 3,Link 4,Link 5,Link 6,Link 7,Link 8 + 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 + state,edit_at,city,name,date,date_text,id,Link 1,Link 2,Link 3,Link 4,Link 5,Link 6,Link 7,Link 8 */ url = "https://raw.githubusercontent.com/2020PB/police-brutality/data_build/all-locations.csv" ) @@ -89,8 +89,8 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error { break } // skip the first row as it contains column names OR - // skip if the row has less than 7 elements as the 7th element is the start of the video links - if i == 0 || len(record) < 7 { + // skip if the row has less than 8 elements as the 8th element is the start of the video links + if i == 0 || len(record) < 8 { continue } wg.Add(1) From d993a6eacb859cc716a193421d6ca84795e87d4f Mon Sep 17 00:00:00 2001 From: postables Date: Sat, 6 Jun 2020 19:27:14 -0700 Subject: [PATCH 09/18] tools/downloader/pkg: correctly start at link index --- tools/downloader/pkg/downloader.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/downloader/pkg/downloader.go b/tools/downloader/pkg/downloader.go index 6641f6bf3..71bc5c9f3 100644 --- a/tools/downloader/pkg/downloader.go +++ b/tools/downloader/pkg/downloader.go @@ -98,7 +98,7 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error { defer wg.Done() // gets the last column so we don't get an out of range panic max := len(record) - 1 - for ii := 6; ii < max; ii++ { + for ii := 7; ii < max; ii++ { // this column is empty, and has no data if record[ii] == "" { continue } From 7d9c7bbf1ab2dd555f2739fc26131219d8904ce6 Mon Sep 17 00:00:00 2001 From: postables Date: Sat, 6 Jun 2020 23:21:36 -0700 Subject: [PATCH 10/18] tools/downloader: enable naming files with pbid, and update name mapping --- tools/downloader/pkg/downloader.go | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/tools/downloader/pkg/downloader.go b/tools/downloader/pkg/downloader.go index 71bc5c9f3..6b702a5d9 100644 --- a/tools/downloader/pkg/downloader.go +++ b/tools/downloader/pkg/downloader.go @@ -67,6 +67,7 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error { results []struct { name string link string + pbid string count int64 } mux = 
&sync.Mutex{} @@ -96,6 +97,7 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error { wg.Add(1) d.wp.Submit(func() { defer wg.Done() + pbid := record[6] // gets the last column so we don't get an out of range panic max := len(record) - 1 @@ -106,7 +108,7 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error { count := d.count.Inc() d.logger.Info("downloading video", zap.String("name", record[3]), zap.String("url", record[ii])) download := func() error { - cmd := exec.Command("youtube-dl", "-o", d.getName(count), record[ii]) + cmd := exec.Command("youtube-dl", "-o", d.getName(pbid, count), record[ii]) return d.runCommand(cmd, timeout) } if err := download(); err != nil { @@ -117,10 +119,12 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error { results = append(results, struct { name string link string + pbid string count int64 }{ name: record[3], link: record[ii], + pbid: pbid, count: count, }) mux.Unlock() @@ -144,17 +148,17 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error { } } // open csv file to store mappings - fh, err := os.Create("name_mapping.csv") + fh, err := os.Create(d.path + "/name_mapping.csv") if err != nil { return err } writer := csv.NewWriter(fh) // write the csv file headers - writer.Write([]string{"name", "link", "unique_video_number"}) + writer.Write([]string{"name", "link", "pbid", "link_number"}) mux.Lock() // iterate over all results and add to csv for _, v := range results { - writer.Write([]string{v.name, v.link, fmt.Sprint(v.count)}) + writer.Write([]string{v.name, v.link, v.pbid, fmt.Sprint(v.count)}) } mux.Unlock() // flush csv, writing to disk @@ -188,6 +192,10 @@ func (d *Downloader) runCommand(cmd *exec.Cmd, timeout time.Duration) error { } // uses an atomically increasing counter to prevent any possible chance of filename conflicts when running many concurrent downloaders -func (d *Downloader) getName(count int64) string { - return d.path + "/%(id)s." + fmt.Sprint(count) + ".%(ext)s" +func (d *Downloader) getName(id string, count int64) string { + // fallback to youtube id + if id == "" { + id = "%(id)s" + } + return d.path + "/" + id + "." + fmt.Sprint(count) + ".%(ext)s" } From bb66c42aee3f5c6ad1c46c87376a604c6d4a07f0 Mon Sep 17 00:00:00 2001 From: postables Date: Sat, 6 Jun 2020 23:25:58 -0700 Subject: [PATCH 11/18] tools/downloader: readme update --- tools/downloader/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/downloader/README.md b/tools/downloader/README.md index 7acc4e1b0..bd4ec82b2 100644 --- a/tools/downloader/README.md +++ b/tools/downloader/README.md @@ -1,11 +1,11 @@ # downloader -`downloader` is a CLI tool that allows parsing over the all-locations csv file, and downloading all the videos referenced in the CSV file, and can concurrently download multiple videos. Because some of the videos have file names that are longer than the maximum permitted characters in a file path, the videos are not saved under their name, but instead using the videoID as determined by youtube-dl, along with a unique number. This information is then stored in a final CSV file which contains the video name, the link used to download video, as well as the unique number so you can easily determine what video belongs to what incident. 
Additionally it allows uploading the video data to an IPFS HTTP API endpoint +`downloader` is a CLI tool that allows parsing over the all-locations csv file, and downloading all the videos referenced in the CSV file, and can concurrently download multiple videos. Because some of the videos have file names that are longer than the maximum permitted characters in a file path, the videos are not saved under their name, but instead using a combination of their corresponding pb-id, the link number, and their extension. For posterity's sake, there is a file called `name_mapping.csv` stored in the directory containing the video, which maps the name, link, pbid, and link number. -The template for names of videos saved on disk is `[YOUTUBE-DL-VIDEO-ID].[UNIQUE-VIDEO-NUMBER].[EXTENSION]`, and the CSV file has the rows `name,link,unique_video_number`. So for example we have the following entry in the CSV file `Law enforcement gas a crowd chanting “we want peace” right after exiting the building.,https://twitter.com/courtenay_roche/status/1267653137969623040,1`, and two files we have downloaded: +The template for names of videos saved on disk is `[PB-ID].[LINK-NUMBER].[EXTENSION]`, and the CSV file has the rows `name,link,pbid,link_number`. So for example, we have the following entry in the CSV file `Law enforcement gas a crowd chanting “we want peace” right after exiting the building.,https://twitter.com/courtenay_roche/status/1267653137969623040,1`, and two files we have downloaded: -* `1267647898365427714.2.mp4` -* `1267653137969623040.1.mp4` +* `ar-bentonville-1.2.mp4` +* `ar-bentonville-1.1.mp4` Given the row in the CSV file, the corresponding video would be `ar-bentonville-1.1.mp4`. From 4483ea4e67d11cb8eefc63ffe857771b8a312c82 Mon Sep 17 00:00:00 2001 From: postables Date: Sat, 6 Jun 2020 23:29:39 -0700 Subject: [PATCH 12/18] tools/downloader: fix bug with a bad count --- tools/downloader/pkg/downloader.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/downloader/pkg/downloader.go b/tools/downloader/pkg/downloader.go index 6b702a5d9..1dcb62a5c 100644 --- a/tools/downloader/pkg/downloader.go +++ b/tools/downloader/pkg/downloader.go @@ -16,7 +16,6 @@ import ( "github.com/panjf2000/ants/v2" "github.com/pkg/errors" "go.bobheadxi.dev/zapx/zapx" - "go.uber.org/atomic" "go.uber.org/zap" ) @@ -33,8 +32,7 @@ type Downloader struct { path string logger *zap.Logger // enables running concurrent downloads - wp *ants.Pool - count *atomic.Int64 + wp *ants.Pool } // New returns a new downloader @@ -52,7 +50,7 @@ func New(logFile, path string, concurrency int) *Downloader { if err != nil { panic(err) } - return &Downloader{path, logger, wp, atomic.NewInt64(0)} + return &Downloader{path, logger, wp} } // Run starts the download process, note that maxDownloads doesn't necessarily equate to number of videos @@ -100,12 +98,13 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error { pbid := record[6] // gets the last column so we don't get an out of range panic max := len(record) - 1 + var count int64 = 0 for ii := 7; ii < max; ii++ { + count++ // this column is empty, and has no data if record[ii] == "" { continue } - count := d.count.Inc() d.logger.Info("downloading video", zap.String("name", record[3]), zap.String("url", record[ii])) download := func() error { cmd := exec.Command("youtube-dl", "-o", d.getName(pbid, count), record[ii]) From 36def3926da50ddd1743a8aa03e1399fa5889588 Mon Sep 17 00:00:00 2001 From: postables Date: Sun, 7 Jun 2020 00:48:28
-0700 Subject: [PATCH 13/18] tools/downloader: prevent redownloads, and name mapping overwrite --- tools/downloader/README.md | 2 +- tools/downloader/pkg/downloader.go | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tools/downloader/README.md b/tools/downloader/README.md index bd4ec82b2..728def198 100644 --- a/tools/downloader/README.md +++ b/tools/downloader/README.md @@ -1,6 +1,6 @@ # downloader -`downloader` is a CLI tool that allows parsing over the all-locations csv file, and downloading all the videos referenced in the CSV file, and can concurrently download multiple videos. Because some of the videos have file names that are longer than the maximum permitted characters in a file path, the videos are not saved under their name, but instead using a combination of their corresponding pb-id, the link number, and their extension. For posterity's sake, there is a file called `name_mapping.csv` stored in the directory containing the video, which maps the name, link, pbid, and link number. +`downloader` is a CLI tool that allows parsing over the all-locations csv file, and downloading all the videos referenced in the CSV file, and can concurrently download multiple videos. Because some of the videos have file names that are longer than the maximum permitted characters in a file path, the videos are not saved under their name, but instead using a combination of their corresponding pb-id, the link number, and their extension. For posterity's sake, there is a file called `name_mapping.csv` stored in the directory containing the video, which maps the name, link, pbid, and link number. It will not redownload any previously backed-up data. The template for names of videos saved on disk is `[PB-ID].[LINK-NUMBER].[EXTENSION]`, and the CSV file has the rows `name,link,pbid,link_number`.
So for example, we have the following entry in the CSV file `Law enforcement gas a crowd chanting “we want peace” right after exiting the building.,https://twitter.com/courtenay_roche/status/1267653137969623040,1`, and two files we have downloaded: diff --git a/tools/downloader/pkg/downloader.go b/tools/downloader/pkg/downloader.go index 1dcb62a5c..3070fbbf7 100644 --- a/tools/downloader/pkg/downloader.go +++ b/tools/downloader/pkg/downloader.go @@ -105,6 +105,11 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error { if record[ii] == "" { continue } + // if the file already exists, don't redownload + _, err := os.Stat(d.getName(pbid, count)) + if os.IsExist(err) { + continue + } d.logger.Info("downloading video", zap.String("name", record[3]), zap.String("url", record[ii])) download := func() error { cmd := exec.Command("youtube-dl", "-o", d.getName(pbid, count), record[ii]) @@ -146,6 +151,13 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error { } } } + if data, err := ioutil.ReadFile(d.path + "/name_mapping.csv"); err != nil { + d.logger.Error("failed to read previous name mapping file, likely doesn't exist", zap.Error(err)) + } else { + if len(data) > 0 { + ioutil.WriteFile(fmt.Sprintf("%s/name_mapping-%v.csv", d.path, time.Now().UnixNano()), data, os.FileMode(0640)) + } + } // open csv file to store mappings fh, err := os.Create(d.path + "/name_mapping.csv") if err != nil { From 81e978d88b88664e05d6719d9d0951798b787084 Mon Sep 17 00:00:00 2001 From: postables Date: Sun, 7 Jun 2020 01:25:23 -0700 Subject: [PATCH 14/18] tools/downloader: optimize csv update --- tools/downloader/pkg/downloader.go | 39 +++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/tools/downloader/pkg/downloader.go b/tools/downloader/pkg/downloader.go index 3070fbbf7..672a5cf8a 100644 --- a/tools/downloader/pkg/downloader.go +++ b/tools/downloader/pkg/downloader.go @@ -151,6 +151,7 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error { } } } + // backup the previous csv if it exists for posterity if data, err := ioutil.ReadFile(d.path + "/name_mapping.csv"); err != nil { d.logger.Error("failed to read previous name mapping file, likely doesn't exist", zap.Error(err)) } else { if len(data) > 0 { ioutil.WriteFile(fmt.Sprintf("%s/name_mapping-%v.csv", d.path, time.Now().UnixNano()), data, os.FileMode(0640)) } } - // open csv file to store mappings - fh, err := os.Create(d.path + "/name_mapping.csv") - if err != nil { - return err + // add the headers to write to the csv + records := [][]string{{"name", "link", "pbid", "link_number"}} + if prev, err := os.Open(d.path + "/name_mapping.csv"); err == nil { + // a previous mapping exists, drop our headers as they will be read back in + records = [][]string{} + reader := csv.NewReader(prev) + for { + record, err := reader.Read() + if err != nil { + // io.EOF or a malformed row, either way stop reading + break + } + records = append(records, record) + } + prev.Close() + } + // open (and truncate) the csv file so the merged mappings are rewritten from the start + fh, err := os.Create(d.path + "/name_mapping.csv") + if err != nil { + return err } writer := csv.NewWriter(fh) - // write the csv file headers - writer.Write([]string{"name", "link", "pbid", "link_number"}) + // write the previous csv file to disk + // if no
previous mapping exists, this will just write the headers + for _, record := range records { + writer.Write(record) + } mux.Lock() // iterate over all results and add to csv for _, v := range results { From 314330cdcde898fb474e4757e67142d37f25dcdd Mon Sep 17 00:00:00 2001 From: postables Date: Sun, 7 Jun 2020 01:34:26 -0700 Subject: [PATCH 15/18] tools/downloader: add basic screenshotter --- tools/downloader/go.mod | 3 +- tools/downloader/go.sum | 16 +++++ tools/downloader/pkg/screenshotter.go | 97 +++++++++++++++++++++++++++ 3 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 tools/downloader/pkg/screenshotter.go diff --git a/tools/downloader/go.mod b/tools/downloader/go.mod index 5de6a785f..1f69c9737 100644 --- a/tools/downloader/go.mod +++ b/tools/downloader/go.mod @@ -4,10 +4,11 @@ go 1.14 require ( github.com/RTradeLtd/go-ipfs-api/v3 v3.0.0 + github.com/chromedp/cdproto v0.0.0-20200116234248-4da64dd111ac + github.com/chromedp/chromedp v0.5.3 github.com/panjf2000/ants/v2 v2.4.1 github.com/pkg/errors v0.8.1 github.com/urfave/cli/v2 v2.2.0 go.bobheadxi.dev/zapx/zapx v0.6.8 - go.uber.org/atomic v1.6.0 go.uber.org/zap v1.15.0 ) diff --git a/tools/downloader/go.sum b/tools/downloader/go.sum index 4300ed44f..94bc0a5ec 100644 --- a/tools/downloader/go.sum +++ b/tools/downloader/go.sum @@ -14,6 +14,10 @@ github.com/btcsuite/websocket v0.0.0-20150119174127-31079b680792/go.mod h1:ghJtE github.com/btcsuite/winsvc v1.0.0/go.mod h1:jsenWakMcC0zFBFurPLEAyrnc/teJEM1O46fmI40EZs= github.com/cheekybits/is v0.0.0-20150225183255-68e9c0620927 h1:SKI1/fuSdodxmNNyVBR8d7X/HuLnRpvvFO0AgyQk764= github.com/cheekybits/is v0.0.0-20150225183255-68e9c0620927/go.mod h1:h/aW8ynjgkuj+NQRlZcDbAbM1ORAbXjXX77sX7T289U= +github.com/chromedp/cdproto v0.0.0-20200116234248-4da64dd111ac h1:T7V5BXqnYd55Hj/g5uhDYumg9Fp3rMTS6bykYtTIFX4= +github.com/chromedp/cdproto v0.0.0-20200116234248-4da64dd111ac/go.mod h1:PfAWWKJqjlGFYJEidUM6aVIWPr0EpobeyVWEEmplX7g= +github.com/chromedp/chromedp v0.5.3 h1:F9LafxmYpsQhWQBdCs+6Sret1zzeeFyHS5LkRF//Ffg= +github.com/chromedp/chromedp v0.5.3/go.mod h1:YLdPtndaHQ4rCpSpBG+IPpy9JvX0VD+7aaLxYgYj28w= github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= @@ -24,6 +28,12 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= +github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah2I+gwHF8xmJWRjooY+5248k6m4A0= +github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee/go.mod h1:L0fX3K22YWvt/FAX9NnzrNzcI4wNYi9Yku4O0LKYflo= +github.com/gobwas/pool v0.2.0 h1:QEmUOlnSjWtnpRGHF3SauEiOsy82Cup83Vf2LcMlnc8= +github.com/gobwas/pool v0.2.0/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= +github.com/gobwas/ws v1.0.2 h1:CoAavW/wd/kulfZmSIBt6p24n4j7tHgNVCjsfHVNUbo= +github.com/gobwas/ws v1.0.2/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM= github.com/gogo/protobuf v1.2.1 h1:/s5zKNz0uPFCZ5hddgPdo2TK2TVrUNMn0OOX8/aZMTE= github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= 
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -40,6 +50,8 @@ github.com/jrick/logrotate v1.0.0/go.mod h1:LNinyqDIJnpAur+b8yyulnQw/wDuN1+BYKlT github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/kkdai/bstream v0.0.0-20161212061736-f391b8402d23/go.mod h1:J+Gs4SYgM6CZQHDETBtE9HaSEkGmuNXF86RwHhHUvq4= +github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08 h1:V0an7KRw92wmJysvFvtqtKMAPmvS5O0jtB0nYo6t+gs= +github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08/go.mod h1:dFWs1zEqDjFtnBXsd1vPOZaLsESovai349994nHx3e0= github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= @@ -49,6 +61,8 @@ github.com/libp2p/go-flow-metrics v0.0.1 h1:0gxuFd2GuK7IIP5pKljLwps6TvcuYgvG7Atq github.com/libp2p/go-flow-metrics v0.0.1/go.mod h1:Iv1GH0sG8DtYN3SVJ2eG221wMiNpZxBdp967ls1g+k8= github.com/libp2p/go-libp2p-core v0.0.1 h1:HSTZtFIq/W5Ue43Zw+uWZyy2Vl5WtF0zDjKN8/DT/1I= github.com/libp2p/go-libp2p-core v0.0.1/go.mod h1:g/VxnTZ/1ygHxH3dKok7Vno1VfpvGcGip57wjTU4fco= +github.com/mailru/easyjson v0.7.0 h1:aizVhC/NAAcKWb+5QsU1iNOZb4Yws5UO2I+aIprQITM= +github.com/mailru/easyjson v0.7.0/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs= github.com/minio/blake2b-simd v0.0.0-20160723061019-3f5f724cb5b1 h1:lYpkrQH5ajf0OXOcUbGjvZxxijuBwbbmlSxLiuofa+g= github.com/minio/blake2b-simd v0.0.0-20160723061019-3f5f724cb5b1/go.mod h1:pD8RvIylQ358TN4wwqatJ8rNavkEINozVn9DtGI3dfQ= github.com/minio/sha256-simd v0.0.0-20190131020904-2d45a736cd16/go.mod h1:2FMWW+8GMoPweT6+pI63m9YE3Lmw4J71hV56Chs1E/U= @@ -142,6 +156,8 @@ golang.org/x/sys v0.0.0-20190219092855-153ac476189d/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190302025703-b6889370fb10/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200116001909-b77594299b42 h1:vEOn+mP2zCOVzKckCZy6YsCtDblrpj/w7B9nxGNELpg= +golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= diff --git a/tools/downloader/pkg/screenshotter.go b/tools/downloader/pkg/screenshotter.go new file mode 100644 index 000000000..41fb5a0cd --- /dev/null +++ b/tools/downloader/pkg/screenshotter.go @@ -0,0 +1,97 @@ +// Command screenshot is a chromedp example demonstrating how to take a +// screenshot of a specific element and of the entire browser viewport. 
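+// Within this tool it is used so the downloader can optionally archive the web page
+// a video came from alongside the video itself.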
+ +package pkg + +import ( + "context" + "io/ioutil" + "log" + "math" + + "github.com/chromedp/cdproto/emulation" + "github.com/chromedp/cdproto/page" + "github.com/chromedp/chromedp" +) + +/* +copied from and modified from https://github.com/chromedp/examples/blob/master/screenshot/main.go +*/ + +func capture(url string, name string) { + // create context + ctx, cancel := chromedp.NewContext(context.Background()) + defer cancel() + + // capture screenshot of an element + var buf []byte + if err := chromedp.Run(ctx, elementScreenshot(url, `#main`, &buf)); err != nil { + log.Fatal(err) + } + if err := ioutil.WriteFile(name+"-elementScreenshot.png", buf, 0644); err != nil { + log.Fatal(err) + } + + // capture entire browser viewport, returning png with quality=90 + if err := chromedp.Run(ctx, fullScreenshot(url, 10, &buf)); err != nil { + log.Fatal(err) + } + if err := ioutil.WriteFile(name+"-fullScreenshot.png", buf, 0644); err != nil { + log.Fatal(err) + } +} + +// elementScreenshot takes a screenshot of a specific element. +func elementScreenshot(urlstr, sel string, res *[]byte) chromedp.Tasks { + return chromedp.Tasks{ + chromedp.Navigate(urlstr), + chromedp.WaitVisible(sel, chromedp.ByID), + chromedp.Screenshot(sel, res, chromedp.NodeVisible, chromedp.ByID), + } +} + +// fullScreenshot takes a screenshot of the entire browser viewport. +// +// Liberally copied from puppeteer's source. +// +// Note: this will override the viewport emulation settings. +func fullScreenshot(urlstr string, quality int64, res *[]byte) chromedp.Tasks { + return chromedp.Tasks{ + chromedp.Navigate(urlstr), + chromedp.ActionFunc(func(ctx context.Context) error { + // get layout metrics + _, _, contentSize, err := page.GetLayoutMetrics().Do(ctx) + if err != nil { + return err + } + + width, height := int64(math.Ceil(contentSize.Width)), int64(math.Ceil(contentSize.Height)) + + // force viewport emulation + err = emulation.SetDeviceMetricsOverride(width, height, 1, false). + WithScreenOrientation(&emulation.ScreenOrientation{ + Type: emulation.OrientationTypePortraitPrimary, + Angle: 0, + }). + Do(ctx) + if err != nil { + return err + } + + // capture screenshot + *res, err = page.CaptureScreenshot(). + WithQuality(quality). 
+ WithClip(&page.Viewport{ + X: contentSize.X, + Y: contentSize.Y, + Width: contentSize.Width, + Height: contentSize.Height, + Scale: 1, + }).Do(ctx) + if err != nil { + return err + } + return nil + }), + } +} From 6bc9a25c0c1b0b2f5f873bd5a62c2bd428a1d1ee Mon Sep 17 00:00:00 2001 From: postables Date: Sun, 7 Jun 2020 01:45:02 -0700 Subject: [PATCH 16/18] tools/downloader: prevent failed capture from causing a failure --- tools/downloader/main.go | 7 ++++++- tools/downloader/pkg/downloader.go | 9 ++++++++- tools/downloader/pkg/downloader_test.go | 2 +- tools/downloader/pkg/screenshotter.go | 12 ++++++------ 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/tools/downloader/main.go b/tools/downloader/main.go index 2cec14418..fdd663702 100644 --- a/tools/downloader/main.go +++ b/tools/downloader/main.go @@ -19,7 +19,7 @@ func main() { Usage: "starts the downloader", Action: func(c *cli.Context) error { dl := New(c.String("log.file"), c.String("directory"), c.Int("concurrency")) - if err := dl.Run(c.Duration("timeout"), c.Int("max.downloads")); err != nil { + if err := dl.Run(c.Bool("capture.screenshot"), c.Duration("timeout"), c.Int("max.downloads")); err != nil { return err } if c.Bool("upload.to_ipfs") { @@ -84,6 +84,11 @@ func main() { Usage: "enables uploading the video data to any ipfs endpoint", Value: false, }, + &cli.BoolFlag{ + Name: "capture.screenshot", + Aliases: []string{"cs"}, + Usage: "enables optional capturing of the webpage we download media from for additional archiving", + }, }, }, } diff --git a/tools/downloader/pkg/downloader.go b/tools/downloader/pkg/downloader.go index 672a5cf8a..18e92c861 100644 --- a/tools/downloader/pkg/downloader.go +++ b/tools/downloader/pkg/downloader.go @@ -55,7 +55,7 @@ func New(logFile, path string, concurrency int) *Downloader { // Run starts the download process, note that maxDownloads doesn't necessarily equate to number of videos // it really means the maximum number of entries in the csv to download, and some entries in the csv may have more than 1 associated video -func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error { +func (d *Downloader) Run(takeScreenshots bool, timeout time.Duration, maxDownloads int) error { resp, err := http.Get(url) if err != nil { return err @@ -133,6 +133,13 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error { }) mux.Unlock() } + // capture a screenshot of the source page if specified + // TODO(bonedaddy): enable adding this to the csv, for now it exists alongside everything else + if takeScreenshots { + if err := capture(record[ii], d.getName(pbid, count)); err != nil { + d.logger.Error("failed to capture screenshot", zap.Error(err), zap.String("url", record[ii])) + } + } } }) } diff --git a/tools/downloader/pkg/downloader_test.go b/tools/downloader/pkg/downloader_test.go index e24e514fd..765ef1f08 100644 --- a/tools/downloader/pkg/downloader_test.go +++ b/tools/downloader/pkg/downloader_test.go @@ -21,7 +21,7 @@ func TestDownloader(t *testing.T) { if _, err := os.Create(path + "/thisisatestfilethatweareusingtotestremovaloffileswith.part"); err != nil { t.Fatal(err) } - if err := dl.Run(time.Minute, 2); err != nil { + if err := dl.Run(false, time.Minute, 2); err != nil { t.Fatal(err) } infos, err := ioutil.ReadDir(path) diff --git a/tools/downloader/pkg/screenshotter.go b/tools/downloader/pkg/screenshotter.go index 41fb5a0cd..b7b57699d 100644 --- a/tools/downloader/pkg/screenshotter.go +++ b/tools/downloader/pkg/screenshotter.go @@ -6,7 +6,6 @@ package pkg import (
"context" "io/ioutil" - "log" "math" "github.com/chromedp/cdproto/emulation" @@ -18,7 +17,7 @@ import ( copied from and modified from https://github.com/chromedp/examples/blob/master/screenshot/main.go */ -func capture(url string, name string) { +func capture(url string, name string) error { // create context ctx, cancel := chromedp.NewContext(context.Background()) defer cancel() @@ -26,19 +25,20 @@ func capture(url string, name string) { // capture screenshot of an element var buf []byte if err := chromedp.Run(ctx, elementScreenshot(url, `#main`, &buf)); err != nil { - log.Fatal(err) + return err } if err := ioutil.WriteFile(name+"-elementScreenshot.png", buf, 0644); err != nil { - log.Fatal(err) + return err } // capture entire browser viewport, returning png with quality=90 if err := chromedp.Run(ctx, fullScreenshot(url, 10, &buf)); err != nil { - log.Fatal(err) + return err } if err := ioutil.WriteFile(name+"-fullScreenshot.png", buf, 0644); err != nil { - log.Fatal(err) + return err } + return nil } // elementScreenshot takes a screenshot of a specific element. From 705069ae191564aada575a3d97b402c9bd9fbf3a Mon Sep 17 00:00:00 2001 From: postables Date: Sun, 7 Jun 2020 02:02:03 -0700 Subject: [PATCH 17/18] tools/downloader: fix do not redownload check --- tools/downloader/main.go | 1 + tools/downloader/pkg/downloader.go | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/downloader/main.go b/tools/downloader/main.go index fdd663702..796d4c4ee 100644 --- a/tools/downloader/main.go +++ b/tools/downloader/main.go @@ -88,6 +88,7 @@ func main() { Name: "capture.screenshot", Aliases: []string{"cs"}, Usage: "enables optional capturing of the webpage we download media from for additional archiving", + Value: false, }, }, }, diff --git a/tools/downloader/pkg/downloader.go b/tools/downloader/pkg/downloader.go index 18e92c861..0a4b3e6e7 100644 --- a/tools/downloader/pkg/downloader.go +++ b/tools/downloader/pkg/downloader.go @@ -106,8 +106,7 @@ func (d *Downloader) Run(takeScreenshots bool, timeout time.Duration, maxDownloa continue } // if the file already exists, dont redownload - _, err := os.Stat(d.getName(pbid, count)) - if os.IsExist(err) { + if _, err := os.Stat(d.getName(pbid, count)); err == nil { continue } d.logger.Info("downloading video", zap.String("name", record[3]), zap.String("url", record[ii])) From 900e9f0c302113759afd3489c7d577eab33a3777 Mon Sep 17 00:00:00 2001 From: postables Date: Sun, 7 Jun 2020 02:07:31 -0700 Subject: [PATCH 18/18] tools/downloader: have screenshot capture 90% quality --- tools/downloader/pkg/screenshotter.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/downloader/pkg/screenshotter.go b/tools/downloader/pkg/screenshotter.go index b7b57699d..12fbfbfd5 100644 --- a/tools/downloader/pkg/screenshotter.go +++ b/tools/downloader/pkg/screenshotter.go @@ -32,7 +32,7 @@ func capture(url string, name string) error { } // capture entire browser viewport, returning png with quality=90 - if err := chromedp.Run(ctx, fullScreenshot(url, 10, &buf)); err != nil { + if err := chromedp.Run(ctx, fullScreenshot(url, 90, &buf)); err != nil { return err } if err := ioutil.WriteFile(name+"-fullScreenshot.png", buf, 0644); err != nil {