diff --git a/tools/downloader/README.md b/tools/downloader/README.md
index 7acc4e1b0..728def198 100644
--- a/tools/downloader/README.md
+++ b/tools/downloader/README.md
@@ -1,11 +1,11 @@
 # downloader
 
-`downloader` is a CLI tool that allows parsing over the all-locations csv file, and downloading all the videos referenced in the CSV file, and can concurrently download multiples videos. Because some of the videos have file names that are longer than the maximum permitted characters in a file path, the videos are not saved under their name, but instead using the videoID as determined by youtube-dl, along with a unique number. This information is then stored in a final CSV file which contains the video name, the link used to download video, as well as the unique number so you can easily determine what video belongs to what incident. Additionally it allows uploading the video data to an IPFS HTTP API endpoint
+`downloader` is a CLI tool that parses the all-locations CSV file and concurrently downloads all the videos it references. Because some of the videos have file names that are longer than the maximum permitted length of a file path, the videos are not saved under their own names, but instead under a combination of their corresponding pb-id, the link number, and their extension. For posterity's sake, a file called `name_mapping.csv` is stored in the directory containing the videos, which maps the name, link, pb-id, and link number. Previously backed-up data will not be redownloaded, and the tool can additionally upload the video data to an IPFS HTTP API endpoint.
 
-The template for names of videos saved on disk is `[YOUTUBE-DL-VIDEO-ID].[UNIQUE-VIDEO-NUMBER].[EXTENSION]`, and the CSV file has the rows `name,link,unique_video_number`. So for example we have the following entry in the CSV file `Law enforcement gas a crowd chanting “we want peace” right after exiting the building.,https://twitter.com/courtenay_roche/status/1267653137969623040,1`, and two files we have downloaded:
+The template for names of videos saved on disk is `[PB-ID].[LINK-NUMBER].[EXTENSION]`, and the CSV file has the columns `name,link,pbid,link_number`. So for example, say we have the following entry in the CSV file `Law enforcement gas a crowd chanting “we want peace” right after exiting the building.,https://twitter.com/courtenay_roche/status/1267653137969623040,ar-bentonville-1,1`, and two files we have downloaded:
 
-* `1267647898365427714.2.mp4`
-* `1267653137969623040.1.mp4`
+* `ar-bentonville-1.2.mp4`
+* `ar-bentonville-1.1.mp4`
 
-Given the row in the CSV file, the corresponding video would be `1267653137969623040.1.mp4`.
+Given the row in the CSV file, the corresponding video would be `ar-bentonville-1.1.mp4`.
 
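As a quick illustration of the naming scheme described above, here is a minimal Go sketch of how the on-disk template expands. `getName` mirrors the helper in `pkg/downloader.go`; the `downloads` directory and the pb-id/link-number values are hypothetical example values, and youtube-dl substitutes the real extension for `%(ext)s`:

```go
package main

import "fmt"

// getName mirrors the naming helper in pkg/downloader.go:
// [PB-ID].[LINK-NUMBER].[EXTENSION], with %(ext)s filled in by youtube-dl
func getName(path, pbid string, linkNumber int64) string {
	if pbid == "" {
		// fall back to youtube-dl's video id when the row carries no pb-id
		pbid = "%(id)s"
	}
	return fmt.Sprintf("%s/%s.%d.%%(ext)s", path, pbid, linkNumber)
}

func main() {
	// hypothetical values matching the README example above
	fmt.Println(getName("downloads", "ar-bentonville-1", 1))
	// prints: downloads/ar-bentonville-1.1.%(ext)s
}
```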
diff --git a/tools/downloader/go.mod b/tools/downloader/go.mod
index 5de6a785f..1f69c9737 100644
--- a/tools/downloader/go.mod
+++ b/tools/downloader/go.mod
@@ -4,10 +4,11 @@ go 1.14
 
 require (
 	github.com/RTradeLtd/go-ipfs-api/v3 v3.0.0
+	github.com/chromedp/cdproto v0.0.0-20200116234248-4da64dd111ac
+	github.com/chromedp/chromedp v0.5.3
 	github.com/panjf2000/ants/v2 v2.4.1
 	github.com/pkg/errors v0.8.1
 	github.com/urfave/cli/v2 v2.2.0
 	go.bobheadxi.dev/zapx/zapx v0.6.8
-	go.uber.org/atomic v1.6.0
 	go.uber.org/zap v1.15.0
 )
diff --git a/tools/downloader/go.sum b/tools/downloader/go.sum
index 4300ed44f..94bc0a5ec 100644
--- a/tools/downloader/go.sum
+++ b/tools/downloader/go.sum
@@ -14,6 +14,10 @@ github.com/btcsuite/websocket v0.0.0-20150119174127-31079b680792/go.mod h1:ghJtE
 github.com/btcsuite/winsvc v1.0.0/go.mod h1:jsenWakMcC0zFBFurPLEAyrnc/teJEM1O46fmI40EZs=
 github.com/cheekybits/is v0.0.0-20150225183255-68e9c0620927 h1:SKI1/fuSdodxmNNyVBR8d7X/HuLnRpvvFO0AgyQk764=
 github.com/cheekybits/is v0.0.0-20150225183255-68e9c0620927/go.mod h1:h/aW8ynjgkuj+NQRlZcDbAbM1ORAbXjXX77sX7T289U=
+github.com/chromedp/cdproto v0.0.0-20200116234248-4da64dd111ac h1:T7V5BXqnYd55Hj/g5uhDYumg9Fp3rMTS6bykYtTIFX4=
+github.com/chromedp/cdproto v0.0.0-20200116234248-4da64dd111ac/go.mod h1:PfAWWKJqjlGFYJEidUM6aVIWPr0EpobeyVWEEmplX7g=
+github.com/chromedp/chromedp v0.5.3 h1:F9LafxmYpsQhWQBdCs+6Sret1zzeeFyHS5LkRF//Ffg=
+github.com/chromedp/chromedp v0.5.3/go.mod h1:YLdPtndaHQ4rCpSpBG+IPpy9JvX0VD+7aaLxYgYj28w=
 github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=
 github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY=
 github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
@@ -24,6 +28,12 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
+github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah2I+gwHF8xmJWRjooY+5248k6m4A0=
+github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee/go.mod h1:L0fX3K22YWvt/FAX9NnzrNzcI4wNYi9Yku4O0LKYflo=
+github.com/gobwas/pool v0.2.0 h1:QEmUOlnSjWtnpRGHF3SauEiOsy82Cup83Vf2LcMlnc8=
+github.com/gobwas/pool v0.2.0/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
+github.com/gobwas/ws v1.0.2 h1:CoAavW/wd/kulfZmSIBt6p24n4j7tHgNVCjsfHVNUbo=
+github.com/gobwas/ws v1.0.2/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM=
 github.com/gogo/protobuf v1.2.1 h1:/s5zKNz0uPFCZ5hddgPdo2TK2TVrUNMn0OOX8/aZMTE=
 github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4=
 github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
@@ -40,6 +50,8 @@ github.com/jrick/logrotate v1.0.0/go.mod h1:LNinyqDIJnpAur+b8yyulnQw/wDuN1+BYKlT
 github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q=
 github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
 github.com/kkdai/bstream v0.0.0-20161212061736-f391b8402d23/go.mod h1:J+Gs4SYgM6CZQHDETBtE9HaSEkGmuNXF86RwHhHUvq4=
+github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08 h1:V0an7KRw92wmJysvFvtqtKMAPmvS5O0jtB0nYo6t+gs=
+github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08/go.mod h1:dFWs1zEqDjFtnBXsd1vPOZaLsESovai349994nHx3e0=
 github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
 github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
 github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
@@ -49,6 +61,8 @@ github.com/libp2p/go-flow-metrics v0.0.1 h1:0gxuFd2GuK7IIP5pKljLwps6TvcuYgvG7Atq
 github.com/libp2p/go-flow-metrics v0.0.1/go.mod h1:Iv1GH0sG8DtYN3SVJ2eG221wMiNpZxBdp967ls1g+k8=
 github.com/libp2p/go-libp2p-core v0.0.1 h1:HSTZtFIq/W5Ue43Zw+uWZyy2Vl5WtF0zDjKN8/DT/1I=
 github.com/libp2p/go-libp2p-core v0.0.1/go.mod h1:g/VxnTZ/1ygHxH3dKok7Vno1VfpvGcGip57wjTU4fco=
+github.com/mailru/easyjson v0.7.0 h1:aizVhC/NAAcKWb+5QsU1iNOZb4Yws5UO2I+aIprQITM=
+github.com/mailru/easyjson v0.7.0/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs=
 github.com/minio/blake2b-simd v0.0.0-20160723061019-3f5f724cb5b1 h1:lYpkrQH5ajf0OXOcUbGjvZxxijuBwbbmlSxLiuofa+g=
 github.com/minio/blake2b-simd v0.0.0-20160723061019-3f5f724cb5b1/go.mod h1:pD8RvIylQ358TN4wwqatJ8rNavkEINozVn9DtGI3dfQ=
 github.com/minio/sha256-simd v0.0.0-20190131020904-2d45a736cd16/go.mod h1:2FMWW+8GMoPweT6+pI63m9YE3Lmw4J71hV56Chs1E/U=
@@ -142,6 +156,8 @@ golang.org/x/sys v0.0.0-20190219092855-153ac476189d/go.mod h1:STP8DvDyc/dI5b8T5h
 golang.org/x/sys v0.0.0-20190302025703-b6889370fb10/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200116001909-b77594299b42 h1:vEOn+mP2zCOVzKckCZy6YsCtDblrpj/w7B9nxGNELpg=
+golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
diff --git a/tools/downloader/main.go b/tools/downloader/main.go
index 2cec14418..796d4c4ee 100644
--- a/tools/downloader/main.go
+++ b/tools/downloader/main.go
@@ -19,7 +19,7 @@ func main() {
 			Usage: "starts the downloader",
 			Action: func(c *cli.Context) error {
 				dl := New(c.String("log.file"), c.String("directory"), c.Int("concurrency"))
-				if err := dl.Run(c.Duration("timeout"), c.Int("max.downloads")); err != nil {
+				if err := dl.Run(c.Bool("capture.screenshot"), c.Duration("timeout"), c.Int("max.downloads")); err != nil {
 					return err
 				}
 				if c.Bool("upload.to_ipfs") {
@@ -84,6 +84,12 @@ func main() {
 					Usage: "enables uploading the video data to any ipfs endpoint",
 					Value: false,
 				},
+				&cli.BoolFlag{
+					Name:    "capture.screenshot",
+					Aliases: []string{"cs"},
+					Usage:   "enables optional capturing of the webpage we download media from for additional archiving",
+					Value:   false,
+				},
 			},
 		},
 	}
diff --git a/tools/downloader/pkg/downloader.go b/tools/downloader/pkg/downloader.go
index 52174c6a4..0a4b3e6e7 100644
--- a/tools/downloader/pkg/downloader.go
+++ b/tools/downloader/pkg/downloader.go
@@ -5,23 +5,24 @@ import (
 	"encoding/csv"
 	"fmt"
 	"io"
+	"io/ioutil"
 	"net/http"
 	"os"
 	"os/exec"
+	"strings"
 	"sync"
 	"time"
 
 	"github.com/panjf2000/ants/v2"
 	"github.com/pkg/errors"
 	"go.bobheadxi.dev/zapx/zapx"
-	"go.uber.org/atomic"
 	"go.uber.org/zap"
 )
 
 const (
 	/*
 		rows of csv file for easy reference
-		0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13
-		state,edit_at,city,name,date,date_text,Link 1,Link 2,Link 3,Link 4,Link 5,Link 6,Link 7,Link 8
+		0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14
+		state,edit_at,city,name,date,date_text,id,Link 1,Link 2,Link 3,Link 4,Link 5,Link 6,Link 7,Link 8
 	*/
 	url = "https://raw.githubusercontent.com/2020PB/police-brutality/data_build/all-locations.csv"
 )
@@ -31,8 +32,7 @@ type Downloader struct {
 	path   string
 	logger *zap.Logger
 	// enables running concurrent downloads
-	wp    *ants.Pool
-	count *atomic.Int64
+	wp *ants.Pool
 }
 
 // New returns a new downloader
@@ -50,12 +50,12 @@ func New(logFile, path string, concurrency int) *Downloader {
 	if err != nil {
 		panic(err)
 	}
-	return &Downloader{path, logger, wp, atomic.NewInt64(0)}
+	return &Downloader{path, logger, wp}
 }
 
 // Run starts the download process, note that maxDownloads doesn't necessarily equate to number of videos
 // it really means the maximum number of entries in the csv to download, and some entries in the csv may have more than 1 associated video
-func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error {
+func (d *Downloader) Run(takeScreenshots bool, timeout time.Duration, maxDownloads int) error {
 	resp, err := http.Get(url)
 	if err != nil {
 		return err
 	}
@@ -65,13 +65,20 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error {
 		results []struct {
 			name  string
 			link  string
+			pbid  string
 			count int64
 		}
 		mux    = &sync.Mutex{}
 		wg     = &sync.WaitGroup{}
 		reader = csv.NewReader(resp.Body)
 	)
-	for i := 0; maxDownloads != 0 && i < maxDownloads; i++ {
+	for i := 0; ; i++ {
+		// the first read from the CSV file will be the header
+		// so we need to make sure that we factor that in when
+		// counting max downloads
+		if maxDownloads != 0 && i >= maxDownloads+1 {
+			break
+		}
 		// read the next record
 		record, err := reader.Read()
 		if err != nil && err != io.EOF {
@@ -81,24 +88,41 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error {
 			break
 		}
 		// skip the first row as it contains column names OR
-		// skip if the row has less than 7 elements as the 7th element is the start of the video links
-		if i == 0 || len(record) < 7 {
+		// skip if the row has less than 8 elements as the 8th element is the start of the video links
+		if i == 0 || len(record) < 8 {
 			continue
 		}
 		wg.Add(1)
 		d.wp.Submit(func() {
 			defer wg.Done()
+			pbid := record[6]
-			// gets the last column so we dont get an out of range panic
-			max := len(record) - 1
-			for ii := 6; ii < max; ii++ {
+			// index of the last column, which holds the final link in the row
+			max := len(record) - 1
+			var count int64 = 0
+			for ii := 7; ii <= max; ii++ {
+				count++
 				// this column is empty, and has no data
 				if record[ii] == "" {
 					continue
 				}
-				count := d.count.Inc()
+				// if a file for this pb-id and link number already exists, dont redownload.
+				// the on-disk name ends with the real extension that youtube-dl substituted
+				// for %(ext)s, so we match on the filename prefix rather than the raw template
+				prefix := pbid + "." + fmt.Sprint(count) + "."
+				alreadyDownloaded := false
+				if infos, err := ioutil.ReadDir(d.path); err == nil {
+					for _, info := range infos {
+						if strings.HasPrefix(info.Name(), prefix) {
+							alreadyDownloaded = true
+						}
+					}
+				}
+				if alreadyDownloaded {
+					continue
+				}
 				d.logger.Info("downloading video", zap.String("name", record[3]), zap.String("url", record[ii]))
 				download := func() error {
-					cmd := exec.Command("youtube-dl", "-o", d.getName(count), record[ii])
+					cmd := exec.Command("youtube-dl", "-o", d.getName(pbid, count), record[ii])
 					return d.runCommand(cmd, timeout)
 				}
 				if err := download(); err != nil {
@@ -109,31 +122,90 @@ func (d *Downloader) Run(timeout time.Duration, maxDownloads int) error {
 					results = append(results, struct {
 						name  string
 						link  string
+						pbid  string
 						count int64
 					}{
 						name:  record[3],
 						link:  record[ii],
+						pbid:  pbid,
 						count: count,
 					})
 					mux.Unlock()
 				}
+				// download the screenshot if specified
+				// TODO(bonedaddy): enable adding this to the csv, for now it exists alongside everything else
+				if takeScreenshots {
+					if err := capture(record[ii], d.getName(pbid, count)); err != nil {
+						d.logger.Error("failed to capture screenshot", zap.Error(err), zap.String("url", record[ii]))
+					}
+				}
 			}
 		})
 	}
 	// wait for pending download operations to finish
 	wg.Wait()
-	// open csv file to store mappings
-	fh, err := os.Create("name_mapping.csv")
+	// read download dir to check for any file artifacts
+	infos, err := ioutil.ReadDir(d.path)
 	if err != nil {
 		return err
 	}
+	for _, info := range infos {
+		// this was an incorrectly downloaded piece of data, remove
+		if strings.HasSuffix(info.Name(), ".part") {
+			if err := os.Remove(d.path + "/" + info.Name()); err != nil {
+				d.logger.Error("failed to remove file part", zap.String("file", info.Name()), zap.Error(err))
+			}
+		}
+	}
+	// backup the previous csv if it exists for posterity
+	if data, err := ioutil.ReadFile(d.path + "/name_mapping.csv"); err != nil {
+		d.logger.Error("failed to read previous name mapping file, likely doesn't exist", zap.Error(err))
+	} else {
+		if len(data) > 0 {
+			ioutil.WriteFile(fmt.Sprintf("%s/name_mapping-%v.csv", d.path, time.Now().UnixNano()), data, os.FileMode(0640))
+		}
+	}
+	var (
+		fh      *os.File
+		records [][]string
+	)
+	// add the headers to write to the csv
+	records = append(records, []string{"name", "link", "pbid", "link_number"})
+	if _, err := os.Stat(d.path + "/name_mapping.csv"); err == nil {
+		if fh, err = os.Open(d.path + "/name_mapping.csv"); err != nil {
+			// fallback to writing a fresh mapping file
+			d.logger.Error("failed to open existing csv", zap.Error(err))
+		} else {
+			// file exists, drop our headers as they will be read back in from the existing file
+			records = [][]string{}
+			reader := csv.NewReader(fh)
+			for {
+				record, rerr := reader.Read()
+				if rerr != nil {
+					// io.EOF (or any read error) means we are done with the previous mapping
+					break
+				}
+				records = append(records, record)
+			}
+			fh.Close()
+		}
+	}
+	// (re)create the mapping file, truncating it; the previous contents, if any, are in records.
+	// note we must not reuse the handle from os.Open above, as it is read-only
+	fh, err = os.Create(d.path + "/name_mapping.csv")
+	if err != nil {
+		return err
+	}
 	writer := csv.NewWriter(fh)
-	// write the csv file headers
-	writer.Write([]string{"name", "link", "unique_video_number"})
+	// write the previous csv file to disk
+	// if no previous mapping exists, this will just write the headers
+	for _, record := range records {
+		writer.Write(record)
+	}
 	mux.Lock()
 	// iterate over all results and add to csv
 	for _, v := range results {
-		writer.Write([]string{v.name, v.link, fmt.Sprint(v.count)})
+		writer.Write([]string{v.name, v.link, v.pbid, fmt.Sprint(v.count)})
 	}
 	mux.Unlock()
 	// flush csv, writing to disk
@@ -167,6 +236,11 @@ func (d *Downloader) runCommand(cmd *exec.Cmd, timeout time.Duration) error {
 }
 
-// uses an atomically increasing counter to prevent any possible chance of filename conflics when running many concurrent downloaders
-func (d *Downloader) getName(count int64) string {
-	return d.path + "/%(id)s." + fmt.Sprint(count) + ".%(ext)s"
+// getName builds the youtube-dl output template from the pb-id and link number,
+// preventing filename conflicts when running many concurrent downloads
+func (d *Downloader) getName(id string, count int64) string {
+	// fallback to youtube id
+	if id == "" {
+		id = "%(id)s"
+	}
+	return d.path + "/" + id + "." + fmt.Sprint(count) + ".%(ext)s"
 }
diff --git a/tools/downloader/pkg/downloader_test.go b/tools/downloader/pkg/downloader_test.go
new file mode 100644
index 000000000..765ef1f08
--- /dev/null
+++ b/tools/downloader/pkg/downloader_test.go
@@ -0,0 +1,40 @@
+package pkg
+
+import (
+	"io/ioutil"
+	"os"
+	"strings"
+	"testing"
+	"time"
+)
+
+func TestDownloader(t *testing.T) {
+	var (
+		logFile = "test.log"
+		path    = "testdir"
+	)
+	t.Cleanup(func() {
+		os.RemoveAll(path)
+		os.Remove(logFile)
+	})
+	// ensure the download directory exists before planting the stray .part file
+	if err := os.MkdirAll(path, os.FileMode(0755)); err != nil {
+		t.Fatal(err)
+	}
+	dl := New(logFile, path, 1)
+	if _, err := os.Create(path + "/thisisatestfilethatweareusingtotestremovaloffileswith.part"); err != nil {
+		t.Fatal(err)
+	}
+	if err := dl.Run(false, time.Minute, 2); err != nil {
+		t.Fatal(err)
+	}
+	infos, err := ioutil.ReadDir(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, info := range infos {
+		if strings.HasSuffix(info.Name(), ".part") {
+			t.Fatal("shouldn't have found .part file")
+		}
+	}
+}
diff --git a/tools/downloader/pkg/screenshotter.go b/tools/downloader/pkg/screenshotter.go
new file mode 100644
index 000000000..12fbfbfd5
--- /dev/null
+++ b/tools/downloader/pkg/screenshotter.go
@@ -0,0 +1,97 @@
+// Screenshot helpers for capturing the webpage we download media from,
+// demonstrating how to take a screenshot of a specific element and of the
+// entire browser viewport.
+
+package pkg
+
+import (
+	"context"
+	"io/ioutil"
+	"math"
+
+	"github.com/chromedp/cdproto/emulation"
+	"github.com/chromedp/cdproto/page"
+	"github.com/chromedp/chromedp"
+)
+
+/*
+copied from and modified from https://github.com/chromedp/examples/blob/master/screenshot/main.go
+*/
+
+func capture(url string, name string) error {
+	// create context
+	ctx, cancel := chromedp.NewContext(context.Background())
+	defer cancel()
+
+	// capture screenshot of an element
+	var buf []byte
+	if err := chromedp.Run(ctx, elementScreenshot(url, `#main`, &buf)); err != nil {
+		return err
+	}
+	if err := ioutil.WriteFile(name+"-elementScreenshot.png", buf, 0644); err != nil {
+		return err
+	}
+
+	// capture entire browser viewport, returning png with quality=90
+	if err := chromedp.Run(ctx, fullScreenshot(url, 90, &buf)); err != nil {
+		return err
+	}
+	if err := ioutil.WriteFile(name+"-fullScreenshot.png", buf, 0644); err != nil {
+		return err
+	}
+	return nil
+}
+
+// elementScreenshot takes a screenshot of a specific element.
+func elementScreenshot(urlstr, sel string, res *[]byte) chromedp.Tasks {
+	return chromedp.Tasks{
+		chromedp.Navigate(urlstr),
+		chromedp.WaitVisible(sel, chromedp.ByID),
+		chromedp.Screenshot(sel, res, chromedp.NodeVisible, chromedp.ByID),
+	}
+}
+
+// fullScreenshot takes a screenshot of the entire browser viewport.
+//
+// Liberally copied from puppeteer's source.
+//
+// Note: this will override the viewport emulation settings.
+func fullScreenshot(urlstr string, quality int64, res *[]byte) chromedp.Tasks {
+	return chromedp.Tasks{
+		chromedp.Navigate(urlstr),
+		chromedp.ActionFunc(func(ctx context.Context) error {
+			// get layout metrics
+			_, _, contentSize, err := page.GetLayoutMetrics().Do(ctx)
+			if err != nil {
+				return err
+			}
+
+			width, height := int64(math.Ceil(contentSize.Width)), int64(math.Ceil(contentSize.Height))
+
+			// force viewport emulation
+			err = emulation.SetDeviceMetricsOverride(width, height, 1, false).
+				WithScreenOrientation(&emulation.ScreenOrientation{
+					Type:  emulation.OrientationTypePortraitPrimary,
+					Angle: 0,
+				}).
+				Do(ctx)
+			if err != nil {
+				return err
+			}
+
+			// capture screenshot
+			*res, err = page.CaptureScreenshot().
+				WithQuality(quality).
+				WithClip(&page.Viewport{
+					X:      contentSize.X,
+					Y:      contentSize.Y,
+					Width:  contentSize.Width,
+					Height: contentSize.Height,
+					Scale:  1,
+				}).Do(ctx)
+			if err != nil {
+				return err
+			}
+			return nil
+		}),
+	}
+}
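Since `capture` is unexported, a quick way to exercise it in isolation is from a test inside the same package. Below is a minimal, hypothetical smoke test sketch; it is not part of the PR, it assumes a local Chrome/Chromium install (chromedp drives a real browser), and it is skipped by default because the helper blocks until the target page's `#main` element is visible:

```go
package pkg

import "testing"

// hypothetical smoke test for the screenshot helper; it drives a real
// Chrome/Chromium instance and blocks until the page's #main element is
// visible, so it is skipped by default
func TestCapture(t *testing.T) {
	t.Skip("requires a local Chrome/Chromium and a target page with a #main element")
	// capture navigates to the URL, then writes example-elementScreenshot.png
	// and example-fullScreenshot.png; the "example" prefix is illustrative
	if err := capture("https://example.com", "example"); err != nil {
		t.Fatal(err)
	}
}
```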
diff --git a/tools/replicator/Makefile b/tools/replicator/Makefile
new file mode 100644
index 000000000..eff447beb
--- /dev/null
+++ b/tools/replicator/Makefile
@@ -0,0 +1,8 @@
+.PHONY: cluster-first-start
+cluster-first-start:
+	ipfs-cluster-service init --consensus crdt
+	ipfs-cluster-service daemon
+
+.PHONY: follow-public-cluster
+follow-public-cluster:
+	ipfs-cluster-follow 2020pb-dataset run --init 2020pb.temporal.cloud
\ No newline at end of file
diff --git a/tools/replicator/README.md b/tools/replicator/README.md
new file mode 100644
index 000000000..9d5e9931f
--- /dev/null
+++ b/tools/replicator/README.md
@@ -0,0 +1,43 @@
+# replicator
+
+> **WARNING**: If you value your privacy and anonymity, do not participate in this cluster. Participating, **even** from behind an anonymization tool like Tor or a VPN, will likely lead to your real-life identity being revealed, as IPFS is incredibly self-doxxing. If you want to participate in this cluster while preserving your privacy and anonymity, do so from a cloud-based VPS, ideally paid for with anonymous cryptocurrency.
+
+The `replicator` tool allows anyone to easily mirror a public set of data on IPFS. It consists of an IPFS node alongside a lightweight IPFS Cluster follower client. The follower subscribes to a CRDT topic published to by a set of trusted peers, who are responsible for updating the "follow list": the set of IPFS CIDs replicated by the cluster. Anyone following this cluster will pin the data in the follow list locally.
+
+# install
+
+If you run a 64-bit version of Linux you can use the `install_cluster_linux.sh` bash script to install the needed components. If you don't run a 64-bit version of Linux, you should update the script to work on your platform and then use it, otherwise please see the following URLs:
+
+* [ipfs-cluster-ctl](https://dist.ipfs.io/#ipfs-cluster-ctl)
+* [ipfs-cluster-follow](https://dist.ipfs.io/#ipfs-cluster-follow)
+* [ipfs-cluster-service](https://dist.ipfs.io/#ipfs-cluster-service)
+
+
+To install the cluster tooling on 64-bit Linux with the aforementioned script, invoke it as follows:
+
+```shell
+$> install_cluster_linux.sh linux-64bit
+```
+
+# usage
+
+This folder contains the needed files and configurations for anyone to start a follow peer, or to run their own follow cluster acting as a trusted peer. The trusted peer setup is a little more difficult, and requires running both go-ipfs and ipfs-cluster. Before running any of these steps, make sure you have the following software installed on your local machine:
+
+* go-ipfs
+* ipfs-cluster-follow (only if running a follow peer)
+* ipfs-cluster-service (only if running a trusted peer)
+* ipfs-cluster-ctl (only if running a trusted peer)
+
+## trusted peer
+
+Trusted peer setup is a bit of an annoying task, and only needs to be done if you are interested in running your own cluster. If so, make sure to read the instructions [provided by the ipfs cluster team](https://cluster.ipfs.io/documentation/collaborative/setup/)
+
+## follow peer
+
+First ensure that you have a valid go-ipfs instance up and running on the machine you are using, then run the following command:
+
+```shell
+$> ipfs-cluster-follow 2020pb-dataset run --init 2020pb.temporal.cloud
+```
+
+This will start the cluster follow peer and begin replicating the cluster data.
\ No newline at end of file
diff --git a/tools/replicator/configs/follow_peer/service.json b/tools/replicator/configs/follow_peer/service.json
new file mode 100644
index 000000000..9f71d7a31
--- /dev/null
+++ b/tools/replicator/configs/follow_peer/service.json
@@ -0,0 +1,103 @@
+{
+  "cluster": {
+    "follower_mode": true,
+    "peername": "2020pb-follower",
+    "secret": "91125f520ba285491b9bd1b62e26f784f8fd9577e5348171382d7c72127168f2",
+    "leave_on_shutdown": false,
+    "listen_multiaddress": [
+      "/ip4/0.0.0.0/tcp/9096",
+      "/ip4/0.0.0.0/udp/9096/quic"
+    ],
+    "enable_relay_hop": true,
+    "connection_manager": {
+      "high_water": 400,
+      "low_water": 100,
+      "grace_period": "2m0s"
+    },
+    "state_sync_interval": "5m0s",
+    "pin_recover_interval": "12m0s",
+    "replication_factor_min": -1,
+    "replication_factor_max": -1,
+    "monitor_ping_interval": "15s",
+    "peer_watch_interval": "5s",
+    "mdns_interval": "10s",
+    "disable_repinning": false,
+    "peer_addresses": [
+      "/ip4/207.6.222.55/tcp/9097/p2p/12D3KooWLREvKqLLefpzADz6tHW1kSEUsdGpq8jJkLtCbQ5Srauh"
+    ]
+  },
+  "consensus": {
+    "crdt": {
+      "cluster_name": "2020pb-dataset",
+      "trusted_peers": [
+        "12D3KooWLREvKqLLefpzADz6tHW1kSEUsdGpq8jJkLtCbQ5Srauh"
+      ]
+    }
+  },
+  "ipfs_connector": {
+    "ipfshttp": {
+      "node_multiaddress": "/ip4/127.0.0.1/tcp/5001",
+      "connect_swarms_delay": "30s",
+      "ipfs_request_timeout": "5m0s",
+      "pin_timeout": "24h0m0s",
+      "unpin_timeout": "3h0m0s",
+      "repogc_timeout": "24h0m0s"
+    }
+  },
+  "pin_tracker": {
+    "stateless": {
+      "concurrent_pins": 10
+    }
+  },
+  "monitor": {
+    "pubsubmon": {
+      "check_interval": "15s",
+      "failure_threshold": 3
+    }
+  },
+  "informer": {
+    "disk": {
+      "metric_ttl": "30s",
+      "metric_type": "freespace"
+    }
+  },
+  "observations": {
+    "metrics": {
+      "enable_stats": false,
+      "prometheus_endpoint": "/ip4/127.0.0.1/tcp/8888",
+      "reporting_interval": "2s"
+    },
+    "tracing": {
+      "enable_tracing": false,
+      "jaeger_agent_endpoint": "/ip4/0.0.0.0/udp/6831",
+      "sampling_prob": 0.3,
+      "service_name": "cluster-daemon"
+    }
+  },
+  "datastore": {
+    "badger": {
+      "badger_options": {
+        "dir": "",
+        "value_dir": "",
+        "sync_writes": true,
+        "table_loading_mode": 0,
+        "value_log_loading_mode": 0,
+        "num_versions_to_keep": 1,
+        "max_table_size": 67108864,
+        "level_size_multiplier": 10,
+        "max_levels": 7,
+        "value_threshold": 32,
+        "num_memtables": 5,
+        "num_level_zero_tables": 5,
+        "num_level_zero_tables_stall": 10,
+        "level_one_size": 268435456,
+        "value_log_file_size": 1073741823,
+        "value_log_max_entries": 1000000,
+        "num_compactors": 2,
+        "compact_l_0_on_close": false,
+        "read_only": false,
+        "truncate": false
+      }
+    }
+  }
+}
\ No newline at end of file
+ "/ip4/0.0.0.0/tcp/9097", + "/ip4/0.0.0.0/udp/9097/quic" + ], + "enable_relay_hop": true, + "connection_manager": { + "high_water": 400, + "low_water": 100, + "grace_period": "2m0s" + }, + "state_sync_interval": "5m0s", + "pin_recover_interval": "12m0s", + "replication_factor_min": -1, + "replication_factor_max": -1, + "monitor_ping_interval": "15s", + "peer_watch_interval": "5s", + "mdns_interval": "10s", + "disable_repinning": false, + "peer_addresses": [] + }, + "consensus": { + "crdt": { + "cluster_name": "2020pb-dataset", + "trusted_peers": [ + "12D3KooWLREvKqLLefpzADz6tHW1kSEUsdGpq8jJkLtCbQ5Srauh" + ] + } + }, + "api": { + "ipfsproxy": { + "listen_multiaddress": "/ip4/127.0.0.1/tcp/9099", + "node_multiaddress": "/ip4/127.0.0.1/tcp/5001", + "log_file": "", + "read_timeout": "0s", + "read_header_timeout": "5s", + "write_timeout": "0s", + "idle_timeout": "1m0s", + "max_header_bytes": 4096 + }, + "restapi": { + "http_listen_multiaddress": "/ip4/127.0.0.1/tcp/9098", + "read_timeout": "0s", + "read_header_timeout": "5s", + "write_timeout": "0s", + "idle_timeout": "2m0s", + "max_header_bytes": 4096, + "basic_auth_credentials": null, + "http_log_file": "", + "headers": {}, + "cors_allowed_origins": [ + "*" + ], + "cors_allowed_methods": [ + "GET" + ], + "cors_allowed_headers": [], + "cors_exposed_headers": [ + "Content-Type", + "X-Stream-Output", + "X-Chunked-Output", + "X-Content-Length" + ], + "cors_allow_credentials": true, + "cors_max_age": "0s" + } + }, + "ipfs_connector": { + "ipfshttp": { + "node_multiaddress": "/ip4/127.0.0.1/tcp/5001", + "connect_swarms_delay": "30s", + "ipfs_request_timeout": "5m0s", + "pin_timeout": "24h0m0s", + "unpin_timeout": "3h0m0s", + "repogc_timeout": "24h0m0s" + } + }, + "pin_tracker": { + "stateless": { + "concurrent_pins": 10 + } + }, + "monitor": { + "pubsubmon": { + "check_interval": "15s", + "failure_threshold": 3 + } + }, + "informer": { + "disk": { + "metric_ttl": "30s", + "metric_type": "freespace" + } + }, + "observations": { + "metrics": { + "enable_stats": false, + "prometheus_endpoint": "/ip4/127.0.0.1/tcp/8888", + "reporting_interval": "2s" + }, + "tracing": { + "enable_tracing": false, + "jaeger_agent_endpoint": "/ip4/0.0.0.0/udp/6831", + "sampling_prob": 0.3, + "service_name": "cluster-daemon" + } + }, + "datastore": { + "badger": { + "badger_options": { + "dir": "", + "value_dir": "", + "sync_writes": true, + "table_loading_mode": 2, + "value_log_loading_mode": 2, + "num_versions_to_keep": 1, + "max_table_size": 67108864, + "level_size_multiplier": 10, + "max_levels": 7, + "value_threshold": 32, + "num_memtables": 5, + "num_level_zero_tables": 5, + "num_level_zero_tables_stall": 10, + "level_one_size": 268435456, + "value_log_file_size": 1073741823, + "value_log_max_entries": 1000000, + "num_compactors": 2, + "compact_l_0_on_close": false, + "read_only": false, + "truncate": false + } + } + } + } \ No newline at end of file diff --git a/tools/replicator/scripts/install_cluster_linux.sh b/tools/replicator/scripts/install_cluster_linux.sh new file mode 100755 index 000000000..fbf79877d --- /dev/null +++ b/tools/replicator/scripts/install_cluster_linux.sh @@ -0,0 +1,27 @@ +#! 
/bin/bash + +# yoinked from https://github.com/RTradeLtd/cord19-collaborative-cluster/blob/master/scripts/install_cluster.sh +# multi-platform cluster download script, only supports linux 64-bit right now + +VERSION="v0.12.1" +OS="" +case "$1" in + linux-64bit) + OS="linux-amd64" + wget "https://dist.ipfs.io/ipfs-cluster-service/${VERSION}/ipfs-cluster-service_${VERSION}_${OS}.tar.gz" + wget "https://dist.ipfs.io/ipfs-cluster-ctl/${VERSION}/ipfs-cluster-ctl_${VERSION}_${OS}.tar.gz" + wget "https://dist.ipfs.io/ipfs-cluster-follow/${VERSION}/ipfs-cluster-follow_${VERSION}_${OS}.tar.gz" + tar zxvf "ipfs-cluster-service_${VERSION}_${OS}.tar.gz" + tar zxvf "ipfs-cluster-ctl_${VERSION}_${OS}.tar.gz" + tar zxvf "ipfs-cluster-follow_${VERSION}_${OS}.tar.gz" + (cd ipfs-cluster-service && sudo cp ipfs-cluster-service /usr/local/bin) + (cd ipfs-cluster-ctl && sudo cp ipfs-cluster-ctl /usr/local/bin) + (cd ipfs-cluster-follow && sudo cp ipfs-cluster-follow /usr/local/bin) + rm *.tar.gz + rm -rf ipfs-cluster-service ipfs-cluster-ctl ipfs-cluster-follow + ;; + *) + echo "unsupported os" + exit 2 + ;; +esac \ No newline at end of file
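After the install script completes, a quick sanity check that the three binaries landed on your `PATH` might look like the following, assuming the standard `--version` flag these CLI tools expose:

```shell
$> ipfs-cluster-service --version
$> ipfs-cluster-ctl --version
$> ipfs-cluster-follow --version
```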