-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtest.py
52 lines (42 loc) · 1.56 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import os
import pytest
import logging
from waybackprov import *
# Send log records at INFO and above to test.log; mode 'w' starts the
# file afresh each time this module is imported.
logging.basicConfig(
    level=logging.INFO,
    filename='test.log',
    filemode='w',
)
def test_coll():
    """Fetch one known collection and check the title it reports."""
    collection = get_collection('ArchiveIt-Collection-2410')
    title = collection['title']
    assert title == 'University of Maryland'
def test_get_crawls():
    """Check that a known URL yields crawls whose first record is populated.

    The first crawl must carry truthy 'timestamp', 'url', 'status' and
    'collections' values, and 'collections' must be non-empty.
    """
    results = list(get_crawls('https://mith.umd.edu'))
    # Guard first so an empty result fails as an assertion, not an IndexError.
    assert len(results) > 0

    first = results[0]
    for key in ('timestamp', 'url', 'status', 'collections'):
        assert first[key]
    assert len(first['collections']) > 0
def test_depth():
    """Check get_depth against the expected depth of two known collections."""
    expected_depths = (
        ('ArchiveIt-Collection-2410', 4),
        ('wikipediaoutlinks00003', 3),
    )
    for coll_id, depth in expected_depths:
        assert get_depth(coll_id) == depth
def test_deepest_collection():
    """Check which of several collection ids deepest_collection selects."""
    candidates = [
        'ArchiveIt-Partner-408',
        'archiveitdigitalcollection',
        'web',
        'archiveitpartners',
        'ArchiveIt-Collection-2410',
    ]
    deepest = deepest_collection(candidates)
    assert deepest == 'ArchiveIt-Collection-2410'
def test_loop():
    """Check get_depth on a collection whose containment forms a loop.

    Collections can end up containing themselves through a cycle, e.g.
    coll1 is in coll2 while coll2 is also in coll1. The depth must still
    come back as a definite number.
    """
    depth = get_depth('ArchiveIt-Partner-1140')
    assert depth == 4
def test_prefix():
    """Check that a prefix query filtered by a regex yields a crawl with a URL.

    Calls get_crawls with prefix=True and a match pattern, takes the first
    result with next(), and asserts that its 'url' value is truthy.
    NOTE(review): presumably prefix=True matches every archived URL that
    starts with the given one -- confirm against waybackprov.
    """
    # Raw string: in a plain literal '\d' is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+). The pattern's value is unchanged.
    status_pattern = r'/status/\d+$'
    crawls = get_crawls(
        'https://twitter.com/Guccifer_2',
        prefix=True,
        match=status_pattern,
    )
    crawl = next(crawls)
    assert crawl['url']
def test_cdx():
    """Check how many URLs cdx yields for a regex-filtered, year-bounded query.

    NOTE(review): the expected count of 132 is hard-coded and presumably
    depends on the live archive's contents -- confirm it is still accurate
    if this test starts failing.
    """
    # Raw string: in a plain literal '\d' is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+). The pattern's value is unchanged.
    status_pattern = r'/status/\d+$'
    urls = cdx(
        'https://twitter.com/Guccifer_2',
        match=status_pattern,
        start_year=2016,
        end_year=2018,
    )
    assert len(list(urls)) == 132
def test_missing():
    """Check that this URL produces no crawls at all."""
    missing_url = (
        'https://twitter.com/slavresistance/status/1016697918970105857/'
    )
    found = list(get_crawls(missing_url))
    assert len(found) == 0