Skip to content

Commit 46f88b9

Browse files
committed
Fix minor issues and update tests
1 parent: 7a53641 · commit: 46f88b9

File tree

10 files changed

+136
-55
lines changed

10 files changed

+136
-55
lines changed

scrapple/commands/run.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -128,7 +128,7 @@ def run(self):
128128
import json
129129
with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.json'), \
130130
'w') as f:
131-
json.dump(results, f, indent=3)
131+
json.dump(results, f, indent=4)
132132
elif self.args['--output_type'] == 'csv':
133133
import csv
134134
with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.csv'), \

scrapple/selectors/css.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -29,9 +29,9 @@ def __init__(self, url):
2929
super(CssSelector, self).__init__(url)
3030

3131

32-
def get_selected_tag(self, selector='', get_one=False, *args, **kwargs):
32+
def get_tree_tag(self, selector='', get_one=False, *args, **kwargs):
3333
sel = cssselect.CSSSelector(selector)
3434
tags = sel(self.tree)
3535
if get_one:
3636
return tags[0]
37-
return tag
37+
return tags

scrapple/selectors/selector.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -106,7 +106,7 @@ def extract_content(self, selector='', attr='', default='', connector='', *args,
106106
content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
107107
content = content.replace("\n", " ").strip()
108108
else:
109-
tag = self.get_tree_tag(selector=selector)
109+
tag = self.get_tree_tag(selector=selector, get_one=True)
110110
content = tag.get(attr)
111111
if attr in ["href", "src"]:
112112
content = urljoin(self.url, content)

tests/expected_result2.json

Lines changed: 0 additions & 1 deletion
This file was deleted.

tests/expected_result2_20180428.json

Lines changed: 95 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,95 @@
1+
{
2+
"project": "project2",
3+
"data": [
4+
{
5+
"team": "Atlanta Hawks"
6+
},
7+
{
8+
"team": "Boston Celtics"
9+
},
10+
{
11+
"team": "Brooklyn Nets"
12+
},
13+
{
14+
"team": "Charlotte Hornets"
15+
},
16+
{
17+
"team": "Chicago Bulls"
18+
},
19+
{
20+
"team": "Cleveland Cavaliers"
21+
},
22+
{
23+
"team": "Dallas Mavericks"
24+
},
25+
{
26+
"team": "Denver Nuggets"
27+
},
28+
{
29+
"team": "Detroit Pistons"
30+
},
31+
{
32+
"team": "Golden State Warriors"
33+
},
34+
{
35+
"team": "Houston Rockets"
36+
},
37+
{
38+
"team": "Indiana Pacers"
39+
},
40+
{
41+
"team": "Los Angeles Clippers"
42+
},
43+
{
44+
"team": "Los Angeles Lakers"
45+
},
46+
{
47+
"team": "Memphis Grizzlies"
48+
},
49+
{
50+
"team": "Miami Heat"
51+
},
52+
{
53+
"team": "Milwaukee Bucks"
54+
},
55+
{
56+
"team": "Minnesota Timberwolves"
57+
},
58+
{
59+
"team": "New Orleans Pelicans"
60+
},
61+
{
62+
"team": "New York Knicks"
63+
},
64+
{
65+
"team": "Oklahoma City Thunder"
66+
},
67+
{
68+
"team": "Orlando Magic"
69+
},
70+
{
71+
"team": "Philadelphia 76ers"
72+
},
73+
{
74+
"team": "Phoenix Suns"
75+
},
76+
{
77+
"team": "Portland Trail Blazers"
78+
},
79+
{
80+
"team": "Sacramento Kings"
81+
},
82+
{
83+
"team": "San Antonio Spurs"
84+
},
85+
{
86+
"team": "Toronto Raptors"
87+
},
88+
{
89+
"team": "Utah Jazz"
90+
},
91+
{
92+
"team": "Washington Wizards"
93+
}
94+
]
95+
}

tests/expected_result3.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
{"project": "project3", "data": [{"unknown": "<unknown>", "speaker": "Kenneth Reitz", "talk_url": "http://pyvideo.org/pycon-us-2013/python-for-humans-1.html", "title": "Python for Humans"}]}
1+
{"project": "project3", "data": [{"show_url": "https://trakt.tv/shows/mr-robot", "year": "2015", "unknown": "<unknown>"}]}

tests/project2.json

Lines changed: 25 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -1,32 +1,26 @@
11
{
2-
"project_name": "new_project2",
3-
"selector_type": "css",
4-
"scraping": {
5-
"url": "http://pyvideo.org/events/boston-python-meetup.html",
6-
"data": [
7-
{
8-
"field": "event",
9-
"selector": "h2",
10-
"attr": "text",
11-
"connector": "",
12-
"default": "<event>"
13-
}
14-
],
15-
"next": [
16-
{
17-
"follow_link": "div.content-list div.row h4.entry-title a",
18-
"scraping": {
19-
"data": [
20-
{
21-
"field": "talk",
22-
"selector": "h2",
23-
"attr": "text",
24-
"connector": "",
25-
"default": "<talk>"
26-
}
27-
]
28-
}
29-
}
30-
]
31-
}
32-
}
2+
"project_name": "project2_nba",
3+
"selector_type": "css",
4+
"scraping": {
5+
"url": "https://www.basketball-reference.com/teams/",
6+
"data": [
7+
8+
],
9+
"next": [
10+
{
11+
"follow_link": "#teams_active th > a",
12+
"scraping": {
13+
"data": [
14+
{
15+
"field": "team",
16+
"selector": "div#info h1",
17+
"attr": "text",
18+
"default": "<no_team>",
19+
"connector": ""
20+
}
21+
]
22+
}
23+
}
24+
]
25+
}
26+
}

tests/project3.json

Lines changed: 9 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -2,35 +2,28 @@
22
"project_name": "project3",
33
"selector_type": "css",
44
"scraping": {
5-
"url": "http://pyvideo.org/pycon-us-2013/python-for-humans-1.html",
5+
"url": "https://trakt.tv/shows/mr-robot",
66
"data": [
77
{
8-
"field": "unknown",
9-
"selector": "h1",
10-
"attr": "text",
11-
"connector": "",
12-
"default": "<unknown>"
13-
},
14-
{
15-
"field": "talk_url",
8+
"field": "show_url",
169
"selector": "url",
1710
"attr": "",
1811
"connector": "",
19-
"default": "<talk_url>"
12+
"default": "<url>"
2013
},
2114
{
22-
"field": "title",
23-
"selector": "h2",
15+
"field": "unknown",
16+
"selector": "h6",
2417
"attr": "text",
2518
"connector": "",
26-
"default": "<title>"
19+
"default": "<unknown>"
2720
},
2821
{
29-
"field": "speaker",
30-
"selector": ".author a",
22+
"field": "year",
23+
"selector": "span.year",
3124
"attr": "text",
3225
"connector": "",
33-
"default": "<speaker>"
26+
"default": "<year>"
3427
}
3528
]
3629
}

tests/test_generate.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ def test_css_scraper_generate():
5151
with open(os.path.join(os.getcwd(), 'project2.py'), 'r') as f:
5252
program = f.read()
5353
assert_in("from scrapple.selectors.css import CssSelector", program)
54-
assert_in('page0 = CssSelector("http://pyvideo.org/events/boston-python-meetup.html")', program)
54+
assert_in('page0 = CssSelector("https://www.basketball-reference.com/teams/")', program)
5555

5656

5757
def test_nonexistent_project():

tests/test_run.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,7 @@ def test_run_css_crawler():
6262
rc.execute_command()
6363
with open(os.path.join(os.getcwd(), 'result2.json'), 'r') as f:
6464
result = json.load(f)
65-
with open(os.path.join(os.getcwd(), 'expected_result2.json'), 'r') as f:
65+
with open(os.path.join(os.getcwd(), 'expected_result2_20180428.json'), 'r') as f:
6666
expected_result = json.load(f)
6767
assert_dict_equal(result, expected_result)
6868

0 commit comments

Comments
 (0)