Skip to content

Commit

Permalink
Fix minor issues and update tests
Browse files (browse the repository at this point in the history)
  • Loading branch information
AlexMathew committed Apr 25, 2018
1 parent 7a53641 commit 46f88b9
Show file tree
Hide file tree
Showing 10 changed files with 136 additions and 55 deletions.
2 changes: 1 addition & 1 deletion scrapple/commands/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def run(self):
import json
with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.json'), \
'w') as f:
json.dump(results, f, indent=3)
json.dump(results, f, indent=4)
elif self.args['--output_type'] == 'csv':
import csv
with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.csv'), \
Expand Down
4 changes: 2 additions & 2 deletions scrapple/selectors/css.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ def __init__(self, url):
super(CssSelector, self).__init__(url)


def get_selected_tag(self, selector='', get_one=False, *args, **kwargs):
def get_tree_tag(self, selector='', get_one=False, *args, **kwargs):
sel = cssselect.CSSSelector(selector)
tags = sel(self.tree)
if get_one:
return tags[0]
return tag
return tags
2 changes: 1 addition & 1 deletion scrapple/selectors/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def extract_content(self, selector='', attr='', default='', connector='', *args,
content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
content = content.replace("\n", " ").strip()
else:
tag = self.get_tree_tag(selector=selector)
tag = self.get_tree_tag(selector=selector, get_one=True)
content = tag.get(attr)
if attr in ["href", "src"]:
content = urljoin(self.url, content)
Expand Down
1 change: 0 additions & 1 deletion tests/expected_result2.json

This file was deleted.

95 changes: 95 additions & 0 deletions tests/expected_result2_20180428.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
{
"project": "project2",
"data": [
{
"team": "Atlanta Hawks"
},
{
"team": "Boston Celtics"
},
{
"team": "Brooklyn Nets"
},
{
"team": "Charlotte Hornets"
},
{
"team": "Chicago Bulls"
},
{
"team": "Cleveland Cavaliers"
},
{
"team": "Dallas Mavericks"
},
{
"team": "Denver Nuggets"
},
{
"team": "Detroit Pistons"
},
{
"team": "Golden State Warriors"
},
{
"team": "Houston Rockets"
},
{
"team": "Indiana Pacers"
},
{
"team": "Los Angeles Clippers"
},
{
"team": "Los Angeles Lakers"
},
{
"team": "Memphis Grizzlies"
},
{
"team": "Miami Heat"
},
{
"team": "Milwaukee Bucks"
},
{
"team": "Minnesota Timberwolves"
},
{
"team": "New Orleans Pelicans"
},
{
"team": "New York Knicks"
},
{
"team": "Oklahoma City Thunder"
},
{
"team": "Orlando Magic"
},
{
"team": "Philadelphia 76ers"
},
{
"team": "Phoenix Suns"
},
{
"team": "Portland Trail Blazers"
},
{
"team": "Sacramento Kings"
},
{
"team": "San Antonio Spurs"
},
{
"team": "Toronto Raptors"
},
{
"team": "Utah Jazz"
},
{
"team": "Washington Wizards"
}
]
}
2 changes: 1 addition & 1 deletion tests/expected_result3.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"project": "project3", "data": [{"unknown": "<unknown>", "speaker": "Kenneth Reitz", "talk_url": "http://pyvideo.org/pycon-us-2013/python-for-humans-1.html", "title": "Python for Humans"}]}
{"project": "project3", "data": [{"show_url": "https://trakt.tv/shows/mr-robot", "year": "2015", "unknown": "<unknown>"}]}
56 changes: 25 additions & 31 deletions tests/project2.json
Original file line number Diff line number Diff line change
@@ -1,32 +1,26 @@
{
"project_name": "new_project2",
"selector_type": "css",
"scraping": {
"url": "http://pyvideo.org/events/boston-python-meetup.html",
"data": [
{
"field": "event",
"selector": "h2",
"attr": "text",
"connector": "",
"default": "<event>"
}
],
"next": [
{
"follow_link": "div.content-list div.row h4.entry-title a",
"scraping": {
"data": [
{
"field": "talk",
"selector": "h2",
"attr": "text",
"connector": "",
"default": "<talk>"
}
]
}
}
]
}
}
"project_name": "project2_nba",
"selector_type": "css",
"scraping": {
"url": "https://www.basketball-reference.com/teams/",
"data": [

],
"next": [
{
"follow_link": "#teams_active th > a",
"scraping": {
"data": [
{
"field": "team",
"selector": "div#info h1",
"attr": "text",
"default": "<no_team>",
"connector": ""
}
]
}
}
]
}
}
25 changes: 9 additions & 16 deletions tests/project3.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,28 @@
"project_name": "project3",
"selector_type": "css",
"scraping": {
"url": "http://pyvideo.org/pycon-us-2013/python-for-humans-1.html",
"url": "https://trakt.tv/shows/mr-robot",
"data": [
{
"field": "unknown",
"selector": "h1",
"attr": "text",
"connector": "",
"default": "<unknown>"
},
{
"field": "talk_url",
"field": "show_url",
"selector": "url",
"attr": "",
"connector": "",
"default": "<talk_url>"
"default": "<url>"
},
{
"field": "title",
"selector": "h2",
"field": "unknown",
"selector": "h6",
"attr": "text",
"connector": "",
"default": "<title>"
"default": "<unknown>"
},
{
"field": "speaker",
"selector": ".author a",
"field": "year",
"selector": "span.year",
"attr": "text",
"connector": "",
"default": "<speaker>"
"default": "<year>"
}
]
}
Expand Down
2 changes: 1 addition & 1 deletion tests/test_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def test_css_scraper_generate():
with open(os.path.join(os.getcwd(), 'project2.py'), 'r') as f:
program = f.read()
assert_in("from scrapple.selectors.css import CssSelector", program)
assert_in('page0 = CssSelector("http://pyvideo.org/events/boston-python-meetup.html")', program)
assert_in('page0 = CssSelector("https://www.basketball-reference.com/teams/")', program)


def test_nonexistent_project():
Expand Down
2 changes: 1 addition & 1 deletion tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def test_run_css_crawler():
rc.execute_command()
with open(os.path.join(os.getcwd(), 'result2.json'), 'r') as f:
result = json.load(f)
with open(os.path.join(os.getcwd(), 'expected_result2.json'), 'r') as f:
with open(os.path.join(os.getcwd(), 'expected_result2_20180428.json'), 'r') as f:
expected_result = json.load(f)
assert_dict_equal(result, expected_result)

Expand Down

0 comments on commit 46f88b9

Please sign in to comment.