From 4aeaf9d5884812b31c272a9b3b52b0cf1d2faa38 Mon Sep 17 00:00:00 2001
From: Alex Mathew
Date: Fri, 20 Apr 2018 10:55:19 +0530
Subject: [PATCH 1/6] Remove uses of eval (how did I not know that was bad?)

---
 scrapple/commands/run.py          | 12 +++++++-----
 scrapple/utils/dynamicdispatch.py |  8 +++++++-
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/scrapple/commands/run.py b/scrapple/commands/run.py
index b938561..50a54e1 100644
--- a/scrapple/commands/run.py
+++ b/scrapple/commands/run.py
@@ -11,7 +11,8 @@
 from colorama import Back, Fore, init
 
 from scrapple.commands import command
-from scrapple.selectors import css, xpath
+from scrapple.selectors.css import CssSelector
+from scrapple.selectors.xpath import XpathSelector
 from scrapple.utils.config import (InvalidConfigException, extract_fieldnames,
                                    traverse_next, validate_config)
@@ -74,10 +75,11 @@ def execute_command(self):
 
     def run(self):
-        selectorClass = getattr(
-            eval(self.config['selector_type']),
-            self.config['selector_type'].title() + 'Selector'
-        )
+        selectorClassMapping = {
+            'xpath': XpathSelector,
+            'css': CssSelector
+        }
+        selectorClass = selectorClassMapping.get(self.config['selector_type'].lower())
         results = dict()
         results['project'] = self.args['<projectname>']
         results['data'] = list()
diff --git a/scrapple/utils/dynamicdispatch.py b/scrapple/utils/dynamicdispatch.py
index d68bc22..fdb67a6 100644
--- a/scrapple/utils/dynamicdispatch.py
+++ b/scrapple/utils/dynamicdispatch.py
@@ -13,5 +13,11 @@ def get_command_class(command):
     :return: The command class corresponding to the selected command
     """
     from scrapple.commands import genconfig, generate, run, web
-    cmdClass = getattr(eval(command), command.title() + 'Command')
+    commandMapping = {
+        'genconfig': genconfig,
+        'generate': generate,
+        'run': run,
+        'web': web
+    }
+    cmdClass = getattr(commandMapping.get(command), command.title() + 'Command')
     return cmdClass
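A note on the pattern both hunks adopt: `eval(self.config['selector_type'])` executes whatever string happens to be in the project config, so a malformed or hostile config file could run arbitrary code, while a lookup table confines the input to a fixed set of known classes. One caveat is that `dict.get()` returns `None` for an unrecognised key, which only fails later with an opaque `AttributeError`. A minimal sketch of a stricter variant — the class names below are illustrative stand-ins, not Scrapple's real imports:

```python
class XpathSelector(object):
    """Stand-in for scrapple.selectors.xpath.XpathSelector."""

class CssSelector(object):
    """Stand-in for scrapple.selectors.css.CssSelector."""

SELECTOR_CLASSES = {
    'xpath': XpathSelector,
    'css': CssSelector,
}

def get_selector_class(selector_type):
    # Plain indexing makes an unsupported type fail loudly at the lookup
    # site, instead of returning None and crashing somewhere downstream.
    try:
        return SELECTOR_CLASSES[selector_type.lower()]
    except KeyError:
        raise ValueError('Unknown selector type: %r' % selector_type)

print(get_selector_class('XPath'))  # -> <class '__main__.XpathSelector'>
```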
From bfee6f7b58fdf56012e27dde22bd83273ab647cd Mon Sep 17 00:00:00 2001
From: Alex Mathew
Date: Fri, 20 Apr 2018 11:12:13 +0530
Subject: [PATCH 2/6] Refactor extract_content and extract_links in the selector classes

---
 scrapple/selectors/css.py      | 89 +++++-----------------------
 scrapple/selectors/selector.py | 75 ++++++++++++++++++++++++--
 scrapple/selectors/xpath.py    | 81 ++++-----------------------
 3 files changed, 91 insertions(+), 154 deletions(-)

diff --git a/scrapple/selectors/css.py b/scrapple/selectors/css.py
index 350cdce..e2b2740 100644
--- a/scrapple/selectors/css.py
+++ b/scrapple/selectors/css.py
@@ -11,11 +11,6 @@
 from scrapple.selectors.selector import Selector
 from scrapple.utils.text import make_ascii
 
-try:
-    from urlparse import urljoin
-except ImportError:
-    from urllib.parse import urljoin
-
 
 class CssSelector(Selector):
     """
@@ -23,6 +18,9 @@ class CssSelector(Selector):
 
     """
 
+    __selector_type__ = 'CSS'
+
+
     def __init__(self, url):
         """
         The ``Selector`` class acts as the super class for this class.
@@ -31,81 +29,16 @@ def __init__(self, url):
         super(CssSelector, self).__init__(url)
 
 
-    def extract_content(self, *args, **kwargs):
-        """
-        Method for performing the content extraction for the given CSS selector.
-
-        The cssselect library is used to handle CSS selector expressions. \
-        XPath expressions have a higher speed of execution, so the given CSS selector \
-        expression is translated into the corresponding XPath expression, by the \
-        ``cssselect.CSSSelector`` class. This selector can be used to extract content \
-        from the element tree corresponding to the fetched web page.
-
-        If the selector is "url", the URL of the current web page is returned.
-        Otherwise, the selector expression is used to extract content. The particular \
-        attribute to be extracted ("text", "href", etc.) is specified in the method \
-        arguments, and this is used to extract the required content. If the content \
-        extracted is a link (from an attr value of "href" or "src"), the URL is parsed \
-        to convert the relative path into an absolute path.
+    def get_selected_tag(self, selector='', *args, **kwargs):
+        sel = cssselect.CSSSelector(selector)
+        tag = sel(self.tree)[0]
+        return tag
 
-        If the selector does not fetch any content, the default value is returned. \
-        If no default value is specified, an exception is raised.
 
-        :param selector: The CSS selector expression
-        :param attr: The attribute to be extracted from the selected tag
-        :param default: The default value to be used if the selector does not return any data
-        :return: The extracted content
-
-        """
-        try:
-            selector, attr, default, connector = [kwargs.get(x, '') for x in ['selector', 'attr', 'default', 'connector']]
-            if selector == "url":
-                return self.url
-            sel = cssselect.CSSSelector(selector)
-            if attr == "text":
-                tag = sel(self.tree)[0]
-                content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
-                content = content.replace("\n", " ").strip()
-            else:
-                content = sel(self.tree)[0].get(attr)
-                if attr in ["href", "src"]:
-                    content = urljoin(self.url, content)
-            return content
-        except IndexError:
-            if default is not "":
-                return default
-            raise Exception("There is no content for the selector " + selector)
-
-
-    def extract_links(self, *args, **kwargs):
-        """
-        Method for performing the link extraction for the crawler implementation.
-
-        As in the extract_content method, the cssselect library is used to translate \
-        the CSS selector expression into an XPath expression.
-
-        The selector passed as the argument is a selector to point to the anchor tags \
-        that the crawler should pass through. A list of links is obtained, and the links \
-        are iterated through. The relative paths are converted into absolute paths and \
-        a ``CssSelector`` object is created with the URL of the next page as the argument \
-        and this created object is yielded.
-
-        The extract_links method basically generates ``CssSelector`` objects for all of \
-        the links to be crawled through.
-
-        :param selector: The selector for the anchor tags to be crawled through
-        :return: A ``CssSelector`` object for every page to be crawled through
-
-        """
-        try:
-            selector = kwargs.get('selector', '')
-            sel = cssselect.CSSSelector(selector)
-            links = sel(self.tree)
-            for link in links:
-                next_url = urljoin(self.url, link.get('href'))
-                yield CssSelector(next_url)
-        except Exception:
-            raise Exception("Invalid CSS selector " + selector)
+    def get_links_for_crawling(self, selector='', *args, **kwargs):
+        sel = cssselect.CSSSelector(selector)
+        links = sel(self.tree)
+        return links
 
 
     def extract_rows(self, *args, **kwargs):
diff --git a/scrapple/selectors/selector.py b/scrapple/selectors/selector.py
index bf4c569..27f25ca 100644
--- a/scrapple/selectors/selector.py
+++ b/scrapple/selectors/selector.py
@@ -11,6 +11,12 @@
 import requests
 from lxml import etree
 
+try:
+    from urlparse import urljoin
+except ImportError:
+    from urllib.parse import urljoin
+
+
 requests.warnings.filterwarnings('ignore')
 
 
@@ -19,7 +25,9 @@ class Selector(object):
 
     This class defines the basic ``Selector`` object.
 
     """
 
+    __selector_type__ = ''
+
     def __init__(self, url):
         """
         The URL of the web page to be loaded is validated - ensuring the schema has \
@@ -63,21 +71,80 @@ def __init__(self, url):
         raise Exception('Ensure that you are connected to the Internet and that the page exists')
 
 
-    def extract_content(self, *args, **kwargs):
+    def extract_content(self, selector='', attr='', default='', connector='', *args, **kwargs):
         """
         Method for performing the content extraction for the particular selector type. \
-        A detailed description is provided in the derived classes.
+
+        If the selector is "url", the URL of the current web page is returned.
+        Otherwise, the selector expression is used to extract content. The particular \
+        attribute to be extracted ("text", "href", etc.) is specified in the method \
+        arguments, and this is used to extract the required content. If the content \
+        extracted is a link (from an attr value of "href" or "src"), the URL is parsed \
+        to convert the relative path into an absolute path.
+
+        If the selector does not fetch any content, the default value is returned. \
+        If no default value is specified, an exception is raised.
+
+        :param selector: The XPath expression
+        :param attr: The attribute to be extracted from the selected tag
+        :param default: The default value to be used if the selector does not return any data
+        :param connector: String connector for list of data returned for a particular selector
+        :return: The extracted content
 
         """
+        try:
+            if selector == "url":
+                return self.url
+            if attr == "text":
+                tag = self.get_selected_tag(selector=selector)
+                content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
+                content = content.replace("\n", " ").strip()
+            else:
+                tag = self.get_selected_tag(selector=selector)
+                content = tag.get(attr)
+                if attr in ["href", "src"]:
+                    content = urljoin(self.url, content)
+            return content
+        except IndexError:
+            if default is not "":
+                return default
+            raise Exception("There is no content for the %s selector - %s" % (self.__selector_type__, selector))
+        except XPathError:
+            raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
+
+
+    def get_selected_tag(self, selector='', *args, **kwargs):
         raise NotImplementedError
 
 
-    def extract_links(self, *args, **kwargs):
+    def extract_links(self, selector='', *args, **kwargs):
         """
         Method for performing the link extraction for the crawler. \
-        A detailed description is provided in the derived classes.
+
+        The selector passed as the argument is a selector to point to the anchor tags \
+        that the crawler should pass through. A list of links is obtained, and the links \
+        are iterated through. The relative paths are converted into absolute paths and \
+        a ``XpathSelector``/``CssSelector`` object (as is the case) is created with the URL of the next page as the argument \
+        and this created object is yielded.
+
+        The extract_links method basically generates ``XpathSelector``/``CssSelector`` objects for all of \
+        the links to be crawled through.
+
+        :param selector: The selector for the anchor tags to be crawled through
+        :return: A ``XpathSelector``/``CssSelector`` object for every page to be crawled through
 
         """
+        try:
+            links = self.get_links_for_crawling(selector=selector)
+            for link in links:
+                next_url = urljoin(self.url, link.get('href'))
+                yield type(self)(next_url)
+        except XPathError:
+            raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
+        except Exception:
+            raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
+
+
+    def get_links_for_crawling(self, selector='', *args, **kwargs):
         raise NotImplementedError
diff --git a/scrapple/selectors/xpath.py b/scrapple/selectors/xpath.py
index e92b31c..cae3dea 100644
--- a/scrapple/selectors/xpath.py
+++ b/scrapple/selectors/xpath.py
@@ -11,11 +11,6 @@
 from scrapple.selectors.selector import Selector
 from scrapple.utils.text import make_ascii
 
-try:
-    from urlparse import urljoin
-except ImportError:
-    from urllib.parse import urljoin
-
 
 class XpathSelector(Selector):
     """
@@ -23,6 +18,9 @@ class XpathSelector(Selector):
 
     """
 
+    __selector_type__ = 'XPath'
+
+
    def __init__(self, url):
         """
         The ``Selector`` class acts as the super class for this class.
@@ -31,75 +29,14 @@ def __init__(self, url):
         super(XpathSelector, self).__init__(url)
 
 
-    def extract_content(self, *args, **kwargs):
-        """
-        Method for performing the content extraction for the given XPath expression.
-
-        The XPath selector expression can be used to extract content \
-        from the element tree corresponding to the fetched web page.
-
-        If the selector is "url", the URL of the current web page is returned.
-        Otherwise, the selector expression is used to extract content. The particular \
-        attribute to be extracted ("text", "href", etc.) is specified in the method \
-        arguments, and this is used to extract the required content. If the content \
-        extracted is a link (from an attr value of "href" or "src"), the URL is parsed \
-        to convert the relative path into an absolute path.
-
-        If the selector does not fetch any content, the default value is returned. \
-        If no default value is specified, an exception is raised.
-
-        :param selector: The XPath expression
-        :param attr: The attribute to be extracted from the selected tag
-        :param default: The default value to be used if the selector does not return any data
-        :return: The extracted content
-
-        """
-        try:
-            selector, attr, default, connector = [kwargs.get(x, '') for x in ['selector', 'attr', 'default', 'connector']]
-            if selector == "url":
-                return self.url
-            if attr == "text":
-                tag = self.tree.xpath(selector)[0]
-                content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
-                content = content.replace("\n", " ").strip()
-            else:
-                content = self.tree.xpath(selector)[0].get(attr)
-                if attr in ["href", "src"]:
-                    content = urljoin(self.url, content)
-            return content
-        except IndexError:
-            if default is not "":
-                return default
-            raise Exception("There is no content for the selector " + selector)
-        except XPathError:
-            raise Exception("Invalid XPath selector " + selector)
-
-
-    def extract_links(self, *args, **kwargs):
-        """
-        Method for performing the link extraction for the crawler.
-
-        The selector passed as the argument is a selector to point to the anchor tags \
-        that the crawler should pass through. A list of links is obtained, and the links \
-        are iterated through. The relative paths are converted into absolute paths and \
-        a ``XpathSelector`` object is created with the URL of the next page as the argument \
-        and this created object is yielded.
+    def get_selected_tag(self, selector='', *args, **kwargs):
+        tag = self.tree.xpath(selector)[0]
+        return tag
 
-        The extract_links method basically generates ``XpathSelector`` objects for all of \
-        the links to be crawled through.
 
-        :param selector: The selector for the anchor tags to be crawled through
-        :return: A ``XpathSelector`` object for every page to be crawled through
-
-        """
-        try:
-            selector = kwargs.get('selector', '')
-            links = self.tree.xpath(selector)
-            for link in links:
-                next_url = urljoin(self.url, link.get('href'))
-                yield XpathSelector(next_url)
-        except XPathError:
-            raise Exception("Invalid XPath selector " + selector)
+    def get_links_for_crawling(self, selector='', *args, **kwargs):
+        links = self.tree.xpath(selector)
+        return links
 
 
     def extract_rows(self, *args, **kwargs):
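The refactor above is a textbook template-method split: the base ``Selector`` now owns the shared extraction flow (the "url" special case, text joining, relative-to-absolute link resolution, defaults), and each subclass only supplies the tree lookup. A self-contained sketch of the shape, with a toy dictionary-backed subclass standing in for the lxml-backed ``XpathSelector``/``CssSelector``:

```python
class Selector(object):
    """Base class: owns the extraction flow, delegates the tree lookup."""

    def extract_content(self, selector, default=''):
        try:
            return self.get_selected_tag(selector).strip()
        except IndexError:
            if default != '':
                return default
            raise Exception('There is no content for the selector ' + selector)

    def get_selected_tag(self, selector):
        raise NotImplementedError  # subclass hook


class DictSelector(Selector):
    """Toy subclass; the real ones query an lxml element tree."""

    def __init__(self, data):
        self.data = data

    def get_selected_tag(self, selector):
        # Mirrors the [0] indexing in the real hooks: an empty result
        # raises IndexError, which the base class turns into a default.
        return self.data.get(selector, [])[0]


s = DictSelector({'h1': ['  Heading  ']})
print(s.extract_content('h1'))                 # -> 'Heading'
print(s.extract_content('h2', default='n/a'))  # -> 'n/a'
```

One consequence worth noting: `yield type(self)(next_url)` in the base class replaces the hard-coded `yield CssSelector(next_url)` / `yield XpathSelector(next_url)`, so the crawler automatically constructs objects of whichever subclass it started from.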
From 802e01cc032b38cff0faa08d0ede7ebf2be3f7c3 Mon Sep 17 00:00:00 2001
From: Alex Mathew
Date: Fri, 20 Apr 2018 11:13:23 +0530
Subject: [PATCH 3/6] Add base selector class in docs

---
 docs/implementation/selectors.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/implementation/selectors.rst b/docs/implementation/selectors.rst
index ac90e38..45be37f 100644
--- a/docs/implementation/selectors.rst
+++ b/docs/implementation/selectors.rst
@@ -15,6 +15,11 @@
 These selector types are implemented through the ``XpathSelector`` and ``CssSelector`` classes.
 
 In the super class, the URL of the web page to be loaded is validated - ensuring the schema has been specified, and that the URL is valid. A HTTP GET request is made to load the web page, and the HTML content of this fetched web page is used to generate the :ref:`element tree `. This is the element tree that will be parsed to extract the necessary content.
 
+.. automodule:: scrapple.selectors.selector
+
+.. autoclass:: scrapple.selectors.selector.Selector
+   :members:
+
 .. automodule:: scrapple.selectors.xpath
 
 .. autoclass:: scrapple.selectors.xpath.XpathSelector

From f651bcf1fec154ee7cd591b632c2fc37f80621b8 Mon Sep 17 00:00:00 2001
From: Alex Mathew
Date: Fri, 20 Apr 2018 11:20:44 +0530
Subject: [PATCH 4/6] Fix xpatherror import

---
 scrapple/selectors/selector.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scrapple/selectors/selector.py b/scrapple/selectors/selector.py
index 27f25ca..bd01b9b 100644
--- a/scrapple/selectors/selector.py
+++ b/scrapple/selectors/selector.py
@@ -10,6 +10,7 @@
 
 import requests
 from lxml import etree
+from lxml.etree import XPathError
 
 try:
     from urlparse import urljoin
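For context on this one-line fix: lxml signals a malformed expression by raising a subclass of ``lxml.etree.XPathError``, and the ``except XPathError`` clauses added to the base class in PATCH 2 referenced the name without importing it, so any bad selector would have surfaced as a ``NameError`` instead. A quick demonstration of the exception class (assumes lxml is installed):

```python
from lxml import html
from lxml.etree import XPathError

tree = html.fromstring('<div><p>hello</p></div>')

try:
    tree.xpath('//p[')  # malformed XPath expression
except XPathError as err:
    # lxml raises XPathEvalError, a subclass of XPathError.
    print('caught %s: %s' % (type(err).__name__, err))

print(tree.xpath('//p/text()'))  # -> ['hello']
```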
From 7a53641ecd0cd39c8fecbfd1de644a58598efc68 Mon Sep 17 00:00:00 2001
From: Alex Mathew
Date: Fri, 20 Apr 2018 14:43:12 +0530
Subject: [PATCH 5/6] Refactor extract_tabular and related methods in the selector classes

---
 scrapple/selectors/css.py      | 141 +-------------------------------
 scrapple/selectors/selector.py | 143 +++++++++++++++++++++++++++++----
 scrapple/selectors/xpath.py    | 139 ++------------------------------
 3 files changed, 137 insertions(+), 286 deletions(-)

diff --git a/scrapple/selectors/css.py b/scrapple/selectors/css.py
index e2b2740..12155d1 100644
--- a/scrapple/selectors/css.py
+++ b/scrapple/selectors/css.py
@@ -29,142 +29,9 @@ def __init__(self, url):
         super(CssSelector, self).__init__(url)
 
 
-    def get_selected_tag(self, selector='', *args, **kwargs):
+    def get_selected_tag(self, selector='', get_one=False, *args, **kwargs):
         sel = cssselect.CSSSelector(selector)
-        tag = sel(self.tree)[0]
+        tags = sel(self.tree)
+        if get_one:
+            return tags[0]
         return tag
-
-
-    def get_links_for_crawling(self, selector='', *args, **kwargs):
-        sel = cssselect.CSSSelector(selector)
-        links = sel(self.tree)
-        return links
-
-
-    def extract_rows(self, *args, **kwargs):
-        """
-        Row data extraction for extract_tabular
-        """
-        result_list = []
-        result = kwargs.get('result', {})
-
-        try:
-            sel = cssselect.CSSSelector(kwargs.get('selector', ''))
-            values = sel(self.tree)
-            if len(kwargs.get('table_headers', [])) >= len(values):
-                from itertools import izip_longest
-                pairs = izip_longest(kwargs.get('table_headers', []), values, fillvalue=kwargs.get('default', ''))
-            else:
-                from itertools import izip
-                pairs = izip(kwargs.get('table_headers', []), values)
-            for head, val in pairs:
-                if kwargs.get('verbosity', 0) > 1:
-                    print("\nExtracting", head, "attribute", sep=' ', end='')
-                if kwargs.get('attr', 'text') == "text":
-                    try:
-                        content = kwargs.get('connector', '').join([make_ascii(x).strip() for x in val.itertext()])
-                    except Exception:
-                        content = kwargs.get('default', '')
-                    content = content.replace("\n", " ").strip()
-                else:
-                    content = val.get(kwargs.get('attr', 'text'))
-                    if kwargs.get('attr', 'text') in ["href", "src"]:
-                        content = urljoin(self.url, content)
-                result[head] = content
-            result_list.append(result)
-        except TypeError:
-            raise Exception("Selector expression string to be provided. Got " + kwargs.get('selector', ''))
-
-        return result_list
-
-
-    def extract_columns(self, *args, **kwargs):
-        """
-        Column data extraction for extract_tabular
-        """
-        result_list = []
-        result = kwargs.get('result', {})
-
-        try:
-            if type(kwargs.get('selector', '')) in [str, unicode]:
-                selectors = [kwargs.get('selector', '')]
-            elif type(kwargs.get('selector', '')) == list:
-                selectors = kwargs.get('selector', '')
-            else:
-                raise Exception("Use a list of selector expressions for the various columns")
-            from itertools import izip, count
-            pairs = izip(kwargs.get('table_headers', []), selectors)
-            columns = {}
-            for head, selector in pairs:
-                sel = cssselect.CSSSelector(selector)
-                columns[head] = sel(self.tree)
-            try:
-                for i in count(start=0):
-                    r = result.copy()
-                    for head in columns.keys():
-                        if kwargs.get('verbosity', 0) > 1:
-                            print("\nExtracting", head, "attribute", sep=' ', end='')
-                        col = columns[head][i]
-                        if kwargs.get('attr', 'text') == "text":
-                            try:
-                                content = kwargs.get('connector', '').join([make_ascii(x).strip() for x in col.itertext()])
-                            except Exception:
-                                content = kwargs.get('default', '')
-                            content = content.replace("\n", " ").strip()
-                        else:
-                            content = col.get(kwargs.get('attr', 'text'))
-                            if kwargs.get('attr', 'text') in ["href", "src"]:
-                                content = urljoin(self.url, content)
-                        r[head] = content
-                    result_list.append(r)
-            except IndexError:
-                pass
-        except TypeError:
-            raise Exception("Selector expression string to be provided. Got " + selector)
-
-        return result_list
-
-
-    def extract_tabular(self, *args, **kwargs):
-        """
-        Method for performing the extraction of tabular data.
-
-        As in the extract_content method, the cssselect library is used to translate \
-        the CSS selector expression into an XPath expression.
-
-        :param result: A dictionary containing the extracted data so far
-        :param table_type: Can be "rows" or "columns". This determines the type of table to be extracted. \
-        A row extraction is when there is a single row to be extracted and mapped to a set of headers. \
-        A column extraction is when a set of rows have to be extracted, giving a list of header-value mappings.
-        :param header: The headers to be used for the table. This can be a list of headers, or a selector that gives the list of headers
-        :param prefix: A prefix to be added to each header
-        :param suffix: A suffix to be added to each header
-        :param selector: For row extraction, this is a selector that gives the row to be extracted. \
-        For column extraction, this is a list of selectors for each column.
-        :param attr: The attribute to be extracted from the selected tag
-        :param default: The default value to be used if the selector does not return any data
-        :param verbosity: The verbosity set as the argument for scrapple run
-        :return: A 2-tuple containing the list of all the column headers extracted and the list of \
-        dictionaries which contain (header, content) pairs
-        """
-        result = kwargs.get('result', {})
-        result_list = []
-        if type(kwargs.get('header', [])) in [str, unicode]:
-            try:
-                sel = cssselect.CSSSelector(kwargs.get('header', []))
-                header_list = sel(self.tree)
-                table_headers = [kwargs.get('prefix', '') + h.text + kwargs.get('suffix', '') for h in header_list]
-                if len(table_headers) == 0:
-                    raise Exception("Invalid CSS selector " + kwargs.get('header', []))
-            except TypeError:
-                raise Exception("Selector expression string to be provided. Got " + kwargs.get('header', []))
-        else:
-            table_headers = [kwargs.get('prefix', '') + h + kwargs.get('suffix', '') for h in kwargs.get('header', [])]
-        if kwargs.get('table_type', 'rows') not in ["rows", "columns"]:
-            raise Exception("Specify 'rows' or 'columns' in table_type")
-        kwargs.update({'table_headers': table_headers})
-        if kwargs.get('table_type', 'rows') == "rows":
-            result_list = self.extract_rows(**kwargs)
-        else:
-            result_list = self.extract_columns(**kwargs)
-        return table_headers, result_list
diff --git a/scrapple/selectors/selector.py b/scrapple/selectors/selector.py
index bd01b9b..85a388a 100644
--- a/scrapple/selectors/selector.py
+++ b/scrapple/selectors/selector.py
@@ -8,6 +8,8 @@
 
 import random
 
+from scrapple.utils.text import make_ascii
+
 import requests
 from lxml import etree
 from lxml.etree import XPathError
@@ -72,6 +74,10 @@ def __init__(self, url):
         raise Exception('Ensure that you are connected to the Internet and that the page exists')
 
 
+    def get_tree_tag(self, selector='', get_one=False, *args, **kwargs):
+        raise NotImplementedError
+
+
     def extract_content(self, selector='', attr='', default='', connector='', *args, **kwargs):
         """
         Method for performing the content extraction for the particular selector type. \
@@ -93,14 +99,14 @@ def extract_content(self, selector='', attr='', default='', connector='', *args,
         :return: The extracted content
 
         """
         try:
-            if selector == "url":
+            if selector.lower() == "url":
                 return self.url
-            if attr == "text":
-                tag = self.get_selected_tag(selector=selector)
+            if attr.lower() == "text":
+                tag = self.get_tree_tag(selector=selector, get_one=True)
                 content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
                 content = content.replace("\n", " ").strip()
             else:
-                tag = self.get_selected_tag(selector=selector)
+                tag = self.get_tree_tag(selector=selector)
                 content = tag.get(attr)
                 if attr in ["href", "src"]:
                     content = urljoin(self.url, content)
             return content
         except IndexError:
             if default is not "":
                 return default
             raise Exception("There is no content for the %s selector - %s" % (self.__selector_type__, selector))
         except XPathError:
             raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
 
 
-    def get_selected_tag(self, selector='', *args, **kwargs):
-        raise NotImplementedError
-
-
     def extract_links(self, selector='', *args, **kwargs):
         """
         Method for performing the link extraction for the crawler. \
         ...
 
         """
         try:
-            links = self.get_links_for_crawling(selector=selector)
+            links = self.get_tree_tag(selector=selector)
             for link in links:
                 next_url = urljoin(self.url, link.get('href'))
                 yield type(self)(next_url)
         except XPathError:
             raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
         except Exception:
             raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
 
 
-    def get_links_for_crawling(self, selector='', *args, **kwargs):
-        raise NotImplementedError
+    def extract_tabular(self, header='', prefix='', suffix='', table_type='', *args, **kwargs):
+        """
+        Method for performing the tabular data extraction. \
+
+        :param result: A dictionary containing the extracted data so far
+        :param table_type: Can be "rows" or "columns". This determines the type of table to be extracted. \
+        A row extraction is when there is a single row to be extracted and mapped to a set of headers. \
+        A column extraction is when a set of rows have to be extracted, giving a list of header-value mappings.
+        :param header: The headers to be used for the table. This can be a list of headers, or a selector that gives the list of headers
+        :param prefix: A prefix to be added to each header
+        :param suffix: A suffix to be added to each header
+        :param selector: For row extraction, this is a selector that gives the row to be extracted. \
+        For column extraction, this is a list of selectors for each column.
+        :param attr: The attribute to be extracted from the selected tag
+        :param default: The default value to be used if the selector does not return any data
+        :param verbosity: The verbosity set as the argument for scrapple run
+        :return: A 2-tuple containing the list of all the column headers extracted and the list of \
+        dictionaries which contain (header, content) pairs
+        """
+        if type(header) in [str, unicode]:
+            try:
+                header_list = self.get_tree_tag(header)
+                table_headers = [prefix + h.text + suffix for h in header_list]
+            except XPathError:
+                raise Exception("Invalid %s selector for table header - %s" % (self.__selector_type__, header))
+            except Exception:
+                raise Exception("Invalid %s selector for table header - %s" % (self.__selector_type__, header))
+        else:
+            table_headers = [prefix + h + suffix for h in header]
+        if len(table_headers) == 0:
+            raise Exception("Invalid %s selector for table header - %s" % (self.__selector_type__, header))
+        if table_type not in ["rows", "columns"]:
+            raise Exception("Specify 'rows' or 'columns' in table_type")
+        if table_type == "rows":
+            result_list = self.extract_rows(table_headers=table_headers, *args, **kwargs)
+        else:
+            result_list = self.extract_columns(table_headers=table_headers, *args, **kwargs)
+        return table_headers, result_list
 
 
-    def extract_tabular(self, *args, **kwargs):
+    def extract_rows(self, result={}, selector='', table_headers=[], attr='', connector='', default='', verbosity=0, *args, **kwargs):
         """
-        Method for performing the tabular data extraction. \
-        A detailed description is provided in the derived classes.
+        Row data extraction for extract_tabular
         """
-        raise NotImplementedError
+        result_list = []
+        try:
+            values = self.get_tree_tag(selector)
+            if len(table_headers) >= len(values):
+                from itertools import izip_longest
+                pairs = izip_longest(table_headers, values, fillvalue=default)
+            else:
+                from itertools import izip
+                pairs = izip(table_headers, values)
+            for head, val in pairs:
+                if verbosity > 1:
+                    print("\nExtracting", head, "attribute", sep=' ', end='')
+                if attr.lower() == "text":
+                    try:
+                        content = connector.join([make_ascii(x).strip() for x in val.itertext()])
+                    except Exception:
+                        content = default
+                    content = content.replace("\n", " ").strip()
+                else:
+                    content = val.get(attr)
+                    if attr in ["href", "src"]:
+                        content = urljoin(self.url, content)
+                result[head] = content
+            result_list.append(result)
+        except XPathError:
+            raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
+        except TypeError:
+            raise Exception("Selector expression string to be provided. Got " + selector)
+
+        return result_list
+
+
+    def extract_columns(self, result={}, selector='', table_headers=[], attr='', connector='', default='', verbosity=0, *args, **kwargs):
+        """
+        Column data extraction for extract_tabular
+        """
+        result_list = []
+
+        try:
+            if type(selector) in [str, unicode]:
+                selectors = [selector]
+            elif type(selector) == list:
+                selectors = selector[:]
+            else:
+                raise Exception("Use a list of selector expressions for the various columns")
+            from itertools import izip, count
+            pairs = izip(table_headers, selectors)
+            columns = {}
+            for head, selector in pairs:
+                columns[head] = self.get_tree_tag(selector)
+            try:
+                for i in count(start=0):
+                    r = result.copy()
+                    for head in columns.keys():
+                        if verbosity > 1:
+                            print("\nExtracting", head, "attribute", sep=' ', end='')
+                        col = columns[head][i]
+                        if attr == "text":
+                            try:
+                                content = connector.join([make_ascii(x).strip() for x in col.itertext()])
+                            except Exception:
+                                content = default
+                            content = content.replace("\n", " ").strip()
+                        else:
+                            content = col.get(attr)
+                            if attr in ["href", "src"]:
+                                content = urljoin(self.url, content)
+                        r[head] = content
+                    result_list.append(r)
+            except IndexError:
+                pass
+        except XPathError:
+            raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
+        except TypeError:
+            raise Exception("Selector expression string to be provided. Got " + selector)
+
+        return result_list
diff --git a/scrapple/selectors/xpath.py b/scrapple/selectors/xpath.py
index cae3dea..243c196 100644
--- a/scrapple/selectors/xpath.py
+++ b/scrapple/selectors/xpath.py
@@ -6,8 +6,6 @@
 
 from __future__ import print_function
 
-from lxml.etree import XPathError
-
 from scrapple.selectors.selector import Selector
 from scrapple.utils.text import make_ascii
 
@@ -29,75 +27,8 @@ def __init__(self, url):
         super(XpathSelector, self).__init__(url)
 
 
-    def get_selected_tag(self, selector='', *args, **kwargs):
-        tag = self.tree.xpath(selector)[0]
-        return tag
-
-
-    def get_links_for_crawling(self, selector='', *args, **kwargs):
-        links = self.tree.xpath(selector)
-        return links
-
-
-    def extract_rows(self, *args, **kwargs):
-        """
-        Row data extraction for extract_tabular
-        """
-        result_list = []
-        result = kwargs.get('result', {})
-
-        try:
-            values = self.tree.xpath(kwargs.get('selector', ''))
-            if len(kwargs.get('table_headers', [])) >= len(values):
-                from itertools import izip_longest
-                pairs = izip_longest(kwargs.get('table_headers', []), values, fillvalue=kwargs.get('default', ''))
-            else:
-                from itertools import izip
-                pairs = izip(kwargs.get('table_headers', []), values)
-            for head, val in pairs:
-                if kwargs.get('verbosity', 0) > 1:
-                    print("\nExtracting", head, "attribute", sep=' ', end='')
-                if kwargs.get('attr', 'text') == "text":
-                    try:
-                        content = kwargs.get('connector', '').join([make_ascii(x).strip() for x in val.itertext()])
-                    except Exception:
-                        content = kwargs.get('default', '')
-                    content = content.replace("\n", " ").strip()
-                else:
-                    content = val.get(kwargs.get('attr', 'text'))
-                    if kwargs.get('attr', 'text') in ["href", "src"]:
-                        content = urljoin(self.url, content)
-                result[head] = content
-            result_list.append(result)
-        except XPathError:
-            raise Exception("Invalid XPath selector " + kwargs.get('selector', ''))
-        except TypeError:
-            raise Exception("Selector expression string to be provided. Got " + kwargs.get('selector', ''))
-
-        return result_list
-
-
-    def extract_columns(self, *args, **kwargs):
-        """
-        Column data extraction for extract_tabular
-        """
-        result_list = []
-        result = kwargs.get('result', {})
-
-        try:
-            if type(kwargs.get('selector', '')) in [str, unicode]:
-                selectors = [kwargs.get('selector', '')]
-            elif type(kwargs.get('selector', '')) == list:
-                selectors = kwargs.get('selector', '')
-            else:
-                raise Exception("Use a list of selector expressions for the various columns")
-            from itertools import izip, count
-            pairs = izip(kwargs.get('table_headers', []), selectors)
-            columns = {}
-            for head, selector in pairs:
-                columns[head] = self.tree.xpath(selector)
-            try:
-                for i in count(start=0):
-                    r = result.copy()
-                    for head in columns.keys():
-                        if kwargs.get('verbosity', 0) > 1:
-                            print("\nExtracting", head, "attribute", sep=' ', end='')
-                        col = columns[head][i]
-                        if kwargs.get('attr', 'text') == "text":
-                            try:
-                                content = kwargs.get('connector', '').join([make_ascii(x).strip() for x in col.itertext()])
-                            except Exception:
-                                content = kwargs.get('default', '')
-                            content = content.replace("\n", " ").strip()
-                        else:
-                            content = col.get(kwargs.get('attr', 'text'))
-                            if kwargs.get('attr', 'text') in ["href", "src"]:
-                                content = urljoin(self.url, content)
-                        r[head] = content
-                    result_list.append(r)
-            except IndexError:
-                pass
-        except XPathError:
-            raise Exception("Invalid XPath selector " + selector)
-        except TypeError:
-            raise Exception("Selector expression string to be provided. Got " + selector)
-
-        return result_list
-
-
-    def extract_tabular(self, *args, **kwargs):
-        """
-        Method for performing the extraction of tabular data.
-
-        :param result: A dictionary containing the extracted data so far
-        :param table_type: Can be "rows" or "columns". This determines the type of table to be extracted. \
-        A row extraction is when there is a single row to be extracted and mapped to a set of headers. \
-        A column extraction is when a set of rows have to be extracted, giving a list of header-value mappings.
-        :param header: The headers to be used for the table. This can be a list of headers, or a selector that gives the list of headers
-        :param prefix: A prefix to be added to each header
-        :param suffix: A suffix to be added to each header
-        :param selector: For row extraction, this is a selector that gives the row to be extracted. \
-        For column extraction, this is a list of selectors for each column.
-        :param attr: The attribute to be extracted from the selected tag
-        :param default: The default value to be used if the selector does not return any data
-        :param verbosity: The verbosity set as the argument for scrapple run
-        :return: A 2-tuple containing the list of all the column headers extracted and the list of \
-        dictionaries which contain (header, content) pairs
-        """
-        result = kwargs.get('result', {})
-        if type(kwargs.get('header', [])) in [str, unicode]:
-            try:
-                header_list = self.tree.xpath(kwargs.get('header', []))
-                table_headers = [kwargs.get('prefix', '') + h.text + kwargs.get('suffix', '') for h in header_list]
-            except XPathError:
-                raise Exception("Invalid XPath selector " + kwargs.get('header', []))
-        else:
-            table_headers = [kwargs.get('prefix', '') + h + kwargs.get('suffix', '') for h in kwargs.get('header', [])]
-        if kwargs.get('table_type', 'rows') not in ["rows", "columns"]:
-            raise Exception("Specify 'rows' or 'columns' in table_type")
-        kwargs.update({'table_headers': table_headers})
-        if kwargs.get('table_type', 'rows') == "rows":
-            result_list = self.extract_rows(**kwargs)
-        else:
-            result_list = self.extract_columns(**kwargs)
-        return table_headers, result_list
+    def get_tree_tag(self, selector='', get_one=False, *args, **kwargs):
+        tags = self.tree.xpath(selector)
+        if get_one:
+            return tags[0]
+        return tags
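The net effect of this patch is that each selector class shrinks to a single tree-lookup primitive, with every extraction method — content, links, and now tabular data — expressed in the base class in terms of it. A sketch of the ``get_tree_tag`` contract as the XPath flavour implements it (assumes lxml; the CSS flavour is identical except that it first translates the expression via ``cssselect``):

```python
from lxml import html

tree = html.fromstring('<ul><li>alpha</li><li>beta</li></ul>')

def get_tree_tag(selector, get_one=False):
    tags = tree.xpath(selector)
    if get_one:
        return tags[0]  # single-tag callers, e.g. extract_content
    return tags         # list callers, e.g. extract_links / extract_rows

print(get_tree_tag('//li', get_one=True).text)     # -> 'alpha'
print([tag.text for tag in get_tree_tag('//li')])  # -> ['alpha', 'beta']
```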
From 46f88b9702a581da5167c43368045198e9f26b51 Mon Sep 17 00:00:00 2001
From: Alex Mathew
Date: Wed, 25 Apr 2018 13:18:33 +0530
Subject: [PATCH 6/6] Fix minor issues and update tests

---
 scrapple/commands/run.py             |  2 +-
 scrapple/selectors/css.py            |  4 +-
 scrapple/selectors/selector.py       |  2 +-
 tests/expected_result2.json          |  1 -
 tests/expected_result2_20180428.json | 95 ++++++++++++++++++++++++++++
 tests/expected_result3.json          |  2 +-
 tests/project2.json                  | 56 ++++++++--------
 tests/project3.json                  | 25 +++-----
 tests/test_generate.py               |  2 +-
 tests/test_run.py                    |  2 +-
 10 files changed, 136 insertions(+), 55 deletions(-)
 delete mode 100644 tests/expected_result2.json
 create mode 100644 tests/expected_result2_20180428.json

diff --git a/scrapple/commands/run.py b/scrapple/commands/run.py
index 50a54e1..bbbe13d 100644
--- a/scrapple/commands/run.py
+++ b/scrapple/commands/run.py
@@ -128,7 +128,7 @@ def run(self):
             import json
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.json'), \
                 'w') as f:
-                json.dump(results, f, indent=3)
+                json.dump(results, f, indent=4)
         elif self.args['--output_type'] == 'csv':
             import csv
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.csv'), \
diff --git a/scrapple/selectors/css.py b/scrapple/selectors/css.py
index 12155d1..fb26a1d 100644
--- a/scrapple/selectors/css.py
+++ b/scrapple/selectors/css.py
@@ -29,9 +29,9 @@ def __init__(self, url):
         super(CssSelector, self).__init__(url)
 
 
-    def get_selected_tag(self, selector='', get_one=False, *args, **kwargs):
+    def get_tree_tag(self, selector='', get_one=False, *args, **kwargs):
         sel = cssselect.CSSSelector(selector)
         tags = sel(self.tree)
         if get_one:
             return tags[0]
-        return tag
+        return tags
diff --git a/scrapple/selectors/selector.py b/scrapple/selectors/selector.py
index 85a388a..1f824d3 100644
--- a/scrapple/selectors/selector.py
+++ b/scrapple/selectors/selector.py
@@ -106,7 +106,7 @@ def extract_content(self, selector='', attr='', default='', connector='', *args,
             content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
             content = content.replace("\n", " ").strip()
         else:
-            tag = self.get_tree_tag(selector=selector)
+            tag = self.get_tree_tag(selector=selector, get_one=True)
             content = tag.get(attr)
             if attr in ["href", "src"]:
                 content = urljoin(self.url, content)
diff --git a/tests/expected_result2.json b/tests/expected_result2.json
deleted file mode 100644
index 55e1365..0000000
--- a/tests/expected_result2.json
+++ /dev/null
@@ -1 +0,0 @@
-{"project": "project2", "data": [{"event": "Event: Boston Python Meetup", "talk": "How to test the hard stuff"}, {"event": "Event: Boston Python Meetup", "talk": "Testing: Where do I start?"}]}
\ No newline at end of file
diff --git a/tests/expected_result2_20180428.json b/tests/expected_result2_20180428.json
new file mode 100644
index 0000000..bdeac7b
--- /dev/null
+++ b/tests/expected_result2_20180428.json
@@ -0,0 +1,95 @@
+{
+    "project": "project2",
+    "data": [
+        {
+            "team": "Atlanta Hawks"
+        },
+        {
+            "team": "Boston Celtics"
+        },
+        {
+            "team": "Brooklyn Nets"
+        },
+        {
+            "team": "Charlotte Hornets"
+        },
+        {
+            "team": "Chicago Bulls"
+        },
+        {
+            "team": "Cleveland Cavaliers"
+        },
+        {
+            "team": "Dallas Mavericks"
+        },
+        {
+            "team": "Denver Nuggets"
+        },
+        {
+            "team": "Detroit Pistons"
+        },
+        {
+            "team": "Golden State Warriors"
+        },
+        {
+            "team": "Houston Rockets"
+        },
+        {
+            "team": "Indiana Pacers"
+        },
+        {
+            "team": "Los Angeles Clippers"
+        },
+        {
+            "team": "Los Angeles Lakers"
+        },
+        {
+            "team": "Memphis Grizzlies"
+        },
+        {
+            "team": "Miami Heat"
+        },
+        {
+            "team": "Milwaukee Bucks"
+        },
+        {
+            "team": "Minnesota Timberwolves"
+        },
+        {
+            "team": "New Orleans Pelicans"
+        },
+        {
+            "team": "New York Knicks"
+        },
+        {
+            "team": "Oklahoma City Thunder"
+        },
+        {
+            "team": "Orlando Magic"
+        },
+        {
+            "team": "Philadelphia 76ers"
+        },
+        {
+            "team": "Phoenix Suns"
+        },
+        {
+            "team": "Portland Trail Blazers"
+        },
+        {
+            "team": "Sacramento Kings"
+        },
+        {
+            "team": "San Antonio Spurs"
+        },
+        {
+            "team": "Toronto Raptors"
+        },
+        {
+            "team": "Utah Jazz"
+        },
+        {
+            "team": "Washington Wizards"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/tests/expected_result3.json b/tests/expected_result3.json
index caada20..b1acc98 100644
--- a/tests/expected_result3.json
+++ b/tests/expected_result3.json
@@ -1 +1 @@
-{"project": "project3", "data": [{"unknown": "", "speaker": "Kenneth Reitz", "talk_url": "http://pyvideo.org/pycon-us-2013/python-for-humans-1.html", "title": "Python for Humans"}]}
+{"project": "project3", "data": [{"show_url": "https://trakt.tv/shows/mr-robot", "year": "2015", "unknown": "<unknown>"}]}
diff --git a/tests/project2.json b/tests/project2.json
index 1c55aeb..16edcbf 100644
--- a/tests/project2.json
+++ b/tests/project2.json
@@ -1,32 +1,26 @@
-{
-    "project_name": "new_project2",
-    "selector_type": "css",
-    "scraping": {
-        "url": "http://pyvideo.org/events/boston-python-meetup.html",
-        "data": [
-            {
-                "field": "event",
-                "selector": "h2",
-                "attr": "text",
-                "connector": "",
-                "default": ""
-            }
-        ],
-        "next": [
-            {
-                "follow_link": "div.content-list div.row h4.entry-title a",
-                "scraping": {
-                    "data": [
-                        {
-                            "field": "talk",
-                            "selector": "h2",
-                            "attr": "text",
-                            "connector": "",
-                            "default": ""
-                        }
-                    ]
-                }
-            }
-        ]
-    }
-}
\ No newline at end of file
+{
+    "project_name": "project2_nba",
+    "selector_type": "css",
+    "scraping": {
+        "url": "https://www.basketball-reference.com/teams/",
+        "data": [
+
+        ],
+        "next": [
+            {
+                "follow_link": "#teams_active th > a",
+                "scraping": {
+                    "data": [
+                        {
+                            "field": "team",
+                            "selector": "div#info h1",
+                            "attr": "text",
+                            "default": "",
+                            "connector": ""
+                        }
+                    ]
+                }
+            }
+        ]
+    }
+}
diff --git a/tests/project3.json b/tests/project3.json
index 96e7fbb..3912626 100644
--- a/tests/project3.json
+++ b/tests/project3.json
@@ -2,35 +2,28 @@
     "project_name": "project3",
     "selector_type": "css",
     "scraping": {
-        "url": "http://pyvideo.org/pycon-us-2013/python-for-humans-1.html",
+        "url": "https://trakt.tv/shows/mr-robot",
         "data": [
             {
-                "field": "unknown",
-                "selector": "h1",
-                "attr": "text",
-                "connector": "",
-                "default": ""
-            },
-            {
-                "field": "talk_url",
+                "field": "show_url",
                 "selector": "url",
                 "attr": "",
                 "connector": "",
                 "default": ""
             },
             {
-                "field": "title",
-                "selector": "h2",
+                "field": "unknown",
+                "selector": "h6",
                 "attr": "text",
                 "connector": "",
-                "default": ""
+                "default": "<unknown>"
             },
             {
-                "field": "speaker",
-                "selector": ".author a",
+                "field": "year",
+                "selector": "span.year",
                 "attr": "text",
                 "connector": "",
-                "default": "<speaker>"
+                "default": "<year>"
             }
         ]
     }
diff --git a/tests/test_generate.py b/tests/test_generate.py
index dbfb38f..ab18c2f 100644
--- a/tests/test_generate.py
+++ b/tests/test_generate.py
@@ -51,7 +51,7 @@ def test_css_scraper_generate():
     with open(os.path.join(os.getcwd(), 'project2.py'), 'r') as f:
         program = f.read()
     assert_in("from scrapple.selectors.css import CssSelector", program)
-    assert_in('page0 = CssSelector("http://pyvideo.org/events/boston-python-meetup.html")', program)
+    assert_in('page0 = CssSelector("https://www.basketball-reference.com/teams/")', program)
 
 
 def test_nonexistent_project():
diff --git a/tests/test_run.py b/tests/test_run.py
index 27e19f6..39cb5d7 100644
--- a/tests/test_run.py
+++ b/tests/test_run.py
@@ -62,7 +62,7 @@ def test_run_css_crawler():
     rc.execute_command()
     with open(os.path.join(os.getcwd(), 'result2.json'), 'r') as f:
         result = json.load(f)
-    with open(os.path.join(os.getcwd(), 'expected_result2.json'), 'r') as f:
+    with open(os.path.join(os.getcwd(), 'expected_result2_20180428.json'), 'r') as f:
         expected_result = json.load(f)
     assert_dict_equal(result, expected_result)
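With the series applied end to end, the refactored call path can be exercised directly. A hedged sketch (requires network access; the URL is the one the updated test fixtures point at, and the XPath expression is illustrative rather than taken from the tests):

```python
from scrapple.selectors.xpath import XpathSelector

# __init__ fetches the page and builds the lxml element tree.
page = XpathSelector('https://www.basketball-reference.com/teams/')

# Routed through Selector.extract_content -> XpathSelector.get_tree_tag.
heading = page.extract_content(selector='//h1', attr='text', connector=' ')
print(heading)

# 'url' is special-cased to return the page URL itself.
print(page.extract_content(selector='url'))
```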