Merge pull request #106 from AlexMathew/cleanup_selector_classes

Refactor selector classes
AlexMathew · Apr 25, 2018 · 336d19c · 336d19c
2 parents 6624a92 + 46f88b9
commit 336d19c
Show file tree

Hide file tree

Showing 13 changed files with 359 additions and 476 deletions.
diff --git a/docs/implementation/selectors.rst b/docs/implementation/selectors.rst
@@ -15,6 +15,11 @@ These selector types are implemented through the ``XpathSelector`` and ``CssSele
 In the super class, the URL of the web page to be loaded is validated - ensuring the schema has been specified, and that the URL is valid. A HTTP GET request is made to load the web page, and the HTML content of this fetched web page is used to generate the :ref:`element tree <concepts-structure>`. This is the element tree that will be parsed to extract the necessary content.
 
 
+.. automodule:: scrapple.selectors.selector
+
+.. autoclass:: scrapple.selectors.selector.Selector
+   :members:
+
 .. automodule:: scrapple.selectors.xpath
 
 .. autoclass:: scrapple.selectors.xpath.XpathSelector

diff --git a/scrapple/commands/run.py b/scrapple/commands/run.py
@@ -11,7 +11,8 @@
 from colorama import Back, Fore, init
 
 from scrapple.commands import command
-from scrapple.selectors import css, xpath
+from scrapple.selectors.css import CssSelector
+from scrapple.selectors.xpath import XpathSelector
 from scrapple.utils.config import (InvalidConfigException, extract_fieldnames,
                                    traverse_next, validate_config)
 
@@ -74,10 +75,11 @@ def execute_command(self):
 
 
     def run(self):
-        selectorClass = getattr(
-                eval(self.config['selector_type']), 
-                self.config['selector_type'].title() + 'Selector'
-                )
+        selectorClassMapping = {
+            'xpath': XpathSelector,
+            'css': CssSelector
+        }
+        selectorClass = selectorClassMapping.get(self.config['selector_type'].lower())
         results = dict()
         results['project'] = self.args['<projectname>']
         results['data'] = list()
@@ -126,7 +128,7 @@ def run(self):
                 import json
                 with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.json'), \
                     'w') as f:
-                    json.dump(results, f, indent=3)
+                    json.dump(results, f, indent=4)
             elif self.args['--output_type'] == 'csv':
                 import csv
                 with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.csv'), \

diff --git a/scrapple/selectors/css.py b/scrapple/selectors/css.py
@@ -11,18 +11,16 @@
 from scrapple.selectors.selector import Selector
 from scrapple.utils.text import make_ascii
 
-try:
-	from urlparse import urljoin
-except ImportError:
-	from urllib.parse import urljoin
-
 
 class CssSelector(Selector):
 	"""
 	The ``CssSelector`` object defines CSS selector expressions.
 
 	"""
 
+	__selector_type__ = 'CSS'
+
+
 	def __init__(self, url):
 		"""
 		The ``Selector`` class acts as the super class for this class.
@@ -31,207 +29,9 @@ def __init__(self, url):
 		super(CssSelector, self).__init__(url)
 
 
-	def extract_content(self, *args, **kwargs):
-		"""
-		Method for performing the content extraction for the given CSS selector.
-
-		The cssselect library is used to handle CSS selector expressions. \
-		XPath expressions have a higher speed of execution, so the given CSS selector \
-		expression is translated into the corresponding XPath expression, by the \
-		``cssselect.CSSSelector`` class. This selector can be used to extract content \
-		from the element tree corresponding to the fetched web page.
-
-		If the selector is "url", the URL of the current web page is returned.
-		Otherwise, the selector expression is used to extract content. The particular \
-		attribute to be extracted ("text", "href", etc.) is specified in the method \
-		arguments, and this is used to extract the required content. If the content \
-		extracted is a link (from an attr value of "href" or "src"), the URL is parsed \
-		to convert the relative path into an absolute path.
-
-		If the selector does not fetch any content, the default value is returned. \
-		If no default value is specified, an exception is raised.
-
-		:param selector: The CSS selector expression
-		:param attr: The attribute to be extracted from the selected tag
-		:param default: The default value to be used if the selector does not return any data
-		:return: The extracted content
-
-		"""
-		try:
-			selector, attr, default, connector = [kwargs.get(x, '') for x in ['selector', 'attr', 'default', 'connector']]
-			if selector == "url":
-				return self.url
-			sel = cssselect.CSSSelector(selector)
-			if attr == "text":
-				tag = sel(self.tree)[0]
-				content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
-				content = content.replace("\n", " ").strip()				
-			else:
-				content = sel(self.tree)[0].get(attr)
-				if attr in ["href", "src"]:
-					content = urljoin(self.url, content)
-			return content
-		except IndexError:
-			if default is not "":
-				return default
-			raise Exception("There is no content for the selector " + selector)
-
-
-	def extract_links(self, *args, **kwargs):
-		"""
-		Method for performing the link extraction for the crawler implementation.
-
-		As in the extract_content method, the cssselect library is used to translate \
-		the CSS selector expression into an XPath expression. 
-
-		The selector passed as the argument is a selector to point to the anchor tags \
-		that the crawler should pass through. A list of links is obtained, and the links \
-		are iterated through. The relative paths are converted into absolute paths and \
-		a ``CssSelector`` object is created with the URL of the next page as the argument \
-		and this created object is yielded. 
-
-		The extract_links method basically generates ``CssSelector`` objects for all of \
-		the links to be crawled through.
-
-		:param selector: The selector for the anchor tags to be crawled through
-		:return: A ``CssSelector`` object for every page to be crawled through 
-		
-		"""
-		try:
-			selector = kwargs.get('selector', '')
-			sel = cssselect.CSSSelector(selector)
-			links = sel(self.tree)
-			for link in links:
-				next_url = urljoin(self.url, link.get('href'))
-				yield CssSelector(next_url)
-		except Exception:
-			raise Exception("Invalid CSS selector " + selector)
-
-
-	def extract_rows(self, *args, **kwargs):
-		"""
-		Row data extraction for extract_tabular
-		"""
-		result_list = []
-		result = kwargs.get('result', {})
-
-		try:
-			sel = cssselect.CSSSelector(kwargs.get('selector', ''))
-			values = sel(self.tree)
-			if len(kwargs.get('table_headers', [])) >= len(values):
-				from itertools import izip_longest
-				pairs = izip_longest(kwargs.get('table_headers', []), values, fillvalue=kwargs.get('default', ''))
-			else:
-				from itertools import izip
-				pairs = izip(kwargs.get('table_headers', []), values)
-			for head, val in pairs:
-				if kwargs.get('verbosity', 0) > 1:
-					print("\nExtracting", head, "attribute", sep=' ', end='')
-				if kwargs.get('attr', 'text') == "text":
-					try:
-						content = kwargs.get('connector', '').join([make_ascii(x).strip() for x in val.itertext()])
-					except Exception:
-						content = kwargs.get('default', '')
-					content = content.replace("\n", " ").strip()
-				else:
-					content = val.get(kwargs.get('attr', 'text'))
-					if kwargs.get('attr', 'text') in ["href", "src"]:
-						content = urljoin(self.url, content)
-				result[head] = content
-			result_list.append(result)
-		except TypeError:
-			raise Exception("Selector expression string to be provided. Got " + kwargs.get('selector', ''))
-
-		return result_list
-
-
-	def extract_columns(self, *args, **kwargs):
-		"""
-		Column data extraction for extract_tabular
-		"""
-		result_list = []
-		result = kwargs.get('result', {})
-
-		try:
-			if type(kwargs.get('selector', '')) in [str, unicode]:
-				selectors = [kwargs.get('selector', '')]
-			elif type(kwargs.get('selector', '')) == list:
-				selectors = kwargs.get('selector', '')
-			else:
-				raise Exception("Use a list of selector expressions for the various columns")
-			from itertools import izip, count
-			pairs = izip(kwargs.get('table_headers', []), selectors)
-			columns = {}
-			for head, selector in pairs:
-				sel = cssselect.CSSSelector(selector)
-				columns[head] = sel(self.tree)
-			try:
-				for i in count(start=0):
-					r = result.copy()
-					for head in columns.keys():
-						if kwargs.get('verbosity', 0) > 1:
-							print("\nExtracting", head, "attribute", sep=' ', end='')
-						col = columns[head][i]
-						if kwargs.get('attr', 'text') == "text":
-							try:
-								content = kwargs.get('connector', '').join([make_ascii(x).strip() for x in col.itertext()])
-							except Exception:
-								content = kwargs.get('default', '')
-							content = content.replace("\n", " ").strip()
-						else:
-							content = col.get(kwargs.get('attr', 'text'))
-							if kwargs.get('attr', 'text') in ["href", "src"]:
-								content = urljoin(self.url, content)
-						r[head] = content
-					result_list.append(r)
-			except IndexError:
-				pass
-		except TypeError:
-			raise Exception("Selector expression string to be provided. Got " + selector)
-
-		return result_list
-
-
-	def extract_tabular(self, *args, **kwargs):
-		"""
-		Method for performing the extraction of tabular data.
-
-		As in the extract_content method, the cssselect library is used to translate \
-		the CSS selector expression into an XPath expression. 
-
-		:param result: A dictionary containing the extracted data so far
-		:param table_type: Can be "rows" or "columns". This determines the type of table to be extracted. \
-		A row extraction is when there is a single row to be extracted and mapped to a set of headers. \
-		A column extraction is when a set of rows have to be extracted, giving a list of header-value mappings.
-		:param header: The headers to be used for the table. This can be a list of headers, or a selector that gives the list of headers
-		:param prefix: A prefix to be added to each header
-		:param suffix: A suffix to be added to each header
-		:param selector: For row extraction, this is a selector that gives the row to be extracted. \
-		For column extraction, this is a list of selectors for each column.
-		:param attr: The attribute to be extracted from the selected tag
-		:param default: The default value to be used if the selector does not return any data
-		:param verbosity: The verbosity set as the argument for scrapple run
-		:return: A 2-tuple containing the list of all the column headers extracted and the list of \
-		dictionaries which contain (header, content) pairs
-		"""
-		result = kwargs.get('result', {})
-		result_list = []
-		if type(kwargs.get('header', [])) in [str, unicode]:
-			try:
-				sel = cssselect.CSSSelector(kwargs.get('header', []))
-				header_list = sel(self.tree)
-				table_headers = [kwargs.get('prefix', '') + h.text + kwargs.get('suffix', '') for h in header_list]
-				if len(table_headers) == 0:
-					raise Exception("Invalid CSS selector " + kwargs.get('header', []))
-			except TypeError:
-				raise Exception("Selector expression string to be provided. Got " + kwargs.get('header', []))
-		else:
-			table_headers = [kwargs.get('prefix', '') + h + kwargs.get('suffix', '') for h in kwargs.get('header', [])]
-		if kwargs.get('table_type', 'rows') not in ["rows", "columns"]:
-			raise Exception("Specify 'rows' or 'columns' in table_type")
-		kwargs.update({'table_headers': table_headers})
-		if kwargs.get('table_type', 'rows') == "rows":
-			result_list = self.extract_rows(**kwargs)
-		else:
-			result_list = self.extract_columns(**kwargs)
-		return table_headers, result_list
+	def get_tree_tag(self, selector='', get_one=False, *args, **kwargs):
+		sel = cssselect.CSSSelector(selector)
+		tags = sel(self.tree)
+		if get_one:
+			return tags[0]
+		return tags