From 4aeaf9d5884812b31c272a9b3b52b0cf1d2faa38 Mon Sep 17 00:00:00 2001
From: Alex Mathew
Date: Fri, 20 Apr 2018 10:55:19 +0530
Subject: [PATCH 1/6] Remove uses of eval (how did I not know that was bad?)

---
 scrapple/commands/run.py          | 12 +++++++-----
 scrapple/utils/dynamicdispatch.py |  8 +++++++-
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/scrapple/commands/run.py b/scrapple/commands/run.py
index b938561..50a54e1 100644
--- a/scrapple/commands/run.py
+++ b/scrapple/commands/run.py
@@ -11,7 +11,8 @@
 from colorama import Back, Fore, init
 
 from scrapple.commands import command
-from scrapple.selectors import css, xpath
+from scrapple.selectors.css import CssSelector
+from scrapple.selectors.xpath import XpathSelector
 from scrapple.utils.config import (InvalidConfigException, extract_fieldnames,
                                    traverse_next, validate_config)
@@ -74,10 +75,11 @@ def execute_command(self):
 
     def run(self):
-        selectorClass = getattr(
-            eval(self.config['selector_type']),
-            self.config['selector_type'].title() + 'Selector'
-        )
+        selectorClassMapping = {
+            'xpath': XpathSelector,
+            'css': CssSelector
+        }
+        selectorClass = selectorClassMapping.get(self.config['selector_type'].lower())
         results = dict()
         results['project'] = self.args['<projectname>']
         results['data'] = list()
diff --git a/scrapple/utils/dynamicdispatch.py b/scrapple/utils/dynamicdispatch.py
index d68bc22..fdb67a6 100644
--- a/scrapple/utils/dynamicdispatch.py
+++ b/scrapple/utils/dynamicdispatch.py
@@ -13,5 +13,11 @@ def get_command_class(command):
     :return: The command class corresponding to the selected command
     """
     from scrapple.commands import genconfig, generate, run, web
-    cmdClass = getattr(eval(command), command.title() + 'Command')
+    commandMapping = {
+        'genconfig': genconfig,
+        'generate': generate,
+        'run': run,
+        'web': web
+    }
+    cmdClass = getattr(commandMapping.get(command), command.title() + 'Command')
     return cmdClass
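A note on the pattern both hunks adopt: `eval(self.config['selector_type'])` executes whatever string happens to be in the project config, so a malformed or hostile config file could run arbitrary code, while a lookup table confines the input to a fixed set of known classes. One caveat is that `dict.get()` returns `None` for an unrecognised key, which only fails later with an opaque `AttributeError`. A minimal sketch of a stricter variant — the class names below are illustrative stand-ins, not Scrapple's real imports:

```python
class XpathSelector(object):
    """Stand-in for scrapple.selectors.xpath.XpathSelector."""

class CssSelector(object):
    """Stand-in for scrapple.selectors.css.CssSelector."""

SELECTOR_CLASSES = {
    'xpath': XpathSelector,
    'css': CssSelector,
}

def get_selector_class(selector_type):
    # Plain indexing makes an unsupported type fail loudly at the lookup
    # site, instead of returning None and crashing somewhere downstream.
    try:
        return SELECTOR_CLASSES[selector_type.lower()]
    except KeyError:
        raise ValueError('Unknown selector type: %r' % selector_type)

print(get_selector_class('XPath'))  # -> <class '__main__.XpathSelector'>
```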
From bfee6f7b58fdf56012e27dde22bd83273ab647cd Mon Sep 17 00:00:00 2001
From: Alex Mathew
Date: Fri, 20 Apr 2018 11:12:13 +0530
Subject: [PATCH 2/6] Refactor extract_content and extract_links in the selector classes

---
 scrapple/selectors/css.py      | 89 +++++-----------------------
 scrapple/selectors/selector.py | 75 ++++++++++++++++++++++++--
 scrapple/selectors/xpath.py    | 81 ++++-----------------------
 3 files changed, 91 insertions(+), 154 deletions(-)

diff --git a/scrapple/selectors/css.py b/scrapple/selectors/css.py
index 350cdce..e2b2740 100644
--- a/scrapple/selectors/css.py
+++ b/scrapple/selectors/css.py
@@ -11,11 +11,6 @@
 from scrapple.selectors.selector import Selector
 from scrapple.utils.text import make_ascii
 
-try:
-    from urlparse import urljoin
-except ImportError:
-    from urllib.parse import urljoin
-
 
 class CssSelector(Selector):
     """
@@ -23,6 +18,9 @@ class CssSelector(Selector):
 
     """
 
+    __selector_type__ = 'CSS'
+
+
     def __init__(self, url):
         """
         The ``Selector`` class acts as the super class for this class.
@@ -31,81 +29,16 @@ def __init__(self, url):
         super(CssSelector, self).__init__(url)
 
 
-    def extract_content(self, *args, **kwargs):
-        """
-        Method for performing the content extraction for the given CSS selector.
-
-        The cssselect library is used to handle CSS selector expressions. \
-        XPath expressions have a higher speed of execution, so the given CSS selector \
-        expression is translated into the corresponding XPath expression, by the \
-        ``cssselect.CSSSelector`` class. This selector can be used to extract content \
-        from the element tree corresponding to the fetched web page.
-
-        If the selector is "url", the URL of the current web page is returned.
-        Otherwise, the selector expression is used to extract content. The particular \
-        attribute to be extracted ("text", "href", etc.) is specified in the method \
-        arguments, and this is used to extract the required content. If the content \
-        extracted is a link (from an attr value of "href" or "src"), the URL is parsed \
-        to convert the relative path into an absolute path.
+    def get_selected_tag(self, selector='', *args, **kwargs):
+        sel = cssselect.CSSSelector(selector)
+        tag = sel(self.tree)[0]
+        return tag
 
-        If the selector does not fetch any content, the default value is returned. \
-        If no default value is specified, an exception is raised.
 
-        :param selector: The CSS selector expression
-        :param attr: The attribute to be extracted from the selected tag
-        :param default: The default value to be used if the selector does not return any data
-        :return: The extracted content
-
-        """
-        try:
-            selector, attr, default, connector = [kwargs.get(x, '') for x in ['selector', 'attr', 'default', 'connector']]
-            if selector == "url":
-                return self.url
-            sel = cssselect.CSSSelector(selector)
-            if attr == "text":
-                tag = sel(self.tree)[0]
-                content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
-                content = content.replace("\n", " ").strip()
-            else:
-                content = sel(self.tree)[0].get(attr)
-                if attr in ["href", "src"]:
-                    content = urljoin(self.url, content)
-            return content
-        except IndexError:
-            if default is not "":
-                return default
-            raise Exception("There is no content for the selector " + selector)
-
-
-    def extract_links(self, *args, **kwargs):
-        """
-        Method for performing the link extraction for the crawler implementation.
-
-        As in the extract_content method, the cssselect library is used to translate \
-        the CSS selector expression into an XPath expression.
-
-        The selector passed as the argument is a selector to point to the anchor tags \
-        that the crawler should pass through. A list of links is obtained, and the links \
-        are iterated through. The relative paths are converted into absolute paths and \
-        a ``CssSelector`` object is created with the URL of the next page as the argument \
-        and this created object is yielded.
-
-        The extract_links method basically generates ``CssSelector`` objects for all of \
-        the links to be crawled through.
-
-        :param selector: The selector for the anchor tags to be crawled through
-        :return: A ``CssSelector`` object for every page to be crawled through
-
-        """
-        try:
-            selector = kwargs.get('selector', '')
-            sel = cssselect.CSSSelector(selector)
-            links = sel(self.tree)
-            for link in links:
-                next_url = urljoin(self.url, link.get('href'))
-                yield CssSelector(next_url)
-        except Exception:
-            raise Exception("Invalid CSS selector " + selector)
+    def get_links_for_crawling(self, selector='', *args, **kwargs):
+        sel = cssselect.CSSSelector(selector)
+        links = sel(self.tree)
+        return links
 
 
     def extract_rows(self, *args, **kwargs):
diff --git a/scrapple/selectors/selector.py b/scrapple/selectors/selector.py
index bf4c569..27f25ca 100644
--- a/scrapple/selectors/selector.py
+++ b/scrapple/selectors/selector.py
@@ -11,6 +11,12 @@
 import requests
 from lxml import etree
 
+try:
+    from urlparse import urljoin
+except ImportError:
+    from urllib.parse import urljoin
+
+
 requests.warnings.filterwarnings('ignore')
 
 
@@ -19,7 +25,9 @@ class Selector(object):
 
     This class defines the basic ``Selector`` object.
 
     """
 
+    __selector_type__ = ''
+
     def __init__(self, url):
         """
         The URL of the web page to be loaded is validated - ensuring the schema has \
@@ -63,21 +71,80 @@ def __init__(self, url):
         raise Exception('Ensure that you are connected to the Internet and that the page exists')
 
 
-    def extract_content(self, *args, **kwargs):
+    def extract_content(self, selector='', attr='', default='', connector='', *args, **kwargs):
         """
         Method for performing the content extraction for the particular selector type. \
-        A detailed description is provided in the derived classes.
+
+        If the selector is "url", the URL of the current web page is returned.
+        Otherwise, the selector expression is used to extract content. The particular \
+        attribute to be extracted ("text", "href", etc.) is specified in the method \
+        arguments, and this is used to extract the required content. If the content \
+        extracted is a link (from an attr value of "href" or "src"), the URL is parsed \
+        to convert the relative path into an absolute path.
+
+        If the selector does not fetch any content, the default value is returned. \
+        If no default value is specified, an exception is raised.
+
+        :param selector: The XPath expression
+        :param attr: The attribute to be extracted from the selected tag
+        :param default: The default value to be used if the selector does not return any data
+        :param connector: String connector for list of data returned for a particular selector
+        :return: The extracted content
 
         """
+        try:
+            if selector == "url":
+                return self.url
+            if attr == "text":
+                tag = self.get_selected_tag(selector=selector)
+                content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
+                content = content.replace("\n", " ").strip()
+            else:
+                tag = self.get_selected_tag(selector=selector)
+                content = tag.get(attr)
+                if attr in ["href", "src"]:
+                    content = urljoin(self.url, content)
+            return content
+        except IndexError:
+            if default is not "":
+                return default
+            raise Exception("There is no content for the %s selector - %s" % (self.__selector_type__, selector))
+        except XPathError:
+            raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
+
+
+    def get_selected_tag(self, selector='', *args, **kwargs):
         raise NotImplementedError
 
 
-    def extract_links(self, *args, **kwargs):
+    def extract_links(self, selector='', *args, **kwargs):
         """
         Method for performing the link extraction for the crawler. \
-        A detailed description is provided in the derived classes.
+
+        The selector passed as the argument is a selector to point to the anchor tags \
+        that the crawler should pass through. A list of links is obtained, and the links \
+        are iterated through. The relative paths are converted into absolute paths and \
+        a ``XpathSelector``/``CssSelector`` object (as is the case) is created with the URL of the next page as the argument \
+        and this created object is yielded.
+
+        The extract_links method basically generates ``XpathSelector``/``CssSelector`` objects for all of \
+        the links to be crawled through.
+
+        :param selector: The selector for the anchor tags to be crawled through
+        :return: A ``XpathSelector``/``CssSelector`` object for every page to be crawled through
 
         """
+        try:
+            links = self.get_links_for_crawling(selector=selector)
+            for link in links:
+                next_url = urljoin(self.url, link.get('href'))
+                yield type(self)(next_url)
+        except XPathError:
+            raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
+        except Exception:
+            raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
+
+
+    def get_links_for_crawling(self, selector='', *args, **kwargs):
         raise NotImplementedError
diff --git a/scrapple/selectors/xpath.py b/scrapple/selectors/xpath.py
index e92b31c..cae3dea 100644
--- a/scrapple/selectors/xpath.py
+++ b/scrapple/selectors/xpath.py
@@ -11,11 +11,6 @@
 from scrapple.selectors.selector import Selector
 from scrapple.utils.text import make_ascii
 
-try:
-    from urlparse import urljoin
-except ImportError:
-    from urllib.parse import urljoin
-
 
 class XpathSelector(Selector):
     """
@@ -23,6 +18,9 @@ class XpathSelector(Selector):
 
     """
 
+    __selector_type__ = 'XPath'
+
+
    def __init__(self, url):
         """
         The ``Selector`` class acts as the super class for this class.
@@ -31,75 +29,14 @@ def __init__(self, url):
         super(XpathSelector, self).__init__(url)
 
 
-    def extract_content(self, *args, **kwargs):
-        """
-        Method for performing the content extraction for the given XPath expression.
-
-        The XPath selector expression can be used to extract content \
-        from the element tree corresponding to the fetched web page.
-
-        If the selector is "url", the URL of the current web page is returned.
-        Otherwise, the selector expression is used to extract content. The particular \
-        attribute to be extracted ("text", "href", etc.) is specified in the method \
-        arguments, and this is used to extract the required content. If the content \
-        extracted is a link (from an attr value of "href" or "src"), the URL is parsed \
-        to convert the relative path into an absolute path.
-
-        If the selector does not fetch any content, the default value is returned. \
-        If no default value is specified, an exception is raised.
-
-        :param selector: The XPath expression
-        :param attr: The attribute to be extracted from the selected tag
-        :param default: The default value to be used if the selector does not return any data
-        :return: The extracted content
-
-        """
-        try:
-            selector, attr, default, connector = [kwargs.get(x, '') for x in ['selector', 'attr', 'default', 'connector']]
-            if selector == "url":
-                return self.url
-            if attr == "text":
-                tag = self.tree.xpath(selector)[0]
-                content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
-                content = content.replace("\n", " ").strip()
-            else:
-                content = self.tree.xpath(selector)[0].get(attr)
-                if attr in ["href", "src"]:
-                    content = urljoin(self.url, content)
-            return content
-        except IndexError:
-            if default is not "":
-                return default
-            raise Exception("There is no content for the selector " + selector)
-        except XPathError:
-            raise Exception("Invalid XPath selector " + selector)
-
-
-    def extract_links(self, *args, **kwargs):
-        """
-        Method for performing the link extraction for the crawler.
-
-        The selector passed as the argument is a selector to point to the anchor tags \
-        that the crawler should pass through. A list of links is obtained, and the links \
-        are iterated through. The relative paths are converted into absolute paths and \
-        a ``XpathSelector`` object is created with the URL of the next page as the argument \
-        and this created object is yielded.
+    def get_selected_tag(self, selector='', *args, **kwargs):
+        tag = self.tree.xpath(selector)[0]
+        return tag
 
-        The extract_links method basically generates ``XpathSelector`` objects for all of \
-        the links to be crawled through.
 
-        :param selector: The selector for the anchor tags to be crawled through
-        :return: A ``XpathSelector`` object for every page to be crawled through
-
-        """
-        try:
-            selector = kwargs.get('selector', '')
-            links = self.tree.xpath(selector)
-            for link in links:
-                next_url = urljoin(self.url, link.get('href'))
-                yield XpathSelector(next_url)
-        except XPathError:
-            raise Exception("Invalid XPath selector " + selector)
+    def get_links_for_crawling(self, selector='', *args, **kwargs):
+        links = self.tree.xpath(selector)
+        return links
 
 
     def extract_rows(self, *args, **kwargs):
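The refactor above is a textbook template-method split: the base ``Selector`` now owns the shared extraction flow (the "url" special case, text joining, relative-to-absolute link resolution, defaults), and each subclass only supplies the tree lookup. A self-contained sketch of the shape, with a toy dictionary-backed subclass standing in for the lxml-backed ``XpathSelector``/``CssSelector``:

```python
class Selector(object):
    """Base class: owns the extraction flow, delegates the tree lookup."""

    def extract_content(self, selector, default=''):
        try:
            return self.get_selected_tag(selector).strip()
        except IndexError:
            if default != '':
                return default
            raise Exception('There is no content for the selector ' + selector)

    def get_selected_tag(self, selector):
        raise NotImplementedError  # subclass hook


class DictSelector(Selector):
    """Toy subclass; the real ones query an lxml element tree."""

    def __init__(self, data):
        self.data = data

    def get_selected_tag(self, selector):
        # Mirrors the [0] indexing in the real hooks: an empty result
        # raises IndexError, which the base class turns into a default.
        return self.data.get(selector, [])[0]


s = DictSelector({'h1': ['  Heading  ']})
print(s.extract_content('h1'))                 # -> 'Heading'
print(s.extract_content('h2', default='n/a'))  # -> 'n/a'
```

One consequence worth noting: `yield type(self)(next_url)` in the base class replaces the hard-coded `yield CssSelector(next_url)` / `yield XpathSelector(next_url)`, so the crawler automatically constructs objects of whichever subclass it started from.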
From 802e01cc032b38cff0faa08d0ede7ebf2be3f7c3 Mon Sep 17 00:00:00 2001
From: Alex Mathew
Date: Fri, 20 Apr 2018 11:13:23 +0530
Subject: [PATCH 3/6] Add base selector class in docs

---
 docs/implementation/selectors.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/implementation/selectors.rst b/docs/implementation/selectors.rst
index ac90e38..45be37f 100644
--- a/docs/implementation/selectors.rst
+++ b/docs/implementation/selectors.rst
@@ -15,6 +15,11 @@
 These selector types are implemented through the ``XpathSelector`` and ``CssSelector`` classes.
 
 In the super class, the URL of the web page to be loaded is validated - ensuring the schema has been specified, and that the URL is valid. A HTTP GET request is made to load the web page, and the HTML content of this fetched web page is used to generate the :ref:`element tree `. This is the element tree that will be parsed to extract the necessary content.
 
+.. automodule:: scrapple.selectors.selector
+
+.. autoclass:: scrapple.selectors.selector.Selector
+   :members:
+
 .. automodule:: scrapple.selectors.xpath
 
 .. autoclass:: scrapple.selectors.xpath.XpathSelector

From f651bcf1fec154ee7cd591b632c2fc37f80621b8 Mon Sep 17 00:00:00 2001
From: Alex Mathew
Date: Fri, 20 Apr 2018 11:20:44 +0530
Subject: [PATCH 4/6] Fix xpatherror import

---
 scrapple/selectors/selector.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scrapple/selectors/selector.py b/scrapple/selectors/selector.py
index 27f25ca..bd01b9b 100644
--- a/scrapple/selectors/selector.py
+++ b/scrapple/selectors/selector.py
@@ -10,6 +10,7 @@
 
 import requests
 from lxml import etree
+from lxml.etree import XPathError
 
 try:
     from urlparse import urljoin
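For context on this one-line fix: lxml signals a malformed expression by raising a subclass of ``lxml.etree.XPathError``, and the ``except XPathError`` clauses added to the base class in PATCH 2 referenced the name without importing it, so any bad selector would have surfaced as a ``NameError`` instead. A quick demonstration of the exception class (assumes lxml is installed):

```python
from lxml import html
from lxml.etree import XPathError

tree = html.fromstring('<div><p>hello</p></div>')

try:
    tree.xpath('//p[')  # malformed XPath expression
except XPathError as err:
    # lxml raises XPathEvalError, a subclass of XPathError.
    print('caught %s: %s' % (type(err).__name__, err))

print(tree.xpath('//p/text()'))  # -> ['hello']
```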
From 7a53641ecd0cd39c8fecbfd1de644a58598efc68 Mon Sep 17 00:00:00 2001
From: Alex Mathew
Date: Fri, 20 Apr 2018 14:43:12 +0530
Subject: [PATCH 5/6] Refactor extract_tabular and related methods in the selector classes

---
 scrapple/selectors/css.py      | 141 +-------------------------------
 scrapple/selectors/selector.py | 143 +++++++++++++++++++++++++++++----
 scrapple/selectors/xpath.py    | 139 ++------------------------------
 3 files changed, 137 insertions(+), 286 deletions(-)

diff --git a/scrapple/selectors/css.py b/scrapple/selectors/css.py
index e2b2740..12155d1 100644
--- a/scrapple/selectors/css.py
+++ b/scrapple/selectors/css.py
@@ -29,142 +29,9 @@ def __init__(self, url):
         super(CssSelector, self).__init__(url)
 
 
-    def get_selected_tag(self, selector='', *args, **kwargs):
+    def get_selected_tag(self, selector='', get_one=False, *args, **kwargs):
         sel = cssselect.CSSSelector(selector)
-        tag = sel(self.tree)[0]
+        tags = sel(self.tree)
+        if get_one:
+            return tags[0]
         return tag
-
-
-    def get_links_for_crawling(self, selector='', *args, **kwargs):
-        sel = cssselect.CSSSelector(selector)
-        links = sel(self.tree)
-        return links
-
-
-    def extract_rows(self, *args, **kwargs):
-        """
-        Row data extraction for extract_tabular
-        """
-        result_list = []
-        result = kwargs.get('result', {})
-
-        try:
-            sel = cssselect.CSSSelector(kwargs.get('selector', ''))
-            values = sel(self.tree)
-            if len(kwargs.get('table_headers', [])) >= len(values):
-                from itertools import izip_longest
-                pairs = izip_longest(kwargs.get('table_headers', []), values, fillvalue=kwargs.get('default', ''))
-            else:
-                from itertools import izip
-                pairs = izip(kwargs.get('table_headers', []), values)
-            for head, val in pairs:
-                if kwargs.get('verbosity', 0) > 1:
-                    print("\nExtracting", head, "attribute", sep=' ', end='')
-                if kwargs.get('attr', 'text') == "text":
-                    try:
-                        content = kwargs.get('connector', '').join([make_ascii(x).strip() for x in val.itertext()])
-                    except Exception:
-                        content = kwargs.get('default', '')
-                    content = content.replace("\n", " ").strip()
-                else:
-                    content = val.get(kwargs.get('attr', 'text'))
-                    if kwargs.get('attr', 'text') in ["href", "src"]:
-                        content = urljoin(self.url, content)
-                result[head] = content
-            result_list.append(result)
-        except TypeError:
-            raise Exception("Selector expression string to be provided. Got " + kwargs.get('selector', ''))
-
-        return result_list
-
-
-    def extract_columns(self, *args, **kwargs):
-        """
-        Column data extraction for extract_tabular
-        """
-        result_list = []
-        result = kwargs.get('result', {})
-
-        try:
-            if type(kwargs.get('selector', '')) in [str, unicode]:
-                selectors = [kwargs.get('selector', '')]
-            elif type(kwargs.get('selector', '')) == list:
-                selectors = kwargs.get('selector', '')
-            else:
-                raise Exception("Use a list of selector expressions for the various columns")
-            from itertools import izip, count
-            pairs = izip(kwargs.get('table_headers', []), selectors)
-            columns = {}
-            for head, selector in pairs:
-                sel = cssselect.CSSSelector(selector)
-                columns[head] = sel(self.tree)
-            try:
-                for i in count(start=0):
-                    r = result.copy()
-                    for head in columns.keys():
-                        if kwargs.get('verbosity', 0) > 1:
-                            print("\nExtracting", head, "attribute", sep=' ', end='')
-                        col = columns[head][i]
-                        if kwargs.get('attr', 'text') == "text":
-                            try:
-                                content = kwargs.get('connector', '').join([make_ascii(x).strip() for x in col.itertext()])
-                            except Exception:
-                                content = kwargs.get('default', '')
-                            content = content.replace("\n", " ").strip()
-                        else:
-                            content = col.get(kwargs.get('attr', 'text'))
-                            if kwargs.get('attr', 'text') in ["href", "src"]:
-                                content = urljoin(self.url, content)
-                        r[head] = content
-                    result_list.append(r)
-            except IndexError:
-                pass
-        except TypeError:
-            raise Exception("Selector expression string to be provided. Got " + selector)
-
-        return result_list
-
-
-    def extract_tabular(self, *args, **kwargs):
-        """
-        Method for performing the extraction of tabular data.
-
-        As in the extract_content method, the cssselect library is used to translate \
-        the CSS selector expression into an XPath expression.
-
-        :param result: A dictionary containing the extracted data so far
-        :param table_type: Can be "rows" or "columns". This determines the type of table to be extracted. \
-        A row extraction is when there is a single row to be extracted and mapped to a set of headers. \
-        A column extraction is when a set of rows have to be extracted, giving a list of header-value mappings.
-        :param header: The headers to be used for the table. This can be a list of headers, or a selector that gives the list of headers
-        :param prefix: A prefix to be added to each header
-        :param suffix: A suffix to be added to each header
-        :param selector: For row extraction, this is a selector that gives the row to be extracted. \
-        For column extraction, this is a list of selectors for each column.
-        :param attr: The attribute to be extracted from the selected tag
-        :param default: The default value to be used if the selector does not return any data
-        :param verbosity: The verbosity set as the argument for scrapple run
-        :return: A 2-tuple containing the list of all the column headers extracted and the list of \
-        dictionaries which contain (header, content) pairs
-        """
-        result = kwargs.get('result', {})
-        result_list = []
-        if type(kwargs.get('header', [])) in [str, unicode]:
-            try:
-                sel = cssselect.CSSSelector(kwargs.get('header', []))
-                header_list = sel(self.tree)
-                table_headers = [kwargs.get('prefix', '') + h.text + kwargs.get('suffix', '') for h in header_list]
-                if len(table_headers) == 0:
-                    raise Exception("Invalid CSS selector " + kwargs.get('header', []))
-            except TypeError:
-                raise Exception("Selector expression string to be provided. Got " + kwargs.get('header', []))
-        else:
-            table_headers = [kwargs.get('prefix', '') + h + kwargs.get('suffix', '') for h in kwargs.get('header', [])]
-        if kwargs.get('table_type', 'rows') not in ["rows", "columns"]:
-            raise Exception("Specify 'rows' or 'columns' in table_type")
-        kwargs.update({'table_headers': table_headers})
-        if kwargs.get('table_type', 'rows') == "rows":
-            result_list = self.extract_rows(**kwargs)
-        else:
-            result_list = self.extract_columns(**kwargs)
-        return table_headers, result_list
diff --git a/scrapple/selectors/selector.py b/scrapple/selectors/selector.py
index bd01b9b..85a388a 100644
--- a/scrapple/selectors/selector.py
+++ b/scrapple/selectors/selector.py
@@ -8,6 +8,8 @@
 
 import random
 
+from scrapple.utils.text import make_ascii
+
 import requests
 from lxml import etree
 from lxml.etree import XPathError
@@ -72,6 +74,10 @@ def __init__(self, url):
         raise Exception('Ensure that you are connected to the Internet and that the page exists')
 
 
+    def get_tree_tag(self, selector='', get_one=False, *args, **kwargs):
+        raise NotImplementedError
+
+
     def extract_content(self, selector='', attr='', default='', connector='', *args, **kwargs):
         """
         Method for performing the content extraction for the particular selector type. \
@@ -93,14 +99,14 @@ def extract_content(self, selector='', attr='', default='', connector='', *args,
         :return: The extracted content
 
         """
         try:
-            if selector == "url":
+            if selector.lower() == "url":
                 return self.url
-            if attr == "text":
-                tag = self.get_selected_tag(selector=selector)
+            if attr.lower() == "text":
+                tag = self.get_tree_tag(selector=selector, get_one=True)
                 content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
                 content = content.replace("\n", " ").strip()
             else:
-                tag = self.get_selected_tag(selector=selector)
+                tag = self.get_tree_tag(selector=selector)
                 content = tag.get(attr)
                 if attr in ["href", "src"]:
                     content = urljoin(self.url, content)
             return content
         except IndexError:
             if default is not "":
                 return default
             raise Exception("There is no content for the %s selector - %s" % (self.__selector_type__, selector))
         except XPathError:
             raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
 
 
-    def get_selected_tag(self, selector='', *args, **kwargs):
-        raise NotImplementedError
-
-
     def extract_links(self, selector='', *args, **kwargs):
         """
         Method for performing the link extraction for the crawler. \
         ...
 
         """
         try:
-            links = self.get_links_for_crawling(selector=selector)
+            links = self.get_tree_tag(selector=selector)
             for link in links:
                 next_url = urljoin(self.url, link.get('href'))
                 yield type(self)(next_url)
         except XPathError:
             raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
         except Exception:
             raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
 
 
-    def get_links_for_crawling(self, selector='', *args, **kwargs):
-        raise NotImplementedError
+    def extract_tabular(self, header='', prefix='', suffix='', table_type='', *args, **kwargs):
+        """
+        Method for performing the tabular data extraction. \
+
+        :param result: A dictionary containing the extracted data so far
+        :param table_type: Can be "rows" or "columns". This determines the type of table to be extracted. \
+        A row extraction is when there is a single row to be extracted and mapped to a set of headers. \
+        A column extraction is when a set of rows have to be extracted, giving a list of header-value mappings.
+        :param header: The headers to be used for the table. This can be a list of headers, or a selector that gives the list of headers
+        :param prefix: A prefix to be added to each header
+        :param suffix: A suffix to be added to each header
+        :param selector: For row extraction, this is a selector that gives the row to be extracted. \
+        For column extraction, this is a list of selectors for each column.
+        :param attr: The attribute to be extracted from the selected tag
+        :param default: The default value to be used if the selector does not return any data
+        :param verbosity: The verbosity set as the argument for scrapple run
+        :return: A 2-tuple containing the list of all the column headers extracted and the list of \
+        dictionaries which contain (header, content) pairs
+        """
+        if type(header) in [str, unicode]:
+            try:
+                header_list = self.get_tree_tag(header)
+                table_headers = [prefix + h.text + suffix for h in header_list]
+            except XPathError:
+                raise Exception("Invalid %s selector for table header - %s" % (self.__selector_type__, header))
+            except Exception:
+                raise Exception("Invalid %s selector for table header - %s" % (self.__selector_type__, header))
+        else:
+            table_headers = [prefix + h + suffix for h in header]
+        if len(table_headers) == 0:
+            raise Exception("Invalid %s selector for table header - %s" % (self.__selector_type__, header))
+        if table_type not in ["rows", "columns"]:
+            raise Exception("Specify 'rows' or 'columns' in table_type")
+        if table_type == "rows":
+            result_list = self.extract_rows(table_headers=table_headers, *args, **kwargs)
+        else:
+            result_list = self.extract_columns(table_headers=table_headers, *args, **kwargs)
+        return table_headers, result_list
 
 
-    def extract_tabular(self, *args, **kwargs):
+    def extract_rows(self, result={}, selector='', table_headers=[], attr='', connector='', default='', verbosity=0, *args, **kwargs):
         """
-        Method for performing the tabular data extraction. \
-        A detailed description is provided in the derived classes.
+        Row data extraction for extract_tabular
         """
-        raise NotImplementedError
+        result_list = []
+        try:
+            values = self.get_tree_tag(selector)
+            if len(table_headers) >= len(values):
+                from itertools import izip_longest
+                pairs = izip_longest(table_headers, values, fillvalue=default)
+            else:
+                from itertools import izip
+                pairs = izip(table_headers, values)
+            for head, val in pairs:
+                if verbosity > 1:
+                    print("\nExtracting", head, "attribute", sep=' ', end='')
+                if attr.lower() == "text":
+                    try:
+                        content = connector.join([make_ascii(x).strip() for x in val.itertext()])
+                    except Exception:
+                        content = default
+                    content = content.replace("\n", " ").strip()
+                else:
+                    content = val.get(attr)
+                    if attr in ["href", "src"]:
+                        content = urljoin(self.url, content)
+                result[head] = content
+            result_list.append(result)
+        except XPathError:
+            raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
+        except TypeError:
+            raise Exception("Selector expression string to be provided. Got " + selector)
+
+        return result_list
+
+
+    def extract_columns(self, result={}, selector='', table_headers=[], attr='', connector='', default='', verbosity=0, *args, **kwargs):
+        """
+        Column data extraction for extract_tabular
+        """
+        result_list = []
+
+        try:
+            if type(selector) in [str, unicode]:
+                selectors = [selector]
+            elif type(selector) == list:
+                selectors = selector[:]
+            else:
+                raise Exception("Use a list of selector expressions for the various columns")
+            from itertools import izip, count
+            pairs = izip(table_headers, selectors)
+            columns = {}
+            for head, selector in pairs:
+                columns[head] = self.get_tree_tag(selector)
+            try:
+                for i in count(start=0):
+                    r = result.copy()
+                    for head in columns.keys():
+                        if verbosity > 1:
+                            print("\nExtracting", head, "attribute", sep=' ', end='')
+                        col = columns[head][i]
+                        if attr == "text":
+                            try:
+                                content = connector.join([make_ascii(x).strip() for x in col.itertext()])
+                            except Exception:
+                                content = default
+                            content = content.replace("\n", " ").strip()
+                        else:
+                            content = col.get(attr)
+                            if attr in ["href", "src"]:
+                                content = urljoin(self.url, content)
+                        r[head] = content
+                    result_list.append(r)
+            except IndexError:
+                pass
+        except XPathError:
+            raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
+        except TypeError:
+            raise Exception("Selector expression string to be provided. Got " + selector)
+
+        return result_list
diff --git a/scrapple/selectors/xpath.py b/scrapple/selectors/xpath.py
index cae3dea..243c196 100644
--- a/scrapple/selectors/xpath.py
+++ b/scrapple/selectors/xpath.py
@@ -6,8 +6,6 @@
 
 from __future__ import print_function
 
-from lxml.etree import XPathError
-
 from scrapple.selectors.selector import Selector
 from scrapple.utils.text import make_ascii
 
@@ -29,75 +27,8 @@ def __init__(self, url):
         super(XpathSelector, self).__init__(url)
 
 
-    def get_selected_tag(self, selector='', *args, **kwargs):
-        tag = self.tree.xpath(selector)[0]
-        return tag
-
-
-    def get_links_for_crawling(self, selector='', *args, **kwargs):
-        links = self.tree.xpath(selector)
-        return links
-
-
-    def extract_rows(self, *args, **kwargs):
-        """
-        Row data extraction for extract_tabular
-        """
-        result_list = []
-        result = kwargs.get('result', {})
-
-        try:
-            values = self.tree.xpath(kwargs.get('selector', ''))
-            if len(kwargs.get('table_headers', [])) >= len(values):
-                from itertools import izip_longest
-                pairs = izip_longest(kwargs.get('table_headers', []), values, fillvalue=kwargs.get('default', ''))
-            else:
-                from itertools import izip
-                pairs = izip(kwargs.get('table_headers', []), values)
-            for head, val in pairs:
-                if kwargs.get('verbosity', 0) > 1:
-                    print("\nExtracting", head, "attribute", sep=' ', end='')
-                if kwargs.get('attr', 'text') == "text":
-                    try:
-                        content = kwargs.get('connector', '').join([make_ascii(x).strip() for x in val.itertext()])
-                    except Exception:
-                        content = kwargs.get('default', '')
-                    content = content.replace("\n", " ").strip()
-                else:
-                    content = val.get(kwargs.get('attr', 'text'))
-                    if kwargs.get('attr', 'text') in ["href", "src"]:
-                        content = urljoin(self.url, content)
-                result[head] = content
-            result_list.append(result)
-        except XPathError:
-            raise Exception("Invalid XPath selector " + kwargs.get('selector', ''))
-        except TypeError:
-            raise Exception("Selector expression string to be provided. Got " + kwargs.get('selector', ''))
-
-        return result_list
-
-
-    def extract_columns(self, *args, **kwargs):
-        """
-        Column data extraction for extract_tabular
-        """
-        result_list = []
-        result = kwargs.get('result', {})
-
-        try:
-            if type(kwargs.get('selector', '')) in [str, unicode]:
-                selectors = [kwargs.get('selector', '')]
-            elif type(kwargs.get('selector', '')) == list:
-                selectors = kwargs.get('selector', '')
-            else:
-                raise Exception("Use a list of selector expressions for the various columns")
-            from itertools import izip, count
-            pairs = izip(kwargs.get('table_headers', []), selectors)
-            columns = {}
-            for head, selector in pairs:
-                columns[head] = self.tree.xpath(selector)
-            try:
-                for i in count(start=0):
-                    r = result.copy()
-                    for head in columns.keys():
-                        if kwargs.get('verbosity', 0) > 1:
-                            print("\nExtracting", head, "attribute", sep=' ', end='')
-                        col = columns[head][i]
-                        if kwargs.get('attr', 'text') == "text":
-                            try:
-                                content = kwargs.get('connector', '').join([make_ascii(x).strip() for x in col.itertext()])
-                            except Exception:
-                                content = kwargs.get('default', '')
-                            content = content.replace("\n", " ").strip()
-                        else:
-                            content = col.get(kwargs.get('attr', 'text'))
-                            if kwargs.get('attr', 'text') in ["href", "src"]:
-                                content = urljoin(self.url, content)
-                        r[head] = content
-                    result_list.append(r)
-            except IndexError:
-                pass
-        except XPathError:
-            raise Exception("Invalid XPath selector " + selector)
-        except TypeError:
-            raise Exception("Selector expression string to be provided. Got " + selector)
-
-        return result_list
-
-
-    def extract_tabular(self, *args, **kwargs):
-        """
-        Method for performing the extraction of tabular data.
-
-        :param result: A dictionary containing the extracted data so far
-        :param table_type: Can be "rows" or "columns". This determines the type of table to be extracted. \
-        A row extraction is when there is a single row to be extracted and mapped to a set of headers. \
-        A column extraction is when a set of rows have to be extracted, giving a list of header-value mappings.
-        :param header: The headers to be used for the table. This can be a list of headers, or a selector that gives the list of headers
-        :param prefix: A prefix to be added to each header
-        :param suffix: A suffix to be added to each header
-        :param selector: For row extraction, this is a selector that gives the row to be extracted. \
-        For column extraction, this is a list of selectors for each column.
-        :param attr: The attribute to be extracted from the selected tag
-        :param default: The default value to be used if the selector does not return any data
-        :param verbosity: The verbosity set as the argument for scrapple run
-        :return: A 2-tuple containing the list of all the column headers extracted and the list of \
-        dictionaries which contain (header, content) pairs
-        """
-        result = kwargs.get('result', {})
-        if type(kwargs.get('header', [])) in [str, unicode]:
-            try:
-                header_list = self.tree.xpath(kwargs.get('header', []))
-                table_headers = [kwargs.get('prefix', '') + h.text + kwargs.get('suffix', '') for h in header_list]
-            except XPathError:
-                raise Exception("Invalid XPath selector " + kwargs.get('header', []))
-        else:
-            table_headers = [kwargs.get('prefix', '') + h + kwargs.get('suffix', '') for h in kwargs.get('header', [])]
-        if kwargs.get('table_type', 'rows') not in ["rows", "columns"]:
-            raise Exception("Specify 'rows' or 'columns' in table_type")
-        kwargs.update({'table_headers': table_headers})
-        if kwargs.get('table_type', 'rows') == "rows":
-            result_list = self.extract_rows(**kwargs)
-        else:
-            result_list = self.extract_columns(**kwargs)
-        return table_headers, result_list
+    def get_tree_tag(self, selector='', get_one=False, *args, **kwargs):
+        tags = self.tree.xpath(selector)
+        if get_one:
+            return tags[0]
+        return tags
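The net effect of this patch is that each selector class shrinks to a single tree-lookup primitive, with every extraction method — content, links, and now tabular data — expressed in the base class in terms of it. A sketch of the ``get_tree_tag`` contract as the XPath flavour implements it (assumes lxml; the CSS flavour is identical except that it first translates the expression via ``cssselect``):

```python
from lxml import html

tree = html.fromstring('<ul><li>alpha</li><li>beta</li></ul>')

def get_tree_tag(selector, get_one=False):
    tags = tree.xpath(selector)
    if get_one:
        return tags[0]  # single-tag callers, e.g. extract_content
    return tags         # list callers, e.g. extract_links / extract_rows

print(get_tree_tag('//li', get_one=True).text)     # -> 'alpha'
print([tag.text for tag in get_tree_tag('//li')])  # -> ['alpha', 'beta']
```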
From 46f88b9702a581da5167c43368045198e9f26b51 Mon Sep 17 00:00:00 2001
From: Alex Mathew
Date: Wed, 25 Apr 2018 13:18:33 +0530
Subject: [PATCH 6/6] Fix minor issues and update tests

---
 scrapple/commands/run.py             |  2 +-
 scrapple/selectors/css.py            |  4 +-
 scrapple/selectors/selector.py       |  2 +-
 tests/expected_result2.json          |  1 -
 tests/expected_result2_20180428.json | 95 ++++++++++++++++++++++++++++
 tests/expected_result3.json          |  2 +-
 tests/project2.json                  | 56 ++++++++--------
 tests/project3.json                  | 25 +++-----
 tests/test_generate.py               |  2 +-
 tests/test_run.py                    |  2 +-
 10 files changed, 136 insertions(+), 55 deletions(-)
 delete mode 100644 tests/expected_result2.json
 create mode 100644 tests/expected_result2_20180428.json

diff --git a/scrapple/commands/run.py b/scrapple/commands/run.py
index 50a54e1..bbbe13d 100644
--- a/scrapple/commands/run.py
+++ b/scrapple/commands/run.py
@@ -128,7 +128,7 @@ def run(self):
             import json
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.json'), \
                 'w') as f:
-                json.dump(results, f, indent=3)
+                json.dump(results, f, indent=4)
         elif self.args['--output_type'] == 'csv':
             import csv
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.csv'), \
diff --git a/scrapple/selectors/css.py b/scrapple/selectors/css.py
index 12155d1..fb26a1d 100644
--- a/scrapple/selectors/css.py
+++ b/scrapple/selectors/css.py
@@ -29,9 +29,9 @@ def __init__(self, url):
         super(CssSelector, self).__init__(url)
 
 
-    def get_selected_tag(self, selector='', get_one=False, *args, **kwargs):
+    def get_tree_tag(self, selector='', get_one=False, *args, **kwargs):
         sel = cssselect.CSSSelector(selector)
         tags = sel(self.tree)
         if get_one:
             return tags[0]
-        return tag
+        return tags
diff --git a/scrapple/selectors/selector.py b/scrapple/selectors/selector.py
index 85a388a..1f824d3 100644
--- a/scrapple/selectors/selector.py
+++ b/scrapple/selectors/selector.py
@@ -106,7 +106,7 @@ def extract_content(self, selector='', attr='', default='', connector='', *args,
             content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
             content = content.replace("\n", " ").strip()
         else:
-            tag = self.get_tree_tag(selector=selector)
+            tag = self.get_tree_tag(selector=selector, get_one=True)
             content = tag.get(attr)
             if attr in ["href", "src"]:
                 content = urljoin(self.url, content)
diff --git a/tests/expected_result2.json b/tests/expected_result2.json
deleted file mode 100644
index 55e1365..0000000
--- a/tests/expected_result2.json
+++ /dev/null
@@ -1 +0,0 @@
-{"project": "project2", "data": [{"event": "Event: Boston Python Meetup", "talk": "How to test the hard stuff"}, {"event": "Event: Boston Python Meetup", "talk": "Testing: Where do I start?"}]}
\ No newline at end of file
diff --git a/tests/expected_result2_20180428.json b/tests/expected_result2_20180428.json
new file mode 100644
index 0000000..bdeac7b
--- /dev/null
+++ b/tests/expected_result2_20180428.json
@@ -0,0 +1,95 @@
+{
+    "project": "project2",
+    "data": [
+        {
+            "team": "Atlanta Hawks"
+        },
+        {
+            "team": "Boston Celtics"
+        },
+        {
+            "team": "Brooklyn Nets"
+        },
+        {
+            "team": "Charlotte Hornets"
+        },
+        {
+            "team": "Chicago Bulls"
+        },
+        {
+            "team": "Cleveland Cavaliers"
+        },
+        {
+            "team": "Dallas Mavericks"
+        },
+        {
+            "team": "Denver Nuggets"
+        },
+        {
+            "team": "Detroit Pistons"
+        },
+        {
+            "team": "Golden State Warriors"
+        },
+        {
+            "team": "Houston Rockets"
+        },
+        {
+            "team": "Indiana Pacers"
+        },
+        {
+            "team": "Los Angeles Clippers"
+        },
+        {
+            "team": "Los Angeles Lakers"
+        },
+        {
+            "team": "Memphis Grizzlies"
+        },
+        {
+            "team": "Miami Heat"
+        },
+        {
+            "team": "Milwaukee Bucks"
+        },
+        {
+            "team": "Minnesota Timberwolves"
+        },
+        {
+            "team": "New Orleans Pelicans"
+        },
+        {
+            "team": "New York Knicks"
+        },
+        {
+            "team": "Oklahoma City Thunder"
+        },
+        {
+            "team": "Orlando Magic"
+        },
+        {
+            "team": "Philadelphia 76ers"
+        },
+        {
+            "team": "Phoenix Suns"
+        },
+        {
+            "team": "Portland Trail Blazers"
+        },
+        {
+            "team": "Sacramento Kings"
+        },
+        {
+            "team": "San Antonio Spurs"
+        },
+        {
+            "team": "Toronto Raptors"
+        },
+        {
+            "team": "Utah Jazz"
+        },
+        {
+            "team": "Washington Wizards"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/tests/expected_result3.json b/tests/expected_result3.json
index caada20..b1acc98 100644
--- a/tests/expected_result3.json
+++ b/tests/expected_result3.json
@@ -1 +1 @@
-{"project": "project3", "data": [{"unknown": "", "speaker": "Kenneth Reitz", "talk_url": "http://pyvideo.org/pycon-us-2013/python-for-humans-1.html", "title": "Python for Humans"}]}
+{"project": "project3", "data": [{"show_url": "https://trakt.tv/shows/mr-robot", "year": "2015", "unknown": "<unknown>"}]}
diff --git a/tests/project2.json b/tests/project2.json
index 1c55aeb..16edcbf 100644
--- a/tests/project2.json
+++ b/tests/project2.json
@@ -1,32 +1,26 @@
-{
-    "project_name": "new_project2",
-    "selector_type": "css",
-    "scraping": {
-        "url": "http://pyvideo.org/events/boston-python-meetup.html",
-        "data": [
-            {
-                "field": "event",
-                "selector": "h2",
-                "attr": "text",
-                "connector": "",
-                "default": ""
-            }
-        ],
-        "next": [
-            {
-                "follow_link": "div.content-list div.row h4.entry-title a",
-                "scraping": {
-                    "data": [
-                        {
-                            "field": "talk",
-                            "selector": "h2",
-                            "attr": "text",
-                            "connector": "",
-                            "default": ""
-                        }
-                    ]
-                }
-            }
-        ]
-    }
-}
\ No newline at end of file
+{
+    "project_name": "project2_nba",
+    "selector_type": "css",
+    "scraping": {
+        "url": "https://www.basketball-reference.com/teams/",
+        "data": [
+
+        ],
+        "next": [
+            {
+                "follow_link": "#teams_active th > a",
+                "scraping": {
+                    "data": [
+                        {
+                            "field": "team",
+                            "selector": "div#info h1",
+                            "attr": "text",
+                            "default": "",
+                            "connector": ""
+                        }
+                    ]
+                }
+            }
+        ]
+    }
+}
diff --git a/tests/project3.json b/tests/project3.json
index 96e7fbb..3912626 100644
--- a/tests/project3.json
+++ b/tests/project3.json
@@ -2,35 +2,28 @@
     "project_name": "project3",
     "selector_type": "css",
     "scraping": {
-        "url": "http://pyvideo.org/pycon-us-2013/python-for-humans-1.html",
+        "url": "https://trakt.tv/shows/mr-robot",
         "data": [
             {
-                "field": "unknown",
-                "selector": "h1",
-                "attr": "text",
-                "connector": "",
-                "default": ""
-            },
-            {
-                "field": "talk_url",
+                "field": "show_url",
                 "selector": "url",
                 "attr": "",
                 "connector": "",
                 "default": ""
             },
             {
-                "field": "title",
-                "selector": "h2",
+                "field": "unknown",
+                "selector": "h6",
                 "attr": "text",
                 "connector": "",
-                "default": ""
+                "default": "<unknown>"
             },
             {
-                "field": "speaker",
-                "selector": ".author a",
+                "field": "year",
+                "selector": "span.year",
                 "attr": "text",
                 "connector": "",
-                "default": "<speaker>"
+                "default": "<year>"
             }
         ]
     }
diff --git a/tests/test_generate.py b/tests/test_generate.py
index dbfb38f..ab18c2f 100644
--- a/tests/test_generate.py
+++ b/tests/test_generate.py
@@ -51,7 +51,7 @@ def test_css_scraper_generate():
     with open(os.path.join(os.getcwd(), 'project2.py'), 'r') as f:
         program = f.read()
     assert_in("from scrapple.selectors.css import CssSelector", program)
-    assert_in('page0 = CssSelector("http://pyvideo.org/events/boston-python-meetup.html")', program)
+    assert_in('page0 = CssSelector("https://www.basketball-reference.com/teams/")', program)
 
 
 def test_nonexistent_project():
diff --git a/tests/test_run.py b/tests/test_run.py
index 27e19f6..39cb5d7 100644
--- a/tests/test_run.py
+++ b/tests/test_run.py
@@ -62,7 +62,7 @@ def test_run_css_crawler():
     rc.execute_command()
     with open(os.path.join(os.getcwd(), 'result2.json'), 'r') as f:
         result = json.load(f)
-    with open(os.path.join(os.getcwd(), 'expected_result2.json'), 'r') as f:
+    with open(os.path.join(os.getcwd(), 'expected_result2_20180428.json'), 'r') as f:
         expected_result = json.load(f)
     assert_dict_equal(result, expected_result)
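With the series applied end to end, the refactored call path can be exercised directly. A hedged sketch (requires network access; the URL is the one the updated test fixtures point at, and the XPath expression is illustrative rather than taken from the tests):

```python
from scrapple.selectors.xpath import XpathSelector

# __init__ fetches the page and builds the lxml element tree.
page = XpathSelector('https://www.basketball-reference.com/teams/')

# Routed through Selector.extract_content -> XpathSelector.get_tree_tag.
heading = page.extract_content(selector='//h1', attr='text', connector=' ')
print(heading)

# 'url' is special-cased to return the page URL itself.
print(page.extract_content(selector='url'))
```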