From 247c5bbbd84566496746cd0a45ee5bf2c3e38057 Mon Sep 17 00:00:00 2001 From: king-millez Date: Mon, 26 Oct 2020 20:19:56 +1100 Subject: [PATCH 1/6] Start vic_parliament --- ausbills/vic_parliament.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 ausbills/vic_parliament.py diff --git a/ausbills/vic_parliament.py b/ausbills/vic_parliament.py new file mode 100644 index 0000000..d974c17 --- /dev/null +++ b/ausbills/vic_parliament.py @@ -0,0 +1,5 @@ +import json +from requests import get +from bs4 import BeautifulSoup + +api_url = 'https://www.legislation.vic.gov.au/app/api/search/title_content?from=0&size=9999&includeFilters[0][prefix][title_az]=&includeFilters[1][term][type]=bill&includeFields[0]=title&includeFields[1]=field_in_force_former_title&includeFields[2]=url&includeFields[3]=type&includeFields[4]=legislation_type&includeFields[5]=field_act_sr_number&includeFields[6]=legislation_year&includeFields[7]=field_act_sr_status_date&includeFields[8]=field_legislation_status&includeFields[9]=field_bill_pre_2004&includeFields[10]=field_bill_parliament_term&sort[0][_score]=desc&sort[1][title_az]=asc&aggregations[legislation_year][terms][field]=legislation_year&aggregations[legislation_year][terms][order][_key]=desc&aggregations[legislation_year][terms][size]=250&aggregations[field_legislation_status][terms][field]=field_legislation_status&aggregations[field_legislation_status][terms][order][_key]=asc&aggregations[field_legislation_status][terms][size]=250' \ No newline at end of file From c5d6d4bb0a96a9a108669486ad81429e105b1261 Mon Sep 17 00:00:00 2001 From: king-millez Date: Tue, 27 Oct 2020 14:33:09 +1100 Subject: [PATCH 2/6] Add vic_All_Bills() --- ausbills/vic_parliament.py | 39 +++++++++++++++++++++++++++++++++++++- tests/test_vic.py | 12 ++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 tests/test_vic.py diff --git a/ausbills/vic_parliament.py b/ausbills/vic_parliament.py index d974c17..2c1d164 100644 --- 
a/ausbills/vic_parliament.py +++ b/ausbills/vic_parliament.py @@ -2,4 +2,41 @@ from requests import get from bs4 import BeautifulSoup -api_url = 'https://www.legislation.vic.gov.au/app/api/search/title_content?from=0&size=9999&includeFilters[0][prefix][title_az]=&includeFilters[1][term][type]=bill&includeFields[0]=title&includeFields[1]=field_in_force_former_title&includeFields[2]=url&includeFields[3]=type&includeFields[4]=legislation_type&includeFields[5]=field_act_sr_number&includeFields[6]=legislation_year&includeFields[7]=field_act_sr_status_date&includeFields[8]=field_legislation_status&includeFields[9]=field_bill_pre_2004&includeFields[10]=field_bill_parliament_term&sort[0][_score]=desc&sort[1][title_az]=asc&aggregations[legislation_year][terms][field]=legislation_year&aggregations[legislation_year][terms][order][_key]=desc&aggregations[legislation_year][terms][size]=250&aggregations[field_legislation_status][terms][field]=field_legislation_status&aggregations[field_legislation_status][terms][order][_key]=asc&aggregations[field_legislation_status][terms][size]=250' \ No newline at end of file +base_url = 'https://www.legislation.vic.gov.au/' +api_url = base_url + 
'app/api/search/title_content?from=0&size=9999&includeFilters[0][prefix][title_az]=&includeFilters[1][term][type]=bill&includeFields[0]=title&includeFields[1]=field_in_force_former_title&includeFields[2]=url&includeFields[3]=type&includeFields[4]=legislation_type&includeFields[5]=field_act_sr_number&includeFields[6]=legislation_year&includeFields[7]=field_act_sr_status_date&includeFields[8]=field_legislation_status&includeFields[9]=field_bill_pre_2004&includeFields[10]=field_bill_parliament_term&sort[0][_score]=desc&sort[1][title_az]=asc&aggregations[legislation_year][terms][field]=legislation_year&aggregations[legislation_year][terms][order][_key]=desc&aggregations[legislation_year][terms][size]=250&aggregations[field_legislation_status][terms][field]=field_legislation_status&aggregations[field_legislation_status][terms][order][_key]=asc&aggregations[field_legislation_status][terms][size]=250' + +URL = 'url' +ID = 'id' +SHORT_TITLE = 'short_title' +STATUS = 'status' +YEAR = 'year' +BILL_TYPE = 'bill_type' + +class vic_All_Bills(object): + _bills_data = [] + + def __init__(self): + try: + self.scrape_data() + except Exception as e: + raise Exception('Error when scraping bills:\n' + e) + + def scrape_data(self): + data_json = json.loads(get(api_url).text) + results = data_json['results'] + for result in range(len(results)): + _result = results[result] + _bill_url = base_url + _result['url'][0][8:] + _bill_id = _result['_id'][12:-3] + _bill_status = _result['field_legislation_status'][0] + _bill_year = _result['legislation_year'][0] + _bill_title = _result['title'][0] + _bill_type = _result['type'][0] + bill_dict = {URL: _bill_url, ID: _bill_id, STATUS: _bill_status, YEAR: _bill_year, SHORT_TITLE: _bill_title, BILL_TYPE: _bill_type} + self._bills_data.append(bill_dict) + + @property + def data(self): + return(self._bills_data) + +vic_all_bills = vic_All_Bills().data \ No newline at end of file diff --git a/tests/test_vic.py b/tests/test_vic.py new file mode 100644 
index 0000000..4f177f7 --- /dev/null +++ b/tests/test_vic.py @@ -0,0 +1,12 @@ +from ausbills.vic_parliament import vic_all_bills +import pytest +import random +import io + +def test_vic(): + all_the_bills_mate = vic_all_bills + random_numbers = [int(random.random()*len(all_the_bills_mate)) for i in range(5)] + bills_sample = [all_the_bills_mate[i] for i in random_numbers] + + for bill in bills_sample: + print(bill) \ No newline at end of file From f5742b8f00fa4b258c539489a14040f56037b99c Mon Sep 17 00:00:00 2001 From: king-millez Date: Wed, 28 Oct 2020 13:36:57 +1100 Subject: [PATCH 3/6] Start vic_Bill() object --- ausbills/vic_parliament.py | 81 +++++++++++++++++++++++++++++++++++++- setup.py | 2 +- tests/test_vic.py | 6 +-- 3 files changed, 84 insertions(+), 5 deletions(-) diff --git a/ausbills/vic_parliament.py b/ausbills/vic_parliament.py index 2c1d164..319476a 100644 --- a/ausbills/vic_parliament.py +++ b/ausbills/vic_parliament.py @@ -1,3 +1,4 @@ +import re import json from requests import get from bs4 import BeautifulSoup @@ -9,8 +10,13 @@ ID = 'id' SHORT_TITLE = 'short_title' STATUS = 'status' +STATUS_REPORT = 'status_report' YEAR = 'year' BILL_TYPE = 'bill_type' +EXPLANATORY_MEMORANDUM = 'explanatory_memorandum' +LOWER_SPONSOR = 'lower_sponsor' +CIRCULATION_PRINT = 'circulation_print' +UPPER_SPONSOR = 'upper_sponsor' class vic_All_Bills(object): _bills_data = [] @@ -39,4 +45,77 @@ def scrape_data(self): def data(self): return(self._bills_data) -vic_all_bills = vic_All_Bills().data \ No newline at end of file +vic_all_bills = vic_All_Bills().data + +class vic_Bill(object): + def __init__(self, input): + if(isinstance(input, dict)): + try: + self.create_vars(input) + except Exception as e: + raise Exception('Dict must have correct keys, missing key ' + e) + else: + raise ValueError('Input data must be valid vic_Bill dict data...') + + def create_vars(self, init_data): + self._bill_data = init_data + self.url = init_data[URL] + self.id = init_data[ID] + 
self.status = init_data[STATUS] + self.year = init_data[YEAR] + self.short_title = init_data[SHORT_TITLE] + self.bill_type = init_data[BILL_TYPE] + try: + self.bill_soup = BeautifulSoup(get(self.url).text, 'lxml') + except: + raise Exception('Unable to scrape ' + self.url) + + @property + def status_report(self): + try: + return(self.bill_soup.find('li', {'data-tid': 'Status Report'}).find('a')['href'].strip()) + except: + return '' + + @property + def explanatory_memorandum(self): + try: + return(self.bill_soup.find('li', {'data-tid': re.compile(r'(Introduction print – Explanatory Memorandum|Circulation print – Explanatory Memorandum)')}).find('a')['href'].strip()) + except: + return '' + + @property + def circulation_print(self): + try: + return(self.bill_soup.find('li', {'data-tid': 'Circulation print – Bill'}).find('a')['href'].strip()) + except: + return '' + + @property + def introduction_print(self): + try: + return(self.bill_soup.find('li', {'data-tid': 'Introduction print – Bill'}).find('a')['href'].strip()) + except: + return '' + + @property + def lower_sponsor(self): + try: + return(self.bill_soup.find_all('span', {'class': 'lgs-bill-table__term-title--bold'})[0].text.replace('Hon', '').replace('.', '').strip()) + except: + return '' + + @property + def upper_sponsor(self): + try: + return(self.bill_soup.find_all('span', {'class': 'lgs-bill-table__term-title--bold'})[-1].text.replace('Hon', '').replace('.', '').strip()) + except: + return '' + + @property + def data(self): + self._bill_data[STATUS_REPORT] = self.status_report + self._bill_data[EXPLANATORY_MEMORANDUM] = self.explanatory_memorandum + self._bill_data[LOWER_SPONSOR] = self.lower_sponsor + self._bill_data[CIRCULATION_PRINT] = self.circulation_print + return(self._bill_data) \ No newline at end of file diff --git a/setup.py b/setup.py index c0fe6a8..94c764e 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ import sys from setuptools.command.install import install -VERSION = "0.4.0" 
+VERSION = "0.8.0" with open("README.md", "r") as fh: long_description = fh.read() diff --git a/tests/test_vic.py b/tests/test_vic.py index 4f177f7..689a719 100644 --- a/tests/test_vic.py +++ b/tests/test_vic.py @@ -1,12 +1,12 @@ -from ausbills.vic_parliament import vic_all_bills +from ausbills.vic_parliament import vic_all_bills, vic_Bill import pytest import random import io def test_vic(): all_the_bills_mate = vic_all_bills - random_numbers = [int(random.random()*len(all_the_bills_mate)) for i in range(5)] + random_numbers = [int(random.random()*len(all_the_bills_mate)) for i in range(10)] bills_sample = [all_the_bills_mate[i] for i in random_numbers] for bill in bills_sample: - print(bill) \ No newline at end of file + print(vic_Bill(bill).lower_sponsor) \ No newline at end of file From 75710ebc02f87fe673917d386d16910686b6e4a7 Mon Sep 17 00:00:00 2001 From: king-millez Date: Wed, 28 Oct 2020 17:17:05 +1100 Subject: [PATCH 4/6] Finish vic_Bill() --- ausbills/act_parliament.py | 2 - ausbills/nt_parliament.py | 2 - ausbills/qld_parliament.py | 2 - ausbills/sa_parliament.py | 2 - ausbills/tas_parliament.py | 2 - ausbills/vic_parliament.py | 123 ++++++++++++++++++++++++++++++++++++- tests/test_vic.py | 3 +- 7 files changed, 123 insertions(+), 13 deletions(-) diff --git a/ausbills/act_parliament.py b/ausbills/act_parliament.py index 1e49310..999640b 100644 --- a/ausbills/act_parliament.py +++ b/ausbills/act_parliament.py @@ -152,8 +152,6 @@ def data(self): act_all_bills = All_Bills().data class act_Bill(object): - _all_bills = act_all_bills - def __init__(self, input): if(isinstance(input, dict)): try: diff --git a/ausbills/nt_parliament.py b/ausbills/nt_parliament.py index 4a894c9..a302070 100644 --- a/ausbills/nt_parliament.py +++ b/ausbills/nt_parliament.py @@ -48,8 +48,6 @@ def data(self): nt_all_bills = nt_All_Bills().data class nt_Bill(object): - _nt_all_bills = nt_all_bills - def __init__(self, input): if(isinstance(input, dict)): try: diff --git 
a/ausbills/qld_parliament.py b/ausbills/qld_parliament.py index 1b44783..c04af0c 100644 --- a/ausbills/qld_parliament.py +++ b/ausbills/qld_parliament.py @@ -55,8 +55,6 @@ def data(self): qld_all_bills = qld_All_Bills().data class qld_Bill(object): - _all_bills = qld_all_bills - def __init__(self, input): if(isinstance(input, dict)): try: diff --git a/ausbills/sa_parliament.py b/ausbills/sa_parliament.py index e339a93..c22d53d 100644 --- a/ausbills/sa_parliament.py +++ b/ausbills/sa_parliament.py @@ -42,8 +42,6 @@ def data(self): sa_all_bills = sa_All_Bills().data class sa_Bill(object): - _all_bills = sa_all_bills - def __init__(self, input): if(isinstance(input, dict)): try: diff --git a/ausbills/tas_parliament.py b/ausbills/tas_parliament.py index 858934d..8bf0c48 100644 --- a/ausbills/tas_parliament.py +++ b/ausbills/tas_parliament.py @@ -53,8 +53,6 @@ def data(self): tas_all_bills = tas_All_Bills().data class tas_Bill(object): - _all_bills = tas_all_bills - def __init__(self, input): if(isinstance(input, dict)): try: diff --git a/ausbills/vic_parliament.py b/ausbills/vic_parliament.py index 319476a..c9858b9 100644 --- a/ausbills/vic_parliament.py +++ b/ausbills/vic_parliament.py @@ -17,6 +17,21 @@ LOWER_SPONSOR = 'lower_sponsor' CIRCULATION_PRINT = 'circulation_print' UPPER_SPONSOR = 'upper_sponsor' +INTRODUCTION_PRINT = 'introduction_print' +PASSED_PRINT = 'passed_print' +AMENDED_PRINT = 'amended_print' +LOWER_FIRST_READING_PASSED = 'lower_first_reading_passed' +LOWER_SECOND_READING_PASSED = 'lower_second_reading_passed' +LOWER_SECOND_READING_MOVED = 'lower_second_reading_moved' +LOWER_THIRD_READING_PASSED = 'lower_third_reading_passed' +LOWER_THIRD_READING_MOVED = 'lower_third_reading_moved' +UPPER_FIRST_READING_PASSED = 'upper_first_reading_passed' +UPPER_SECOND_READING_PASSED = 'upper_second_reading_passed' +UPPER_SECOND_READING_MOVED = 'upper_second_reading_moved' +UPPER_THIRD_READING_PASSED = 'upper_third_reading_passed' +UPPER_THIRD_READING_MOVED = 
'upper_third_reading_moved' +ASSENTED = 'assented' +ACT_NO = 'act_no' class vic_All_Bills(object): _bills_data = [] @@ -69,7 +84,21 @@ def create_vars(self, init_data): self.bill_soup = BeautifulSoup(get(self.url).text, 'lxml') except: raise Exception('Unable to scrape ' + self.url) - + + @property + def act_no(self): + try: + div = self.bill_soup.find('div', {'data-tid': 'Royal Assent'}) + li = div.find_all('li')[-1] + text = li.find('dt', {'class': 'lgs-bill-table__definition-term lgs-bill-table__term-title'}) + return(text.text.replace('Act number ', '').replace('/', '-').strip()) + except: + return '' + + @property + def assented(self): + return(self.find_reading('U', 'Royal Assent given')) + @property def status_report(self): try: @@ -80,7 +109,7 @@ def status_report(self): @property def explanatory_memorandum(self): try: - return(self.bill_soup.find('li', {'data-tid': re.compile(r'(Introduction print – Explanatory Memorandum|Circulation print – Explanatory Memorandum)')}).find('a')['href'].strip()) + return(self.bill_soup.find('li', {'data-tid': re.compile(r'(Introduction print – Explanatory Memorandum|Circulation print – Explanatory Memorandum|Amended print – Explanatory Memorandum)')}).find('a')['href'].strip()) except: return '' @@ -112,10 +141,100 @@ def upper_sponsor(self): except: return '' + @property + def passed_print(self): + try: + return(self.bill_soup.find('li', {'data-tid': 'As passed print – Bill'}).find('a')['href'].strip()) + except: + return '' + + @property + def amended_print(self): + try: + return(self.bill_soup.find('li', {'data-tid': 'Amended print – Bill'}).find('a')['href'].strip()) + except: + return '' + + @property + def lower_first_reading_passed(self): + return(self.find_reading('L', 'First reading passed')) + + @property + def lower_second_reading_passed(self): + return(self.find_reading('L', 'Second reading passed')) + + @property + def lower_third_reading_passed(self): + return(self.find_reading('L', 'Third reading passed')) + 
+ @property + def lower_second_reading_moved(self): + return(self.find_reading('L', 'Second reading moved')) + + @property + def lower_third_reading_moved(self): + return(self.find_reading('L', 'Third reading moved')) + + @property + def upper_first_reading_passed(self): + return(self.find_reading('U', 'First reading passed')) + + @property + def upper_second_reading_passed(self): + return(self.find_reading('U', 'Second reading passed')) + + @property + def upper_third_reading_passed(self): + return(self.find_reading('U', 'Third reading passed')) + + @property + def upper_second_reading_moved(self): + return(self.find_reading('U', 'Second reading moved')) + + @property + def upper_third_reading_moved(self): + return(self.find_reading('U', 'Third reading moved')) + + def find_reading(self, house, input_keyword): + if house == 'L': + try: + dat = self.bill_soup.find(text=input_keyword) + return(self.format_date(dat.parent.parent.find('dd', {'class': 'lgs-bill-table__definition-term lgs-bill-table__term-date'}).text)) + except: + return '' + elif house == 'U': + try: + dat = self.bill_soup.find_all(text=input_keyword)[-1] + return(self.format_date(dat.parent.parent.find('dd', {'class': 'lgs-bill-table__definition-term lgs-bill-table__term-date'}).text)) + except: + return '' + else: + raise ValueError('No such house: ' + house) + + def format_date(self, input_date): + dateSplit = input_date.split('/', 2) + return("{:0>2s}".format(dateSplit[0]) + '-' "{:0>2s}".format(dateSplit[1]) + '-' + dateSplit[2]) + @property def data(self): self._bill_data[STATUS_REPORT] = self.status_report self._bill_data[EXPLANATORY_MEMORANDUM] = self.explanatory_memorandum self._bill_data[LOWER_SPONSOR] = self.lower_sponsor + self._bill_data[UPPER_SPONSOR] = self.upper_sponsor self._bill_data[CIRCULATION_PRINT] = self.circulation_print + self._bill_data[INTRODUCTION_PRINT] = self.introduction_print + self._bill_data[PASSED_PRINT] = self.passed_print + self._bill_data[AMENDED_PRINT] = 
self.amended_print + self._bill_data[LOWER_FIRST_READING_PASSED] = self.lower_first_reading_passed + self._bill_data[LOWER_SECOND_READING_MOVED] = self.lower_second_reading_moved + self._bill_data[LOWER_SECOND_READING_PASSED] = self.lower_second_reading_passed + self._bill_data[LOWER_THIRD_READING_MOVED] = self.lower_third_reading_moved + self._bill_data[LOWER_THIRD_READING_PASSED] = self.lower_third_reading_passed + self._bill_data[UPPER_FIRST_READING_PASSED] = self.upper_first_reading_passed + self._bill_data[UPPER_SECOND_READING_MOVED] = self.upper_second_reading_moved + self._bill_data[UPPER_SECOND_READING_PASSED] = self.upper_second_reading_passed + self._bill_data[UPPER_THIRD_READING_MOVED] = self.upper_third_reading_moved + self._bill_data[UPPER_THIRD_READING_PASSED] = self.upper_third_reading_passed + self._bill_data[ASSENTED] = self.assented + self._bill_data[ACT_NO] = self.act_no return(self._bill_data) \ No newline at end of file diff --git a/tests/test_vic.py b/tests/test_vic.py index 689a719..12d751c 100644 --- a/tests/test_vic.py +++ b/tests/test_vic.py @@ -1,6 +1,7 @@ from ausbills.vic_parliament import vic_all_bills, vic_Bill import pytest import random +import json import io def test_vic(): @@ -9,4 +10,4 @@ def test_vic(): bills_sample = [all_the_bills_mate[i] for i in random_numbers] for bill in bills_sample: - print(vic_Bill(bill).lower_sponsor) \ No newline at end of file + print(json.dumps(vic_Bill(bill).data, indent=2)) \ No newline at end of file From 406d7becccb3cf8d1847b29db601b742c9292502 Mon Sep 17 00:00:00 2001 From: king-millez Date: Wed, 28 Oct 2020 17:34:42 +1100 Subject: [PATCH 5/6] Remove other parliament changes --- ausbills/act_parliament.py | 289 ------------------------------------- ausbills/nt_parliament.py | 143 ------------------ ausbills/qld_parliament.py | 150 ------------------- ausbills/sa_parliament.py | 93 ------------ ausbills/tas_parliament.py | 205 -------------------------- 5 files changed, 880 deletions(-) delete 
mode 100644 ausbills/act_parliament.py delete mode 100644 ausbills/nt_parliament.py delete mode 100644 ausbills/qld_parliament.py delete mode 100644 ausbills/sa_parliament.py delete mode 100644 ausbills/tas_parliament.py diff --git a/ausbills/act_parliament.py b/ausbills/act_parliament.py deleted file mode 100644 index 999640b..0000000 --- a/ausbills/act_parliament.py +++ /dev/null @@ -1,289 +0,0 @@ -from bs4 import BeautifulSoup -import requests -import datetime -import calendar -import re - -DATE = 'date' -URL = 'url' -TITLE = 'title' -DESCRIPTION = 'description' -PRESENTED_BY = 'presented_by' -TYPE = 'type' -STATUS = 'status' -TEXT_URL = 'text_url' -SCRUTINY_REPORT = 'scrutiny_report' -PRESENTATION_SPEECH = 'presentation_speech' -HANSARD = 'hansard' -EXPLANATORY_STATEMENT = 'explanatory_statement' -COMPATIBILITY_STATEMENT = 'compatibility_statement' - -ninth_assembly_bills = "https://www.parliament.act.gov.au/parliamentary-business/in-the-chamber/bills/summary_of_bills" -eighth_assembly_bills = "https://www.parliament.act.gov.au/parliamentary-business/in-the-chamber/previous-assemblies/eighth-assembly/summary_of_bills" -ninth_siteData = requests.get(ninth_assembly_bills).text -eighth_siteData = requests.get(eighth_assembly_bills).text - -class All_Bills(object): - _bills_data = [] # This list will end up containing all the bill dict entries, and is the data returned. 
- - def __init__(self): - self._build_dataset() - - def _build_dataset(self): - try: - self._scrape_9th_assembly() - except Exception as e: - print('An exception ocurred when trying to scrape the 9th Assembly:\n') - print(e) - - try: - self._scrape_8th_assembly() - except Exception as e: - print('An exception ocurred when trying to scrape the 8th Assembly:\n') - print(e) - - def _scrape_9th_assembly(self): - billPres = [] - billDescs = [] - - soup = BeautifulSoup(ninth_siteData, 'html.parser') - div = soup.find("div", {"id": "main"}) - billTitles = div.find_all('h4') - for h4 in div.find_all('h4'): - h4.replace_with('') # Remove all

s from the soup, this makes it less annoying to get the bill presenter string from tags. The ACT Government, man, it's weird. - - billData = div.find_all(re.compile(r'(div|p)')) - allStrong = div.find_all('strong') - for strong in range(len(allStrong)): - if('This bill' in allStrong[strong].text or 'e bill will also' in allStrong[strong].text): - pass - else: - billPres.append(allStrong[strong]) - for entry in billData: - if "This bill" in entry.text or "this bill" in entry.text: - billDescs.append(entry) - - for title in range(len(billTitles)): # Here we loop through every bill and compile its information into an entry in _bills_data - _bill_title = billTitles[title].text - a = billTitles[title].find('a') - if(a == None): - if('Aboriginal and Torres Strait Islander Elected Body Amendment Bill 2020' in billTitles[title].text): - _bill_url = 'https://www.legislation.act.gov.au/a/2020-36/' - else: - _bill_url = '' - else: - _bill_url = a['href'] - _bill_description = billDescs[title].text - _bill_presented_by = self._format_presenter_9th(billPres[title].text)[0][13:] - _bill_date = self._format_presenter_9th(billPres[title].text)[1] - - bill_dict = {URL: _bill_url, TITLE: _bill_title, DESCRIPTION: _bill_description.replace("\xa0\xa0", " ").replace('‑', '-'), PRESENTED_BY: _bill_presented_by, DATE: _bill_date} - - self._bills_data.append(bill_dict) - - def _format_presenter_9th(self, title): - formatted = [] - splitUp = title.replace('\xa0', ' ').split('—', 1) - formatted.append(splitUp[0]) - dateSplit = splitUp[1].split(' ', 2) - monthNum = list(calendar.month_name).index(dateSplit[1]) - finalDate = dateSplit[2] + '-' + "{0:0=2d}".format(monthNum) + '-' + "{0:0=2d}".format(int(dateSplit[0])) - formatted.append(finalDate) - return(formatted) - - def _scrape_8th_assembly(self): - billDescs = [] - billTitles = [] - billScrutinyReports = [] - - soup = BeautifulSoup(eighth_siteData, 'html.parser') - div = soup.find('div', {'id': 'main'}) - paras = div.find_all('p')[8:] - 
for p in range(len(paras)): - if "" in str(paras[p]): - billTitles.append(paras[p]) - - elif "y Report " in str(paras[p]) or "Statement" in str(paras[p]): - reports = paras[p].find_all('a', {'href': True}) - urls = [] - for report in range(len(reports)): - urls.append(reports[report]['href']) - billScrutinyReports.append(urls) - - else: - billDescs.append(paras[p].text.replace('‑', '-')) - - for bill in range(len(billTitles)): - _bill_title = self._format_presenter_8th(billTitles[bill].text)[0].replace('‑', '-') - _bill_presented_by = self._format_presenter_8th(billTitles[bill].text)[1].replace('‑', '-') - _bill_date = self._format_presenter_8th(billTitles[bill].text)[2].replace('‑', '-') - _bill_url = billTitles[bill].find('a')['href'] - _bill_description = billDescs[bill].replace('‑', '-') - bill_dict = {TITLE: _bill_title, URL: _bill_url, DESCRIPTION: _bill_description, DATE: _bill_date, PRESENTED_BY: _bill_presented_by} - self._bills_data.append(bill_dict) - - def _format_presenter_8th(self, title): - count = 0 - formatted = [] - - for char in title: - if char == '—': - count = count + 1 - - if count > 2: - title = title.replace('—', ' - ', 1) # The bill https://www.legislation.act.gov.au/b/db_47854/default.asp contains an extra '—', this hacks around it. - - splitUp = title.replace('\xa0', ' ').split('—', 2) - formatted.extend([splitUp[0], splitUp[1]]) - if(splitUp[2][0] == ' '): # The Gaming Machine Amendment Bill 2013 (No. 2) contains a space before the "6" in its date (of course), so we need to do this, otherwise funky things happen. 
- edit = splitUp[2][1:] - splitUp.remove(splitUp[2]) - splitUp.append(edit) - dateSplit = splitUp[2].split(' ', 2) - monthNum = list(calendar.month_name).index(dateSplit[1]) - finalDate = dateSplit[2] + '-' + "{0:0=2d}".format(monthNum) + '-' + "{0:0=2d}".format(int(dateSplit[0])) - formatted.append(finalDate) - return(formatted) - - @property - def data(self): - return(self._bills_data) - -act_all_bills = All_Bills().data - -class act_Bill(object): - def __init__(self, input): - if(isinstance(input, dict)): - try: - self.create_vars(input) - except Exception as e: - raise Exception('Dict must have the correct keys. Missing key ' - + str(e)) - else: - raise TypeError('Input must be valid dict data.') - - - def create_vars(self, init_data): - self._bill_data = init_data - self.url = init_data[URL] - self.date = init_data[DATE] - self.title = init_data[TITLE] - self.description = init_data[DESCRIPTION] - self.presented_by = init_data[PRESENTED_BY] - try: - self.bill_soup = BeautifulSoup(requests.get(self.url).text, 'lxml') - except: - if(self.url == 'file:///%5C%5Cact.gov.au%5Cassembly%5Clasec%5CChamber%5CLA%20Secretariat%20%231%5CNOTICEPAPER%5CBills%5CSummary%20of%20Bills%5CEighth%20Assembly%5CThis%20bill%20will%20establish%20the%20legislative%20framework%20for%20the%20operation%20of%20a%20secure%20mental%20health%20facility%20in%20the%20ACT'): - self.url = 'https://www.legislation.act.gov.au/a/2011-35/' - self.bill_soup = BeautifulSoup(requests.get(self.url).text, 'lxml') - else: - raise Exception('Invalid bill URL, unable to scrape. 
' + self.url) - - @property - def bill_type(self): - return(self.get_bill_type()) - - def get_bill_type(self): - basic_data = self.bill_soup.find('dl') - try: - _billtype = basic_data.find_all('dd') - except: - return '' - return(_billtype[0].text) - - @property - def status(self): - return(self.get_bill_status()) - - def get_bill_status(self): - basic_data = self.bill_soup.find('dl') - _billtype = basic_data.find_all('dd') - return(_billtype[2].text) - - @property - def bill_text_url(self): - return(self.get_bill_text()) - - def get_bill_text(self): - try: - a = self.bill_soup.find('a', {'class', 'button viewable pdf'}) - return('https://www.legislation.act.gov.au' + a['href']) - except: - return '' - - @property - def scrutiny_report(self): - return(self.get_scrutiny_report()) - - def get_scrutiny_report(self): - table = self.bill_soup.find('table', {'class': 'datatable display'}) - td = table.find('td', {'class': 'notes'}) - for a in td.find_all('a'): - if(not ' Scrutiny Committee' in a.text and 'Scrutiny Committee' in a.text): - scrutiny_url = a['href'] - try: - return(scrutiny_url) - except: - return('') - - @property - def presentation_speech(self): - table = self.bill_soup.find('table', {'class': 'datatable display'}) - try: - td = table.find('td', {'class': 'notes'}) - for a in td.find_all('a'): - if('Presentation speech' in a.text): - speech_url = a['href'] - try: - return(speech_url) - except: - return('') - except: - return '' - - @property - def hansard(self): - table = self.bill_soup.find('table', {'class': 'datatable display'}) - td = table.find('td', {'class': 'notes'}) - for a in td.find_all('a'): - if('Hansard debate' in a.text): - hansard_url = a['href'] - try: - return(hansard_url) - except: - return('') - - @property - def explanatory_statement(self): - table = self.bill_soup.find_all('table', {'class': 'datatable display'})[1] - a = table.find('a') - try: - return('https://www.legislation.act.gov.au' + a['href']) - except: - return('') - - 
@property - def compatibility_statement(self): - table = self.bill_soup.find_all('table', {'class': 'datatable display'})[2] - a = table.find('a') - try: - return('https://www.legislation.act.gov.au' + a['href']) - except: - return('') - - @property - def data(self): - self._bill_data[URL] = self.url - self._bill_data[TITLE] = self.title - self._bill_data[DATE] = self.date - self._bill_data[DESCRIPTION] = self.description - self._bill_data[PRESENTED_BY] = self.presented_by - self._bill_data[TYPE] = self.bill_type - self._bill_data[STATUS] = self.status - self._bill_data[TEXT_URL] = self.bill_text_url - self._bill_data[SCRUTINY_REPORT] = self.scrutiny_report - self._bill_data[PRESENTATION_SPEECH] = self.presentation_speech - self._bill_data[HANSARD] = self.hansard - self._bill_data[EXPLANATORY_STATEMENT] = self.explanatory_statement - self._bill_data[COMPATIBILITY_STATEMENT] = self.compatibility_statement - return(self._bill_data) \ No newline at end of file diff --git a/ausbills/nt_parliament.py b/ausbills/nt_parliament.py deleted file mode 100644 index a302070..0000000 --- a/ausbills/nt_parliament.py +++ /dev/null @@ -1,143 +0,0 @@ -from requests import get -from bs4 import BeautifulSoup - -URL = 'url' -SHORT_TITLE = 'short_title' -EXPLANITORY_STATEMENT = 'explanatory_statement' -TEXT_URL = 'text_url' -STATUS = 'status' -SPONSOR = 'sponsor' -SERIAL_NO = 'serial_no' -PARLIAMENT_NO = 'parliament_no' -REMARKS = 'remarks' -INTRO_DATE = 'intro_date' -DATE = 'date' - -nt_api_url = 'https://legislation.nt.gov.au/LegislationPortal/Bills/By-Title' -nt_base_url = 'https://legislation.nt.gov.au' - -class nt_All_Bills(object): - _bills_data = [] - - def __init__(self): - try: - self._scrape_data() - except Exception as e: - raise Exception('Could not create nt_all_bills, ' + e) - - def _scrape_data(self): - _bill_titles = [] - _bill_urls = [] - soup = BeautifulSoup(get(nt_api_url).text, 'lxml') - parent_div = soup.find('div', {'class': 'panel panel-default'}) - bills = 
parent_div.find_all('a') - for bill in bills: - _bill_urls.append(nt_base_url + '/' + bill['href'][:-9].replace('\n', '')) - if(bill.text[0] == ' '): - _bill_titles.append(bill.text[1:]) - else: - _bill_titles.append(bill.text) - for entry in range(len(_bill_titles)): - bill_dict = {URL: _bill_urls[entry], SHORT_TITLE: _bill_titles[entry]} - self._bills_data.append(bill_dict) - - @property - def data(self): - return(self._bills_data) - -nt_all_bills = nt_All_Bills().data - -class nt_Bill(object): - def __init__(self, input): - if(isinstance(input, dict)): - try: - self.create_vars(input) - except Exception as e: - raise ValueError('Dict must have valid keys, missing key: ' + e) - else: - raise ValueError('Input must be valid nt_Bill data.') - - def create_vars(self, init_data): - self._bill_data = init_data - self.url = init_data[URL] - self.short_title = init_data[SHORT_TITLE] - try: - self.bill_soup = BeautifulSoup(get(self.url).text, 'lxml') - except: - raise Exception('Unable to scrape ' + self.url) - - @property - def explanatory_statement(self): - try: - a = self.bill_soup.find(text='Explanatory Statement:').findNext('a')['href'] - return(a) - except: - return '' - - @property - def text_url(self): - try: - url = self.bill_soup.find_all('div', {'class': 'col-sm-6 text-center'})[1].find('a')['href'] - return(url) - except: - return '' - - @property - def status(self): - try: - fieldset = self.bill_soup.find('fieldset', {'class': 'roundedWhiteBorders'}) - stat = fieldset.find(text='Status:').parent.findNext('span').text.replace('\n', '') - return(stat) - except: - return '' - - @property - def sponsor(self): - return(self._get_span_text('Sponsor:')) - - @property - def serial_no(self): - return(self._get_span_text('Serial No:')) - - @property - def parliament_no(self): - return(self._get_span_text('Assembly:')) - - @property - def remarks(self): - return(self._get_span_text('Remarks:')) - - @property - def intro_date(self): - 
return(self._get_span_date('Introduced:')) - - @property - def date(self): - return(self._get_span_date('Date:')) - - def _get_span_text(self, input_text): - try: - span = self.bill_soup.find(text=input_text).findNext('span').text - return(span) - except: - return '' - - def _get_span_date(self, input_text): - try: - span = self.bill_soup.find(text=input_text).findNext('span').text - return(span.replace('/', '-')) - except: - return '' - - @property - def data(self): - self._bill_data[DATE] = self.date - self._bill_data[INTRO_DATE] = self.intro_date - self._bill_data[REMARKS] = self.remarks - self._bill_data[PARLIAMENT_NO] = self.parliament_no - self._bill_data[SERIAL_NO] = self.serial_no - self._bill_data[SPONSOR] = self.sponsor - self._bill_data[STATUS] = self.status - self._bill_data[TEXT_URL] = self.text_url - self._bill_data[EXPLANITORY_STATEMENT] = self.explanatory_statement - return(self._bill_data) \ No newline at end of file diff --git a/ausbills/qld_parliament.py b/ausbills/qld_parliament.py deleted file mode 100644 index c04af0c..0000000 --- a/ausbills/qld_parliament.py +++ /dev/null @@ -1,150 +0,0 @@ -import re -import json -from bs4 import BeautifulSoup -from datetime import datetime -from requests import get - -url_split_1 = 'https://www.legislation.qld.gov.au/projectdata?ds=OQPC-BrowseDataSource&start=1&count=9999&sortDirection=asc&expression=PrintType%3D(%22bill.first%22+OR+%22bill.firstnongovintro%22)+AND+Year%3D' -url_split_2 = '%3F&subset=browse&collection=&_=1603523834238' -current_year = datetime.today().year - -ID = 'id' -URL = 'url' -PRINT_TYPE = 'print_type' -PARLIAMENT_NO = 'parliament_no' -DATE = 'date' -TITLE = 'title' -BILL_NUMBER = 'bill_number' -SERIES_ID = 'series_id' # These 2 variables (series/desc)_id are used by the QLD bill API to deliver specific HTML fragments. -DESC_ID = 'desc_id' # Without these, we'd need some funky JavaScript interpretation, and you know I'm too lazy to write that. 
-EXPLANATORY_NOTE = 'explanatory_note' -RENDITIONS = 'renditions' -SPONSOR = 'sponsor' -LONG_TITLE = 'long_title' -BILL_TYPE = 'bill_type' - -class qld_All_Bills(object): - _bills_data = [] - - def __init__(self): - try: - self._create_dataset() - except: - raise Exception('Error when scraping lists...') - - def _create_dataset(self): - for year in range(current_year - (current_year - 1992), current_year + 1): - bill_list = json.loads(get(url_split_1 + str(year) + url_split_2).text) - for bill in bill_list['data']: - _id = bill['id']['__value__'] - _print_type = bill['print.type']['__value__'] - _url = 'https://www.legislation.qld.gov.au/view/html/' + _print_type + '/' + _id - _parliament_no = bill['parliament.no']['__value__'] - _date = bill['publication.date'][:-9] - _title = bill['title']['__value__'].replace('’', '\'').replace('\u2014', ' - ').replace('\u2013', ' - ') - _bill_number = bill['no']['__value__'] - _series_id = bill['version.series.id']['__value__'] - _desc_id = bill['version.desc.id']['__value__'] - _bill_dict = {ID: _id, PRINT_TYPE: _print_type, URL: _url, PARLIAMENT_NO: _parliament_no, DATE: _date, TITLE: _title, BILL_NUMBER: _bill_number, DESC_ID: _desc_id, SERIES_ID: _series_id} - self._bills_data.append(_bill_dict) - - @property - def data(self): - return(self._bills_data) - -qld_all_bills = qld_All_Bills().data - -class qld_Bill(object): - def __init__(self, input): - if(isinstance(input, dict)): - try: - self.create_vars(input) - except Exception as e: - raise Exception('Dict must have the correct keys. 
Missing key ' - + str(e)) - else: - raise TypeError('Input must be a valid QLD bill...') - - def create_vars(self, init_data): - self._bill_data = init_data - self.url = init_data[URL] - self.id = init_data[ID] - self.short_title = init_data[TITLE] - self.date = init_data[DATE] - self.print_type = init_data[PRINT_TYPE] - self.parliament_no = init_data[PARLIAMENT_NO] - self.bill_number = init_data[BILL_NUMBER] - self.series_id = init_data[SERIES_ID] - self.desc_id = init_data[DESC_ID] - try: - json_url = 'https://www.legislation.qld.gov.au/projectdata?ds=OQPC-TocDataSource&expression=view%2Fhtml%2Fbill.first%2F' + self.id + '&subset=search' - self.bill_json = json.loads(get(json_url).text) - except: - raise Exception('Unable to scrape, ' + self.url) - try: - history_url = 'https://www.legislation.qld.gov.au/view/html/' + self.print_type + '/' + self.id + '/lh' - self.bill_history_soup = BeautifulSoup(get(history_url).text, 'lxml') - except: - raise Exception('Unable to scrape bill history, ' + self.url) - - @property - def renditions(self): - try: - renditions = [] - rendition_info = self.bill_json['version.info'] - rendition_info.pop(0) - for version in rendition_info: - rendition_dict = {ID: version['id']['__value__'], PRINT_TYPE: version['print.type']['__value__'], DATE: version['publication.date']} - renditions.append(rendition_dict) - return(renditions) - except: - return [] - - @property - def sponsor(self): - try: - bill_sponsor = self.bill_json['member.id']['__value__'] - return(bill_sponsor) - except: - return '' - - @property - def long_title(self): - try: - html_data = BeautifulSoup(json.loads(get('https://www.legislation.qld.gov.au/projectdata?ds=OQPC-FragViewDataSource&expression=VersionDescId%3D%22' + self.desc_id + '%22+AND+VersionSeriesId%3D%22' + self.series_id + '%22+AND+PrintType%3D%22bill.first%22+AND+Id_p%3D%22frnt-lt%22%7C%7Cas.made&collection=OQPC.fragment&subset=search').text)['frag.html'], 'lxml') - title_val = html_data.text.replace('\n', 
'').replace('\t', '') - return(title_val) - except: - return '' - - @property - def bill_type(self): - try: - div = self.bill_history_soup.find('div', {'id': 'parsewrapper'}) - table = div.find('table', {'class': 'table table-striped'}) - _type = table.find('tr').text.replace('\n', '') - return(_type) - except: - return '' - - @property - def explanatory_note(self): - try: - div = self.bill_history_soup.find('div', {'id': 'parsewrapper'}) - table = div.find('table', {'class': 'table table-striped'}) - td = table.find_all('tr')[1].find_all('td')[1] - for paragraph in td.find_all('a'): - if 'Explanatory Note' in paragraph.text.replace('\n', ' '): - return('https://www.legislation.qld.gov.au' + paragraph['href']) - return '' - except: - return '' - - @property - def data(self): - self._bill_data[EXPLANATORY_NOTE] = self.explanatory_note - self._bill_data[SPONSOR] = self.sponsor - self._bill_data[RENDITIONS] = self.renditions - self._bill_data[LONG_TITLE] = self.renditions - self._bill_data[BILL_TYPE] = self.bill_type - return(self._bill_data) \ No newline at end of file diff --git a/ausbills/sa_parliament.py b/ausbills/sa_parliament.py deleted file mode 100644 index c22d53d..0000000 --- a/ausbills/sa_parliament.py +++ /dev/null @@ -1,93 +0,0 @@ -from requests import get -from bs4 import BeautifulSoup - -bill_list_urls = ['https://legislation.sa.gov.au/listBills.aspx?key=', 'https://legislation.sa.gov.au/listAZBills.aspx?key='] -sa_base_url = 'https://legislation.sa.gov.au/' - -URL = 'url' -SHORT_TITLE = 'short_title' -SPONSOR = 'sponsor' -TEXTS = 'texts' - -class sa_All_Bills(object): - _bills_data = [] - - def __init__(self): - try: - self.create_dataset() - except: - raise Exception('An error ocurred when trying to scrape bills...') - - def create_dataset(self): - _bill_titles = [] - _bill_urls = [] - for list_url in bill_list_urls: - table = BeautifulSoup(get(list_url).text, 'lxml').find('table', {'summary': 'A List of the various versions of this Bills beginning 
with this letter'}).find('tbody') - for row in table.find_all('tr'): - _bill_urls.append(sa_base_url + row.find('a')['href'].replace(' ', '%20')) - _bill_titles.append(row.find('a').text.replace('\n', '').replace('\r', ' ').replace('\xa0', ' ').replace(' ', ' ')) - for bill in range(len(_bill_titles)): - if('—introduced by' in _bill_titles[bill]): - title_split = _bill_titles[bill].split('—introduced by') - bill_dict = {URL: _bill_urls[bill], SHORT_TITLE: title_split[0], SPONSOR: title_split[1]} - self._bills_data.append(bill_dict) - else: - bill_dict = {URL: _bill_urls[bill], SHORT_TITLE: _bill_titles[bill], SPONSOR: ''} - self._bills_data.append(bill_dict) - - @property - def data(self): - return(self._bills_data) - -sa_all_bills = sa_All_Bills().data - -class sa_Bill(object): - def __init__(self, input): - if(isinstance(input, dict)): - try: - self.create_vars(input) - except Exception as e: - raise ValueError('Dict must have correct keys, missing key ' + e) - else: - raise ValueError('Input must be valid sa_Bill dict data...') - - def create_vars(self, init_data): - self._bill_data = init_data - self.url = init_data[URL] - self.short_title = init_data[SHORT_TITLE] - try: - self.bill_soup = BeautifulSoup(get(self.url).text, 'lxml') - except: - raise Exception('Unable to scrape ' + self.url) - - @property - def sponsor(self): - if(self._bill_data[SPONSOR] == ''): - try: - text = self.bill_soup.find('div', {'class': 'ItemIntroducedBy'}).find('p').text - return(text) - except: - return '' - else: - return(self._bill_data[SPONSOR][1:]) - - @property - def texts(self): - try: - data_list = [] - table_body = self.bill_soup.find('table', {'summary': 'A List of the various stages of this Bill'}).find('tbody') - links = table_body.find_all('a', {'title': 'View document in PDF in new window'}) - for link in links: - data_url = sa_base_url + link['href'].replace(' ', '%20') - data_text = link.parent.findPrevious('td').text - data_dict = {data_text: data_url} - 
data_list.append(data_dict) - return(data_list) - except: - return [] - - @property - def data(self): - self._bill_data[TEXTS] = self.texts - self._bill_data[SPONSOR] = self.sponsor - return(self._bill_data) \ No newline at end of file diff --git a/ausbills/tas_parliament.py b/ausbills/tas_parliament.py deleted file mode 100644 index 8bf0c48..0000000 --- a/ausbills/tas_parliament.py +++ /dev/null @@ -1,205 +0,0 @@ -from datetime import datetime -from requests import get -from bs4 import BeautifulSoup - -current_year = datetime.today().year -url_split = ['https://www.parliament.tas.gov.au/bills/Bills', '/BillsWeb', '.htm'] - -URL = 'url' -TITLE = 'title' -YEAR = 'year' -PASSED_LOWER = 'passed_lower' -PASSED_UPPER = 'passed_upper' -SPONSOR = 'sponsor' -BILL_TEXT_URL = 'bill_text_url' -WAS_AMENDED_UPPER = 'was_amended_upper' -WAS_AMENDED_LOWER = 'was_amended_lower' -ASSENTED = 'assented' -ACT_NO = 'act_no' -LOWER_FIRST_READING = 'lower_first_reading' -LOWER_SECOND_READING = 'lower_second_reading' -LOWER_THIRD_READING = 'lower_third_reading' -UPPER_FIRST_READING = 'upper_first_reading' -UPPER_SECOND_READING = 'upper_second_reading' -UPPER_THIRD_READING = 'upper_third_reading' - -class tas_All_Bills(object): - _bills_data = [] - - def __init__(self): - try: - self.create_dataset() - except: - raise Exception('Error when scraping bills...') - - def create_dataset(self): - for year in range(current_year - (current_year - 2002), current_year + 1): - soup = BeautifulSoup(get(url_split[0] + str(year) + url_split[1] + str(year) + url_split[2]).text, 'lxml') - table = soup.find('table', {'bordercolor': '#CCCCCC'}) - bills = table.find_all('a') - _bill_urls = [] - _bill_titles = [] - for bill in bills: - _bill_titles.append(bill.text.strip()) - _bill_urls.append(url_split[0] + str(year) + '/' + bill['href']) - for bill in range(len(_bill_titles)): - bill_dict = {URL: _bill_urls[bill], TITLE: _bill_titles[bill], YEAR: str(year)} - self._bills_data.append(bill_dict) - - @property 
- def data(self): - return(self._bills_data) - -tas_all_bills = tas_All_Bills().data - -class tas_Bill(object): - def __init__(self, input): - if(isinstance(input, dict)): - try: - self.create_vars(input) - except Exception as e: - raise Exception('Dict must have correct keys, missing key ' + e) - else: - raise ValueError('Input data must be valid tas_Bill dict data...') - - def create_vars(self, init_data): - self._bill_data = init_data - self.url = init_data[URL] - self.title = init_data[TITLE] - try: - self.bill_soup = BeautifulSoup(get(self.url).text, 'lxml') - table = self.bill_soup.find('table', {'bordercolor': '#CCCCCC'}) - self._rows = table.find_all('tr') - except: - raise Exception('Unable to scrape ' + self.url) - self.get_first_readings() - self.get_second_readings() - self.get_third_readings() - - @property - def sponsor(self): - rows = self._rows - try: - return(rows[0].find('td', {'colspan': '4'}).text.replace(' Introduced by: ', '').strip()) - except: - return '' - - @property - def bill_text_url(self): - try: - return(url_split[0] + self._bill_data[YEAR] + '/' + self._rows[1].find('a')['href']) - except: - return '' - - @property - def was_amended_upper(self): - return(self.amended_check()[1]) - - @property - def was_amended_lower(self): - return(self.amended_check()[0]) - - @property - def passed_lower(self): - column = self._rows[12:][0].find('td') - try: - return(self.format_date(column.find('br').previousSibling.strip().replace('HA Agreed: ', ''))) - except: - return False - - @property - def passed_upper(self): - column = self._rows[12:][0].find('td') - try: - return(self.format_date(column.find('br').nextSibling.strip().replace('Agreed Both: ', ''))) - except: - return False - - @property - def assented(self): - column = self._rows[12:][0].find('td') - try: - return(self.format_date(column.find_all('br', limit=2)[-1].nextSibling.strip().replace('Royal Assent: ', ''))) - except: - return False - - @property - def act_no(self): - column = 
self._rows[12:][0].find('td') - try: - return(column.find_all('br', limit=4)[-1].nextSibling.replace('Act Number:', '').strip()) - except: - return '' - - def amended_check(self): - columns = self._rows[9:][0].find_all('td') - lower = columns[1].text.strip() - upper = columns[3].text.strip() - if(lower == 'Yes'): - lower = True - else: - lower = False - - if(upper == 'Yes'): - upper = True - else: - upper = False - return[lower, upper] - - def get_first_readings(self): - columns = self._rows[3:][0].find_all('td') - try: - self.lower_first_reading = self.format_date(columns[1].text.strip()) - except: - self.lower_first_reading = '' - - try: - self.upper_first_reading = self.format_date(columns[3].text.strip()) - except: - self.upper_first_reading = '' - - def get_second_readings(self): - columns = self._rows[5:][0].find_all('td') - try: - self.lower_second_reading = self.format_date(columns[1].text.strip()) - except: - self.lower_second_reading = '' - - try: - self.upper_second_reading = self.format_date(columns[3].text.strip()) - except: - self.upper_second_reading = '' - - def get_third_readings(self): - columns = self._rows[10:][0].find_all('td') - try: - self.lower_third_reading = self.format_date(columns[1].text.strip()) - except: - self.lower_third_reading = '' - - try: - self.upper_third_reading = self.format_date(columns[3].text.strip()) - except: - self.upper_third_reading = '' - - def format_date(self, input_date): - dateSplit = input_date.split('/', 2) - return("{:0>2s}".format(dateSplit[0]) + '-' "{:0>2s}".format(dateSplit[1]) + '-' + dateSplit[2]) - - @property - def data(self): - self._bill_data[PASSED_LOWER] = self.passed_lower - self._bill_data[PASSED_UPPER] = self.passed_upper - self._bill_data[LOWER_FIRST_READING] = self.lower_first_reading - self._bill_data[LOWER_SECOND_READING] = self.lower_second_reading - self._bill_data[LOWER_THIRD_READING] = self.lower_third_reading - self._bill_data[UPPER_FIRST_READING] = self.upper_first_reading - 
self._bill_data[UPPER_SECOND_READING] = self.upper_second_reading - self._bill_data[UPPER_THIRD_READING] = self.upper_third_reading - self._bill_data[SPONSOR] = self.sponsor - self._bill_data[BILL_TEXT_URL] = self.bill_text_url - self._bill_data[ASSENTED] = self.assented - self._bill_data[WAS_AMENDED_LOWER] = self.was_amended_lower - self._bill_data[WAS_AMENDED_UPPER] = self.was_amended_upper - self._bill_data[ACT_NO] = self.act_no - return(self._bill_data) \ No newline at end of file From 7afbb1fcefb9e9f4b601962d73c1190b41da41d2 Mon Sep 17 00:00:00 2001 From: king-millez Date: Wed, 28 Oct 2020 17:36:53 +1100 Subject: [PATCH 6/6] Restore old versions of other parliaments --- ausbills/act_parliament.py | 291 +++++++++++++++++++++++++++++++++++++ ausbills/nt_parliament.py | 145 ++++++++++++++++++ ausbills/qld_parliament.py | 152 +++++++++++++++++++ ausbills/sa_parliament.py | 95 ++++++++++++ ausbills/tas_parliament.py | 207 ++++++++++++++++++++++++++ 5 files changed, 890 insertions(+) create mode 100644 ausbills/act_parliament.py create mode 100644 ausbills/nt_parliament.py create mode 100644 ausbills/qld_parliament.py create mode 100644 ausbills/sa_parliament.py create mode 100644 ausbills/tas_parliament.py diff --git a/ausbills/act_parliament.py b/ausbills/act_parliament.py new file mode 100644 index 0000000..1e49310 --- /dev/null +++ b/ausbills/act_parliament.py @@ -0,0 +1,291 @@ +from bs4 import BeautifulSoup +import requests +import datetime +import calendar +import re + +DATE = 'date' +URL = 'url' +TITLE = 'title' +DESCRIPTION = 'description' +PRESENTED_BY = 'presented_by' +TYPE = 'type' +STATUS = 'status' +TEXT_URL = 'text_url' +SCRUTINY_REPORT = 'scrutiny_report' +PRESENTATION_SPEECH = 'presentation_speech' +HANSARD = 'hansard' +EXPLANATORY_STATEMENT = 'explanatory_statement' +COMPATIBILITY_STATEMENT = 'compatibility_statement' + +ninth_assembly_bills = "https://www.parliament.act.gov.au/parliamentary-business/in-the-chamber/bills/summary_of_bills" 
+eighth_assembly_bills = "https://www.parliament.act.gov.au/parliamentary-business/in-the-chamber/previous-assemblies/eighth-assembly/summary_of_bills" +ninth_siteData = requests.get(ninth_assembly_bills).text +eighth_siteData = requests.get(eighth_assembly_bills).text + +class All_Bills(object): + _bills_data = [] # This list will end up containing all the bill dict entries, and is the data returned. + + def __init__(self): + self._build_dataset() + + def _build_dataset(self): + try: + self._scrape_9th_assembly() + except Exception as e: + print('An exception ocurred when trying to scrape the 9th Assembly:\n') + print(e) + + try: + self._scrape_8th_assembly() + except Exception as e: + print('An exception ocurred when trying to scrape the 8th Assembly:\n') + print(e) + + def _scrape_9th_assembly(self): + billPres = [] + billDescs = [] + + soup = BeautifulSoup(ninth_siteData, 'html.parser') + div = soup.find("div", {"id": "main"}) + billTitles = div.find_all('h4') + for h4 in div.find_all('h4'): + h4.replace_with('') # Remove all

s from the soup, this makes it less annoying to get the bill presenter string from tags. The ACT Government, man, it's weird. + + billData = div.find_all(re.compile(r'(div|p)')) + allStrong = div.find_all('strong') + for strong in range(len(allStrong)): + if('This bill' in allStrong[strong].text or 'e bill will also' in allStrong[strong].text): + pass + else: + billPres.append(allStrong[strong]) + for entry in billData: + if "This bill" in entry.text or "this bill" in entry.text: + billDescs.append(entry) + + for title in range(len(billTitles)): # Here we loop through every bill and compile its information into an entry in _bills_data + _bill_title = billTitles[title].text + a = billTitles[title].find('a') + if(a == None): + if('Aboriginal and Torres Strait Islander Elected Body Amendment Bill 2020' in billTitles[title].text): + _bill_url = 'https://www.legislation.act.gov.au/a/2020-36/' + else: + _bill_url = '' + else: + _bill_url = a['href'] + _bill_description = billDescs[title].text + _bill_presented_by = self._format_presenter_9th(billPres[title].text)[0][13:] + _bill_date = self._format_presenter_9th(billPres[title].text)[1] + + bill_dict = {URL: _bill_url, TITLE: _bill_title, DESCRIPTION: _bill_description.replace("\xa0\xa0", " ").replace('‑', '-'), PRESENTED_BY: _bill_presented_by, DATE: _bill_date} + + self._bills_data.append(bill_dict) + + def _format_presenter_9th(self, title): + formatted = [] + splitUp = title.replace('\xa0', ' ').split('—', 1) + formatted.append(splitUp[0]) + dateSplit = splitUp[1].split(' ', 2) + monthNum = list(calendar.month_name).index(dateSplit[1]) + finalDate = dateSplit[2] + '-' + "{0:0=2d}".format(monthNum) + '-' + "{0:0=2d}".format(int(dateSplit[0])) + formatted.append(finalDate) + return(formatted) + + def _scrape_8th_assembly(self): + billDescs = [] + billTitles = [] + billScrutinyReports = [] + + soup = BeautifulSoup(eighth_siteData, 'html.parser') + div = soup.find('div', {'id': 'main'}) + paras = div.find_all('p')[8:] + 
for p in range(len(paras)): + if "" in str(paras[p]): + billTitles.append(paras[p]) + + elif "y Report " in str(paras[p]) or "Statement" in str(paras[p]): + reports = paras[p].find_all('a', {'href': True}) + urls = [] + for report in range(len(reports)): + urls.append(reports[report]['href']) + billScrutinyReports.append(urls) + + else: + billDescs.append(paras[p].text.replace('‑', '-')) + + for bill in range(len(billTitles)): + _bill_title = self._format_presenter_8th(billTitles[bill].text)[0].replace('‑', '-') + _bill_presented_by = self._format_presenter_8th(billTitles[bill].text)[1].replace('‑', '-') + _bill_date = self._format_presenter_8th(billTitles[bill].text)[2].replace('‑', '-') + _bill_url = billTitles[bill].find('a')['href'] + _bill_description = billDescs[bill].replace('‑', '-') + bill_dict = {TITLE: _bill_title, URL: _bill_url, DESCRIPTION: _bill_description, DATE: _bill_date, PRESENTED_BY: _bill_presented_by} + self._bills_data.append(bill_dict) + + def _format_presenter_8th(self, title): + count = 0 + formatted = [] + + for char in title: + if char == '—': + count = count + 1 + + if count > 2: + title = title.replace('—', ' - ', 1) # The bill https://www.legislation.act.gov.au/b/db_47854/default.asp contains an extra '—', this hacks around it. + + splitUp = title.replace('\xa0', ' ').split('—', 2) + formatted.extend([splitUp[0], splitUp[1]]) + if(splitUp[2][0] == ' '): # The Gaming Machine Amendment Bill 2013 (No. 2) contains a space before the "6" in its date (of course), so we need to do this, otherwise funky things happen. 
+ edit = splitUp[2][1:] + splitUp.remove(splitUp[2]) + splitUp.append(edit) + dateSplit = splitUp[2].split(' ', 2) + monthNum = list(calendar.month_name).index(dateSplit[1]) + finalDate = dateSplit[2] + '-' + "{0:0=2d}".format(monthNum) + '-' + "{0:0=2d}".format(int(dateSplit[0])) + formatted.append(finalDate) + return(formatted) + + @property + def data(self): + return(self._bills_data) + +act_all_bills = All_Bills().data + +class act_Bill(object): + _all_bills = act_all_bills + + def __init__(self, input): + if(isinstance(input, dict)): + try: + self.create_vars(input) + except Exception as e: + raise Exception('Dict must have the correct keys. Missing key ' + + str(e)) + else: + raise TypeError('Input must be valid dict data.') + + + def create_vars(self, init_data): + self._bill_data = init_data + self.url = init_data[URL] + self.date = init_data[DATE] + self.title = init_data[TITLE] + self.description = init_data[DESCRIPTION] + self.presented_by = init_data[PRESENTED_BY] + try: + self.bill_soup = BeautifulSoup(requests.get(self.url).text, 'lxml') + except: + if(self.url == 'file:///%5C%5Cact.gov.au%5Cassembly%5Clasec%5CChamber%5CLA%20Secretariat%20%231%5CNOTICEPAPER%5CBills%5CSummary%20of%20Bills%5CEighth%20Assembly%5CThis%20bill%20will%20establish%20the%20legislative%20framework%20for%20the%20operation%20of%20a%20secure%20mental%20health%20facility%20in%20the%20ACT'): + self.url = 'https://www.legislation.act.gov.au/a/2011-35/' + self.bill_soup = BeautifulSoup(requests.get(self.url).text, 'lxml') + else: + raise Exception('Invalid bill URL, unable to scrape. 
' + self.url) + + @property + def bill_type(self): + return(self.get_bill_type()) + + def get_bill_type(self): + basic_data = self.bill_soup.find('dl') + try: + _billtype = basic_data.find_all('dd') + except: + return '' + return(_billtype[0].text) + + @property + def status(self): + return(self.get_bill_status()) + + def get_bill_status(self): + basic_data = self.bill_soup.find('dl') + _billtype = basic_data.find_all('dd') + return(_billtype[2].text) + + @property + def bill_text_url(self): + return(self.get_bill_text()) + + def get_bill_text(self): + try: + a = self.bill_soup.find('a', {'class', 'button viewable pdf'}) + return('https://www.legislation.act.gov.au' + a['href']) + except: + return '' + + @property + def scrutiny_report(self): + return(self.get_scrutiny_report()) + + def get_scrutiny_report(self): + table = self.bill_soup.find('table', {'class': 'datatable display'}) + td = table.find('td', {'class': 'notes'}) + for a in td.find_all('a'): + if(not ' Scrutiny Committee' in a.text and 'Scrutiny Committee' in a.text): + scrutiny_url = a['href'] + try: + return(scrutiny_url) + except: + return('') + + @property + def presentation_speech(self): + table = self.bill_soup.find('table', {'class': 'datatable display'}) + try: + td = table.find('td', {'class': 'notes'}) + for a in td.find_all('a'): + if('Presentation speech' in a.text): + speech_url = a['href'] + try: + return(speech_url) + except: + return('') + except: + return '' + + @property + def hansard(self): + table = self.bill_soup.find('table', {'class': 'datatable display'}) + td = table.find('td', {'class': 'notes'}) + for a in td.find_all('a'): + if('Hansard debate' in a.text): + hansard_url = a['href'] + try: + return(hansard_url) + except: + return('') + + @property + def explanatory_statement(self): + table = self.bill_soup.find_all('table', {'class': 'datatable display'})[1] + a = table.find('a') + try: + return('https://www.legislation.act.gov.au' + a['href']) + except: + return('') + + 
@property + def compatibility_statement(self): + table = self.bill_soup.find_all('table', {'class': 'datatable display'})[2] + a = table.find('a') + try: + return('https://www.legislation.act.gov.au' + a['href']) + except: + return('') + + @property + def data(self): + self._bill_data[URL] = self.url + self._bill_data[TITLE] = self.title + self._bill_data[DATE] = self.date + self._bill_data[DESCRIPTION] = self.description + self._bill_data[PRESENTED_BY] = self.presented_by + self._bill_data[TYPE] = self.bill_type + self._bill_data[STATUS] = self.status + self._bill_data[TEXT_URL] = self.bill_text_url + self._bill_data[SCRUTINY_REPORT] = self.scrutiny_report + self._bill_data[PRESENTATION_SPEECH] = self.presentation_speech + self._bill_data[HANSARD] = self.hansard + self._bill_data[EXPLANATORY_STATEMENT] = self.explanatory_statement + self._bill_data[COMPATIBILITY_STATEMENT] = self.compatibility_statement + return(self._bill_data) \ No newline at end of file diff --git a/ausbills/nt_parliament.py b/ausbills/nt_parliament.py new file mode 100644 index 0000000..4a894c9 --- /dev/null +++ b/ausbills/nt_parliament.py @@ -0,0 +1,145 @@ +from requests import get +from bs4 import BeautifulSoup + +URL = 'url' +SHORT_TITLE = 'short_title' +EXPLANITORY_STATEMENT = 'explanatory_statement' +TEXT_URL = 'text_url' +STATUS = 'status' +SPONSOR = 'sponsor' +SERIAL_NO = 'serial_no' +PARLIAMENT_NO = 'parliament_no' +REMARKS = 'remarks' +INTRO_DATE = 'intro_date' +DATE = 'date' + +nt_api_url = 'https://legislation.nt.gov.au/LegislationPortal/Bills/By-Title' +nt_base_url = 'https://legislation.nt.gov.au' + +class nt_All_Bills(object): + _bills_data = [] + + def __init__(self): + try: + self._scrape_data() + except Exception as e: + raise Exception('Could not create nt_all_bills, ' + e) + + def _scrape_data(self): + _bill_titles = [] + _bill_urls = [] + soup = BeautifulSoup(get(nt_api_url).text, 'lxml') + parent_div = soup.find('div', {'class': 'panel panel-default'}) + bills = 
parent_div.find_all('a') + for bill in bills: + _bill_urls.append(nt_base_url + '/' + bill['href'][:-9].replace('\n', '')) + if(bill.text[0] == ' '): + _bill_titles.append(bill.text[1:]) + else: + _bill_titles.append(bill.text) + for entry in range(len(_bill_titles)): + bill_dict = {URL: _bill_urls[entry], SHORT_TITLE: _bill_titles[entry]} + self._bills_data.append(bill_dict) + + @property + def data(self): + return(self._bills_data) + +nt_all_bills = nt_All_Bills().data + +class nt_Bill(object): + _nt_all_bills = nt_all_bills + + def __init__(self, input): + if(isinstance(input, dict)): + try: + self.create_vars(input) + except Exception as e: + raise ValueError('Dict must have valid keys, missing key: ' + e) + else: + raise ValueError('Input must be valid nt_Bill data.') + + def create_vars(self, init_data): + self._bill_data = init_data + self.url = init_data[URL] + self.short_title = init_data[SHORT_TITLE] + try: + self.bill_soup = BeautifulSoup(get(self.url).text, 'lxml') + except: + raise Exception('Unable to scrape ' + self.url) + + @property + def explanatory_statement(self): + try: + a = self.bill_soup.find(text='Explanatory Statement:').findNext('a')['href'] + return(a) + except: + return '' + + @property + def text_url(self): + try: + url = self.bill_soup.find_all('div', {'class': 'col-sm-6 text-center'})[1].find('a')['href'] + return(url) + except: + return '' + + @property + def status(self): + try: + fieldset = self.bill_soup.find('fieldset', {'class': 'roundedWhiteBorders'}) + stat = fieldset.find(text='Status:').parent.findNext('span').text.replace('\n', '') + return(stat) + except: + return '' + + @property + def sponsor(self): + return(self._get_span_text('Sponsor:')) + + @property + def serial_no(self): + return(self._get_span_text('Serial No:')) + + @property + def parliament_no(self): + return(self._get_span_text('Assembly:')) + + @property + def remarks(self): + return(self._get_span_text('Remarks:')) + + @property + def intro_date(self): + 
return(self._get_span_date('Introduced:')) + + @property + def date(self): + return(self._get_span_date('Date:')) + + def _get_span_text(self, input_text): + try: + span = self.bill_soup.find(text=input_text).findNext('span').text + return(span) + except: + return '' + + def _get_span_date(self, input_text): + try: + span = self.bill_soup.find(text=input_text).findNext('span').text + return(span.replace('/', '-')) + except: + return '' + + @property + def data(self): + self._bill_data[DATE] = self.date + self._bill_data[INTRO_DATE] = self.intro_date + self._bill_data[REMARKS] = self.remarks + self._bill_data[PARLIAMENT_NO] = self.parliament_no + self._bill_data[SERIAL_NO] = self.serial_no + self._bill_data[SPONSOR] = self.sponsor + self._bill_data[STATUS] = self.status + self._bill_data[TEXT_URL] = self.text_url + self._bill_data[EXPLANITORY_STATEMENT] = self.explanatory_statement + return(self._bill_data) \ No newline at end of file diff --git a/ausbills/qld_parliament.py b/ausbills/qld_parliament.py new file mode 100644 index 0000000..1b44783 --- /dev/null +++ b/ausbills/qld_parliament.py @@ -0,0 +1,152 @@ +import re +import json +from bs4 import BeautifulSoup +from datetime import datetime +from requests import get + +url_split_1 = 'https://www.legislation.qld.gov.au/projectdata?ds=OQPC-BrowseDataSource&start=1&count=9999&sortDirection=asc&expression=PrintType%3D(%22bill.first%22+OR+%22bill.firstnongovintro%22)+AND+Year%3D' +url_split_2 = '%3F&subset=browse&collection=&_=1603523834238' +current_year = datetime.today().year + +ID = 'id' +URL = 'url' +PRINT_TYPE = 'print_type' +PARLIAMENT_NO = 'parliament_no' +DATE = 'date' +TITLE = 'title' +BILL_NUMBER = 'bill_number' +SERIES_ID = 'series_id' # These 2 variables (series/desc)_id are used by the QLD bill API to deliver specific HTML fragments. +DESC_ID = 'desc_id' # Without these, we'd need some funky JavaScript interpretation, and you know I'm too lazy to write that. 
+EXPLANATORY_NOTE = 'explanatory_note' +RENDITIONS = 'renditions' +SPONSOR = 'sponsor' +LONG_TITLE = 'long_title' +BILL_TYPE = 'bill_type' + +class qld_All_Bills(object): + _bills_data = [] + + def __init__(self): + try: + self._create_dataset() + except: + raise Exception('Error when scraping lists...') + + def _create_dataset(self): + for year in range(current_year - (current_year - 1992), current_year + 1): + bill_list = json.loads(get(url_split_1 + str(year) + url_split_2).text) + for bill in bill_list['data']: + _id = bill['id']['__value__'] + _print_type = bill['print.type']['__value__'] + _url = 'https://www.legislation.qld.gov.au/view/html/' + _print_type + '/' + _id + _parliament_no = bill['parliament.no']['__value__'] + _date = bill['publication.date'][:-9] + _title = bill['title']['__value__'].replace('’', '\'').replace('\u2014', ' - ').replace('\u2013', ' - ') + _bill_number = bill['no']['__value__'] + _series_id = bill['version.series.id']['__value__'] + _desc_id = bill['version.desc.id']['__value__'] + _bill_dict = {ID: _id, PRINT_TYPE: _print_type, URL: _url, PARLIAMENT_NO: _parliament_no, DATE: _date, TITLE: _title, BILL_NUMBER: _bill_number, DESC_ID: _desc_id, SERIES_ID: _series_id} + self._bills_data.append(_bill_dict) + + @property + def data(self): + return(self._bills_data) + +qld_all_bills = qld_All_Bills().data + +class qld_Bill(object): + _all_bills = qld_all_bills + + def __init__(self, input): + if(isinstance(input, dict)): + try: + self.create_vars(input) + except Exception as e: + raise Exception('Dict must have the correct keys. 
Missing key ' + + str(e)) + else: + raise TypeError('Input must be a valid QLD bill...') + + def create_vars(self, init_data): + self._bill_data = init_data + self.url = init_data[URL] + self.id = init_data[ID] + self.short_title = init_data[TITLE] + self.date = init_data[DATE] + self.print_type = init_data[PRINT_TYPE] + self.parliament_no = init_data[PARLIAMENT_NO] + self.bill_number = init_data[BILL_NUMBER] + self.series_id = init_data[SERIES_ID] + self.desc_id = init_data[DESC_ID] + try: + json_url = 'https://www.legislation.qld.gov.au/projectdata?ds=OQPC-TocDataSource&expression=view%2Fhtml%2Fbill.first%2F' + self.id + '&subset=search' + self.bill_json = json.loads(get(json_url).text) + except: + raise Exception('Unable to scrape, ' + self.url) + try: + history_url = 'https://www.legislation.qld.gov.au/view/html/' + self.print_type + '/' + self.id + '/lh' + self.bill_history_soup = BeautifulSoup(get(history_url).text, 'lxml') + except: + raise Exception('Unable to scrape bill history, ' + self.url) + + @property + def renditions(self): + try: + renditions = [] + rendition_info = self.bill_json['version.info'] + rendition_info.pop(0) + for version in rendition_info: + rendition_dict = {ID: version['id']['__value__'], PRINT_TYPE: version['print.type']['__value__'], DATE: version['publication.date']} + renditions.append(rendition_dict) + return(renditions) + except: + return [] + + @property + def sponsor(self): + try: + bill_sponsor = self.bill_json['member.id']['__value__'] + return(bill_sponsor) + except: + return '' + + @property + def long_title(self): + try: + html_data = BeautifulSoup(json.loads(get('https://www.legislation.qld.gov.au/projectdata?ds=OQPC-FragViewDataSource&expression=VersionDescId%3D%22' + self.desc_id + '%22+AND+VersionSeriesId%3D%22' + self.series_id + '%22+AND+PrintType%3D%22bill.first%22+AND+Id_p%3D%22frnt-lt%22%7C%7Cas.made&collection=OQPC.fragment&subset=search').text)['frag.html'], 'lxml') + title_val = html_data.text.replace('\n', 
'').replace('\t', '') + return(title_val) + except: + return '' + + @property + def bill_type(self): + try: + div = self.bill_history_soup.find('div', {'id': 'parsewrapper'}) + table = div.find('table', {'class': 'table table-striped'}) + _type = table.find('tr').text.replace('\n', '') + return(_type) + except: + return '' + + @property + def explanatory_note(self): + try: + div = self.bill_history_soup.find('div', {'id': 'parsewrapper'}) + table = div.find('table', {'class': 'table table-striped'}) + td = table.find_all('tr')[1].find_all('td')[1] + for paragraph in td.find_all('a'): + if 'Explanatory Note' in paragraph.text.replace('\n', ' '): + return('https://www.legislation.qld.gov.au' + paragraph['href']) + return '' + except: + return '' + + @property + def data(self): + self._bill_data[EXPLANATORY_NOTE] = self.explanatory_note + self._bill_data[SPONSOR] = self.sponsor + self._bill_data[RENDITIONS] = self.renditions + self._bill_data[LONG_TITLE] = self.long_title + self._bill_data[BILL_TYPE] = self.bill_type + return(self._bill_data) \ No newline at end of file diff --git a/ausbills/sa_parliament.py b/ausbills/sa_parliament.py new file mode 100644 index 0000000..e339a93 --- /dev/null +++ b/ausbills/sa_parliament.py @@ -0,0 +1,95 @@ +from requests import get +from bs4 import BeautifulSoup + +bill_list_urls = ['https://legislation.sa.gov.au/listBills.aspx?key=', 'https://legislation.sa.gov.au/listAZBills.aspx?key='] +sa_base_url = 'https://legislation.sa.gov.au/' + +URL = 'url' +SHORT_TITLE = 'short_title' +SPONSOR = 'sponsor' +TEXTS = 'texts' + +class sa_All_Bills(object): + _bills_data = [] + + def __init__(self): + try: + self.create_dataset() + except: + raise Exception('An error ocurred when trying to scrape bills...') + + def create_dataset(self): + _bill_titles = [] + _bill_urls = [] + for list_url in bill_list_urls: + table = BeautifulSoup(get(list_url).text, 'lxml').find('table', {'summary': 'A List of the various versions of this Bills beginning with 
this letter'}).find('tbody') + for row in table.find_all('tr'): + _bill_urls.append(sa_base_url + row.find('a')['href'].replace(' ', '%20')) + _bill_titles.append(row.find('a').text.replace('\n', '').replace('\r', ' ').replace('\xa0', ' ').replace(' ', ' ')) + for bill in range(len(_bill_titles)): + if('—introduced by' in _bill_titles[bill]): + title_split = _bill_titles[bill].split('—introduced by') + bill_dict = {URL: _bill_urls[bill], SHORT_TITLE: title_split[0], SPONSOR: title_split[1]} + self._bills_data.append(bill_dict) + else: + bill_dict = {URL: _bill_urls[bill], SHORT_TITLE: _bill_titles[bill], SPONSOR: ''} + self._bills_data.append(bill_dict) + + @property + def data(self): + return(self._bills_data) + +sa_all_bills = sa_All_Bills().data + +class sa_Bill(object): + _all_bills = sa_all_bills + + def __init__(self, input): + if(isinstance(input, dict)): + try: + self.create_vars(input) + except Exception as e: + raise ValueError('Dict must have correct keys, missing key ' + e) + else: + raise ValueError('Input must be valid sa_Bill dict data...') + + def create_vars(self, init_data): + self._bill_data = init_data + self.url = init_data[URL] + self.short_title = init_data[SHORT_TITLE] + try: + self.bill_soup = BeautifulSoup(get(self.url).text, 'lxml') + except: + raise Exception('Unable to scrape ' + self.url) + + @property + def sponsor(self): + if(self._bill_data[SPONSOR] == ''): + try: + text = self.bill_soup.find('div', {'class': 'ItemIntroducedBy'}).find('p').text + return(text) + except: + return '' + else: + return(self._bill_data[SPONSOR][1:]) + + @property + def texts(self): + try: + data_list = [] + table_body = self.bill_soup.find('table', {'summary': 'A List of the various stages of this Bill'}).find('tbody') + links = table_body.find_all('a', {'title': 'View document in PDF in new window'}) + for link in links: + data_url = sa_base_url + link['href'].replace(' ', '%20') + data_text = link.parent.findPrevious('td').text + data_dict = {data_text: 
data_url} + data_list.append(data_dict) + return(data_list) + except: + return [] + + @property + def data(self): + self._bill_data[TEXTS] = self.texts + self._bill_data[SPONSOR] = self.sponsor + return(self._bill_data) \ No newline at end of file diff --git a/ausbills/tas_parliament.py b/ausbills/tas_parliament.py new file mode 100644 index 0000000..858934d --- /dev/null +++ b/ausbills/tas_parliament.py @@ -0,0 +1,207 @@ +from datetime import datetime +from requests import get +from bs4 import BeautifulSoup + +current_year = datetime.today().year +url_split = ['https://www.parliament.tas.gov.au/bills/Bills', '/BillsWeb', '.htm'] + +URL = 'url' +TITLE = 'title' +YEAR = 'year' +PASSED_LOWER = 'passed_lower' +PASSED_UPPER = 'passed_upper' +SPONSOR = 'sponsor' +BILL_TEXT_URL = 'bill_text_url' +WAS_AMENDED_UPPER = 'was_amended_upper' +WAS_AMENDED_LOWER = 'was_amended_lower' +ASSENTED = 'assented' +ACT_NO = 'act_no' +LOWER_FIRST_READING = 'lower_first_reading' +LOWER_SECOND_READING = 'lower_second_reading' +LOWER_THIRD_READING = 'lower_third_reading' +UPPER_FIRST_READING = 'upper_first_reading' +UPPER_SECOND_READING = 'upper_second_reading' +UPPER_THIRD_READING = 'upper_third_reading' + +class tas_All_Bills(object): + _bills_data = [] + + def __init__(self): + try: + self.create_dataset() + except: + raise Exception('Error when scraping bills...') + + def create_dataset(self): + for year in range(current_year - (current_year - 2002), current_year + 1): + soup = BeautifulSoup(get(url_split[0] + str(year) + url_split[1] + str(year) + url_split[2]).text, 'lxml') + table = soup.find('table', {'bordercolor': '#CCCCCC'}) + bills = table.find_all('a') + _bill_urls = [] + _bill_titles = [] + for bill in bills: + _bill_titles.append(bill.text.strip()) + _bill_urls.append(url_split[0] + str(year) + '/' + bill['href']) + for bill in range(len(_bill_titles)): + bill_dict = {URL: _bill_urls[bill], TITLE: _bill_titles[bill], YEAR: str(year)} + self._bills_data.append(bill_dict) + + 
@property + def data(self): + return(self._bills_data) + +tas_all_bills = tas_All_Bills().data + +class tas_Bill(object): + _all_bills = tas_all_bills + + def __init__(self, input): + if(isinstance(input, dict)): + try: + self.create_vars(input) + except Exception as e: + raise Exception('Dict must have correct keys, missing key ' + e) + else: + raise ValueError('Input data must be valid tas_Bill dict data...') + + def create_vars(self, init_data): + self._bill_data = init_data + self.url = init_data[URL] + self.title = init_data[TITLE] + try: + self.bill_soup = BeautifulSoup(get(self.url).text, 'lxml') + table = self.bill_soup.find('table', {'bordercolor': '#CCCCCC'}) + self._rows = table.find_all('tr') + except: + raise Exception('Unable to scrape ' + self.url) + self.get_first_readings() + self.get_second_readings() + self.get_third_readings() + + @property + def sponsor(self): + rows = self._rows + try: + return(rows[0].find('td', {'colspan': '4'}).text.replace(' Introduced by: ', '').strip()) + except: + return '' + + @property + def bill_text_url(self): + try: + return(url_split[0] + self._bill_data[YEAR] + '/' + self._rows[1].find('a')['href']) + except: + return '' + + @property + def was_amended_upper(self): + return(self.amended_check()[1]) + + @property + def was_amended_lower(self): + return(self.amended_check()[0]) + + @property + def passed_lower(self): + column = self._rows[12:][0].find('td') + try: + return(self.format_date(column.find('br').previousSibling.strip().replace('HA Agreed: ', ''))) + except: + return False + + @property + def passed_upper(self): + column = self._rows[12:][0].find('td') + try: + return(self.format_date(column.find('br').nextSibling.strip().replace('Agreed Both: ', ''))) + except: + return False + + @property + def assented(self): + column = self._rows[12:][0].find('td') + try: + return(self.format_date(column.find_all('br', limit=2)[-1].nextSibling.strip().replace('Royal Assent: ', ''))) + except: + return False + + 
@property + def act_no(self): + column = self._rows[12:][0].find('td') + try: + return(column.find_all('br', limit=4)[-1].nextSibling.replace('Act Number:', '').strip()) + except: + return '' + + def amended_check(self): + columns = self._rows[9:][0].find_all('td') + lower = columns[1].text.strip() + upper = columns[3].text.strip() + if(lower == 'Yes'): + lower = True + else: + lower = False + + if(upper == 'Yes'): + upper = True + else: + upper = False + return[lower, upper] + + def get_first_readings(self): + columns = self._rows[3:][0].find_all('td') + try: + self.lower_first_reading = self.format_date(columns[1].text.strip()) + except: + self.lower_first_reading = '' + + try: + self.upper_first_reading = self.format_date(columns[3].text.strip()) + except: + self.upper_first_reading = '' + + def get_second_readings(self): + columns = self._rows[5:][0].find_all('td') + try: + self.lower_second_reading = self.format_date(columns[1].text.strip()) + except: + self.lower_second_reading = '' + + try: + self.upper_second_reading = self.format_date(columns[3].text.strip()) + except: + self.upper_second_reading = '' + + def get_third_readings(self): + columns = self._rows[10:][0].find_all('td') + try: + self.lower_third_reading = self.format_date(columns[1].text.strip()) + except: + self.lower_third_reading = '' + + try: + self.upper_third_reading = self.format_date(columns[3].text.strip()) + except: + self.upper_third_reading = '' + + def format_date(self, input_date): + dateSplit = input_date.split('/', 2) + return("{:0>2s}".format(dateSplit[0]) + '-' "{:0>2s}".format(dateSplit[1]) + '-' + dateSplit[2]) + + @property + def data(self): + self._bill_data[PASSED_LOWER] = self.passed_lower + self._bill_data[PASSED_UPPER] = self.passed_upper + self._bill_data[LOWER_FIRST_READING] = self.lower_first_reading + self._bill_data[LOWER_SECOND_READING] = self.lower_second_reading + self._bill_data[LOWER_THIRD_READING] = self.lower_third_reading + 
self._bill_data[UPPER_FIRST_READING] = self.upper_first_reading + self._bill_data[UPPER_SECOND_READING] = self.upper_second_reading + self._bill_data[UPPER_THIRD_READING] = self.upper_third_reading + self._bill_data[SPONSOR] = self.sponsor + self._bill_data[BILL_TEXT_URL] = self.bill_text_url + self._bill_data[ASSENTED] = self.assented + self._bill_data[WAS_AMENDED_LOWER] = self.was_amended_lower + self._bill_data[WAS_AMENDED_UPPER] = self.was_amended_upper + self._bill_data[ACT_NO] = self.act_no + return(self._bill_data) \ No newline at end of file