UT: partial rewrite to support 2025
jessemortenson committed Dec 14, 2024
1 parent 11f7a64 commit c0dc1b1
Showing 2 changed files with 176 additions and 46 deletions.
14 changes: 11 additions & 3 deletions scrapers/ut/__init__.py
@@ -345,7 +345,7 @@ class Utah(State):
"name": "2024 General Session",
"start_date": "2024-01-16",
"end_date": "2024-03-01",
"active": True,
"active": False,
},
{
"_scraped_name": "2024 3rd Special Session",
@@ -355,7 +355,7 @@ class Utah(State):
"start_date": "2024-06-19",
# TODO: update end_date
"end_date": "2024-06-20",
"active": True,
"active": False,
},
{
"_scraped_name": "2024 4th Special Session",
@@ -365,11 +365,19 @@ class Utah(State):
"start_date": "2024-08-21",
# TODO: update end_date
"end_date": "2024-08-30",
"active": False,
},
{
"_scraped_name": "2025 General Session",
"classification": "primary",
"identifier": "2025",
"name": "2025 General Session",
"start_date": "2025-01-21",
"end_date": "2025-03-17",
"active": True,
},
]
ignored_scraped_sessions = [
"2025 General Session",
"2023 1st House Extraordinary Session",
"2013 1st House Session",
"2011 Veto Override Session",
208 changes: 165 additions & 43 deletions scrapers/ut/bills.py
@@ -5,7 +5,10 @@
from .actions import Categorizer
from utils import LXMLMixin

import json
import lxml.html
from lxml.etree import ParserError
import pytz
import scrapelib

SUB_BLACKLIST = [
@@ -22,9 +25,15 @@

SPECIAL_SLUGS = {"2021S1H": "2021Y1", "2021S1S": "2021X1"}

SPONSOR_HOUSE_TO_CHAMBER = {
"H": "lower",
"S": "upper",
}


class UTBillScraper(Scraper, LXMLMixin):
categorizer = Categorizer()
_TZ = pytz.timezone("America/Denver")

def scrape(self, session=None, chamber=None):
# if you need to test on an individual bill...
@@ -42,48 +51,51 @@ def scrape(self, session=None, chamber=None):
else:
session_slug = "{}GS".format(session)

session_url = "https://le.utah.gov/DynaBill/BillList?session={}".format(
session_slug
)
session_url = "https://le.utah.gov/billlist.jsp?session={}".format(session_slug)

# For some sessions the link doesn't go straight to the bill list
doc = self.lxmlize(session_url)
replacement_session_url = doc.xpath(
'//a[text()="Numbered Bills" and contains'
'(@href, "DynaBill/BillList")]/@href'
)
if replacement_session_url:
(session_url,) = replacement_session_url

# Identify all the bill lists linked from a given session's page
bill_indices = [
re.sub(r"^r", "", x)
for x in self.lxmlize(session_url).xpath('//div[contains(@id, "0")]/@id')
]

# Capture the bills from each of the bill lists
for bill_index in bill_indices:
if bill_index.startswith("H"):
chamber = "lower"
elif bill_index.startswith("S"):
chamber = "upper"
# Get all of the show/hide bill list elements
# in order to get the IDs of the actual bill lists
bill_list_ids = []
show_hide_elems = doc.cssselect("a.mitem")
js_id_getter = re.compile(r"javascript:toggleObj\('([^']+)'\)")
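# each matching href looks like javascript:toggleObj('<listId>'); capture the id between the quotes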
for elem in show_hide_elems:
list_id_match = js_id_getter.match(elem.get("href"))
if list_id_match:
bill_list_ids.append(list_id_match.group(1))
else:
raise AssertionError("Unknown bill type found: {}".format(bill_index))

bill_index = self.lxmlize(session_url + "&bills=" + bill_index)
bills = bill_index.xpath('//a[contains(@href, "/bills/static/")]')

for bill in bills:
yield from self.scrape_bill(
chamber=chamber,
session=session,
bill_id=bill.xpath("text()")[0],
url=bill.xpath("@href")[0],
self.logger.error(
"Failed to find bill list ID in JS show/hide element"
)

def scrape_bill(self, chamber, session, bill_id, url):
# Capture the bills from each of the bill lists
for list_id in bill_list_ids:
bill_link_containers = doc.cssselect(f"#{list_id}")
for container in bill_link_containers:
for bill_link in container.cssselect("a"):
if bill_link.text.startswith("H"):
chamber = "lower"
elif bill_link.text.startswith("S"):
chamber = "upper"
else:
raise AssertionError(
"Unknown bill type found: {}".format(bill_link.text)
)

yield from self.scrape_bill(
chamber=chamber,
session=session,
url=bill_link.get("href"),
session_slug=session_slug,
)

def scrape_bill(self, chamber, session, url, session_slug):
page = self.lxmlize(url)

bill_id = page.cssselect("#breadcrumb li")[-1].text

(header,) = page.xpath('//h3[@class="heading"]/text()')
title = header.replace(bill_id, "").strip()

@@ -111,6 +123,18 @@ def scrape_bill(self, chamber, session, bill_id, url):
bill.add_source(url)

primary_info = page.xpath('//div[@id="billsponsordiv"]')
if len(primary_info) == 0:
# Starting in 2025, UT seems to render bill data from an API/JSON,
# but prior years seem to have static-ish HTML,
# so we have two logic branches here
# TODO: vote processing - need to see what the data looks like
self.scrape_bill_details_from_api(bill, url, session_slug)
else:
yield from self.parse_bill_details_from_html(bill, bill_id, chamber, page, primary_info)

yield bill

def parse_bill_details_from_html(self, bill, bill_id, chamber, page, primary_info):
for info in primary_info:
try:
(title, name) = [
@@ -140,12 +164,10 @@ def scrape_bill(self, chamber, session, bill_id, url):
)
else:
self.warning("Unexpected floor sponsor HTML found")

versions = page.xpath(
'//b[text()="Bill Text"]/following-sibling::ul/li/'
'a[text() and not(text()=" ")]'
)

for version in versions:

# sometimes the href is on the following <a> tag and the tag we
@@ -155,12 +177,13 @@ def scrape_bill(self, chamber, session, bill_id, url):
url = version.xpath("following-sibling::a[1]/@href")[0]

bill.add_version_link(
version.xpath("text()")[0].strip(), url, media_type="application/pdf"
version.xpath("text()")[0].strip(),
url,
media_type="application/pdf",
)

for related in page.xpath(
'//b[text()="Related Documents "]/following-sibling::ul/li/'
'a[contains(@class,"nlink")]'
'//b[text()="Related Documents "]/following-sibling::ul/li/'
'a[contains(@class,"nlink")]'
):
href = related.xpath("@href")[0]
if ".fn.pdf" in href:
@@ -170,17 +193,112 @@ def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
else:
text = related.xpath("text()")[0]
bill.add_document_link(text, href, media_type="application/pdf")

subjects = []
for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
subjects.append(link.text.strip())
bill.subject = subjects

if page.xpath('//div[@id="billStatus"]//table'):
status_table = page.xpath('//div[@id="billStatus"]//table')[0]
yield from self.parse_status(bill, status_table, chamber)

yield bill
def scrape_bill_details_from_api(self, bill: Bill, bill_url, session_slug: str):
# get bill "filename" from bill_url
bill_filename = bill_url.split("/")[-1].split(".")[0]
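# e.g. a bill URL ending in "HB0001.html" would yield bill_filename "HB0001" (hypothetical example)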
# use datetime to generate a unix epoch timestamp representing now
# UT seems to do this in milliseconds
now = int(datetime.datetime.now().timestamp() * 1000)
api_url = (
f"https://le.utah.gov/data/{session_slug}/{bill_filename}.json?_={now}"
)
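# e.g. https://le.utah.gov/data/2025GS/HB0001.json?_=1734220800000 (illustrative filename and timestamp)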
response = self.get(api_url)
data = json.loads(response.content)
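# Rough shape of the JSON, inferred only from the fields read below (not an official schema):
# {
#   "primeSponsorName": "Rep. ...", "primeSponsorHouse": "H",
#   "floorSponsor": ..., "floorSponsorName": "Sen. ...", "floorSponsorHouse": "S",
#   "billVersionList": [{"subjectList": [{"description": ...}],
#                        "billDocs": [{"fileName": ..., "shortDesc": ..., "url": ...}]}],
#   "actionHistoryList": [{"description": ..., "owner": ..., "actionDate": "MM/DD/YYYY"}]
# }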

# Sponsorships
sponsor_name = data["primeSponsorName"]
sponsor_name = sponsor_name.replace("Sen. ", "").replace("Rep. ", "")
sponsor_chamber = SPONSOR_HOUSE_TO_CHAMBER[data["primeSponsorHouse"]]
bill.add_sponsorship(
sponsor_name,
classification="primary",
entity_type="person",
primary=True,
chamber=sponsor_chamber,
)
if data["floorSponsor"]:
floor_sponsor_name = data["floorSponsorName"]
floor_sponsor_name = floor_sponsor_name.replace("Sen. ", "").replace("Rep. ", "")
floor_sponsor_chamber = SPONSOR_HOUSE_TO_CHAMBER[data["floorSponsorHouse"]]
bill.add_sponsorship(
floor_sponsor_name,
classification="cosponsor",
entity_type="person",
primary=False,
chamber=floor_sponsor_chamber,
)

# Versions, subjects, code citations
subjects = set()
# citations = set()
for version_data in data["billVersionList"]:
# subjects associated with each version, so dedupe
for subject_data in version_data["subjectList"]:
subjects.add(subject_data["description"])

# TODO finish citations work
# citations associated with each version, dedupe again
# for citation_data in version_data["sectionAffectedList"]:
# citations.add(citation_data["secNo"])

for doc_data in version_data["billDocs"]:
# Not really sure what's going to be in this array:
# just versions? other documents?
# So log an error here if we find a surprise
# and improve the scraper later
if doc_data["fileName"] != f"{bill_filename}.xml":
self.error(f"Found unexplored bill version data at {api_url}")

# There seem to be XML and PDF files on the Utah server;
# the UT bill details page seems to have code to
# display the XML as HTML inline
bill.add_version_link(
doc_data["shortDesc"],
f"https://le.utah.gov{doc_data['url']}",
media_type="text/xml"
)
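# assume the matching PDF lives at the same path with a .pdf extension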
pdf_filepath = doc_data['url'].replace(".xml", ".pdf")
bill.add_version_link(
doc_data["shortDesc"],
f"https://le.utah.gov{pdf_filepath}",
media_type="application/pdf"
)

for subject in subjects:
bill.add_subject(subject)

# TODO finish citations work
# for citation in citations:
# bill.add_citation(citation)

# Actions
for action_data in data["actionHistoryList"]:
categorizer_result = self.categorizer.categorize(action_data["description"])
actor = "legislature"
if action_data["owner"] == "Legislative Research and General Counsel":
actor = "legislature"
elif "governor" in action_data["owner"].lower():
actor = "executive"
else:
self.error(f"Found unexpected actor {action_data['owner']} at {api_url}")

date = datetime.datetime.strptime(action_data["actionDate"], "%m/%d/%Y").date()
date = date.strftime("%Y-%m-%d")
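# e.g. an actionDate of "01/21/2025" becomes "2025-01-21"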

bill.add_action(
date=date,
description=action_data["description"],
classification=categorizer_result["classification"],
chamber=actor,
)

def parse_status(self, bill, status_table, chamber):
page = status_table
@@ -303,7 +421,11 @@ def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
except scrapelib.HTTPError:
self.warning("A vote page not found for bill {}".format(bill.identifier))
return
page = lxml.html.fromstring(page)
try:
page = lxml.html.fromstring(page)
except ParserError:
self.logger.warning(f"Could not parse HTML vote page {url}")
return

page.make_links_absolute(url)
descr = page.xpath("//b")[0].text_content()
if descr == "":
