From 33c1063ede2470fa5471649b78f5a6d0254cea0f Mon Sep 17 00:00:00 2001 From: David Cain Date: Mon, 24 Jun 2024 09:08:41 -0600 Subject: [PATCH] Fix a bug from trying to scrape all trips I found a scraper (hello, Scrapy user!) trying to scrape all trips from https://mitoc-trips.mit.edu/trips/?after=0001-10-17 The server crashed because I mistakenly assumed you can subtract 365 days from any given year. Let's not do that. Separately, with *thousands* of trips, this view is getting pretty slow. To monitor (and perhaps limit) scrapers, I might try to make the view a bit more locked down. I don't *want* to paginate, but I don't want abuse either. --- ws/tests/views/test_trips.py | 11 +++++++++ ws/views/trips.py | 45 ++++++++++++++++++++++++------------ 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/ws/tests/views/test_trips.py b/ws/tests/views/test_trips.py index 428076b0..66aac398 100644 --- a/ws/tests/views/test_trips.py +++ b/ws/tests/views/test_trips.py @@ -126,6 +126,17 @@ def test_trips_with_filter(self): self._expect_past_trips(response, [one_week_ago.pk, one_month_ago.pk]) self._expect_link_for_date(soup, "2016-11-15") + def test_trips_with_very_early_date(self): + """You can ask for trips starting after the year 1.""" + trip = factories.TripFactory.create(trip_date="2016-12-23") + + # Filter based on a date in the past + response, soup = self._get("/trips/?after=0001-10-17") + self.assertFalse(response.context["date_invalid"]) + + self._expect_title(soup, "Trips after 1900-01-01") + self._expect_past_trips(response, [trip.pk]) + def test_upcoming_trips_can_be_filtered(self): """If supplying an 'after' date in the future, that still permits filtering!""" _next_week = factories.TripFactory.create(trip_date="2019-02-22") diff --git a/ws/views/trips.py b/ws/views/trips.py index 1669a93a..b392794c 100644 --- a/ws/views/trips.py +++ b/ws/views/trips.py @@ -49,6 +49,11 @@ from ws.middleware import RequestWithParticipant +# Hardcode the result of `models.Trip.objects.earliest('trip_date') +# There will never be any *older* trips, so we can save db queries. +FIRST_TRIP_DATE = date(2015, 1, 10) + + class TripView(DetailView): """Display the trip to both unregistered users and known participants. @@ -487,25 +492,30 @@ def get_queryset(self): trips = super().get_queryset() return annotated_for_trip_list(trips) - def _optionally_filter_from_args(self): + def _optionally_filter_from_args(self) -> tuple[date | None, bool]: """Return the date at which we want to omit previous trips, plus validity boolean. If the user passes an invalid date (for example, because they were manually building the query arguments), we don't want to 500 and instead should just give them a simple warning message. """ - start_date = None - start_date_invalid = False - if "after" in self.request.GET: - after = self.request.GET["after"] - try: - start_date = date.fromisoformat(after) - except (TypeError, ValueError): - start_date_invalid = True - else: - start_date_invalid = False + if "after" not in self.request.GET: + # No filtering, which is obviously valid + return None, False + + after = self.request.GET["after"] + try: + start_date = date.fromisoformat(after) + except (TypeError, ValueError): + return None, True - return (start_date, start_date_invalid) + return ( + # Guard against a weird edge case -- we can accept the year 1! + # We'll try to *subtract* time, and end up an exception rather than BCE datetimes + # There were no trips in our database before the 2010s anyway. + max(start_date, date(1900, 1, 1)), + False, + ) def get_context_data(self, **kwargs): """Sort trips into past and present trips.""" @@ -519,9 +529,14 @@ def get_context_data(self, **kwargs): context["on_or_after_date"] = on_or_after_date trips = trips.filter(trip_date__gte=on_or_after_date) - # Get approximately one year prior for use in paginating back in time. - # (need not be exact/handle leap years) - context["one_year_prior"] = (on_or_after_date or today) - timedelta(days=365) + oldest_trip_date = on_or_after_date or today + # Prevent rendering "Previous trips" if we know there won't be any. + # Also prevents a weird edge case -- you can pass a date in the year 1. + # (When the server tries to subtract 365 days, Python declines to make BCE dates) + if oldest_trip_date >= FIRST_TRIP_DATE: + # Get approximately one year prior for use in paginating back in time. + # (need not be exact/handle leap years) + context["one_year_prior"] = oldest_trip_date - timedelta(days=365) # By default, just show upcoming trips. context["current_trips"] = trips.filter(trip_date__gte=today)