Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dramatically cut RAM usage when performing typical filtering by ARN. #732

Merged
merged 4 commits on
Mar 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions tests/record_sources/local_directory_record_source_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@


def test_load_gzipped_files_in_timeframe_from_dir():
records = LocalDirectoryRecordSource(cloudtrail_data_dir()).load_from_dir(
records = list(LocalDirectoryRecordSource(cloudtrail_data_dir()).load_from_dir(
datetime.datetime(2017, 12, 1, tzinfo=pytz.utc),
datetime.datetime(2017, 12, 12, tzinfo=pytz.utc))
datetime.datetime(2017, 12, 12, tzinfo=pytz.utc)))
assert records == [
Record("autoscaling.amazonaws.com", "DescribeLaunchConfigurations",
assumed_role_arn="arn:aws:iam::111111111111:role/someRole",
Expand All @@ -23,9 +23,9 @@ def test_load_gzipped_files_in_timeframe_from_dir():


def test_load_gzipped_files_including_those_that_were_delivered_only_an_hour_after_the_event_time_we_are_looking_for():
records = LocalDirectoryRecordSource(cloudtrail_data_dir()).load_from_dir(
records = list(LocalDirectoryRecordSource(cloudtrail_data_dir()).load_from_dir(
datetime.datetime(2017, 12, 11, 0, 0, tzinfo=pytz.utc),
datetime.datetime(2017, 12, 11, 14, 5, tzinfo=pytz.utc))
datetime.datetime(2017, 12, 11, 14, 5, tzinfo=pytz.utc)))
assert records == [
Record("autoscaling.amazonaws.com", "DescribeLaunchConfigurations",
assumed_role_arn="arn:aws:iam::111111111111:role/someRole",
Expand All @@ -38,8 +38,8 @@ def test_load_gzipped_files_including_those_that_were_delivered_only_an_hour_aft


def test_load_no_gzipped_files_outsite_timeframe_from_dir():
records = LocalDirectoryRecordSource(cloudtrail_data_dir()).load_from_dir(
records = list(LocalDirectoryRecordSource(cloudtrail_data_dir()).load_from_dir(
datetime.datetime(2016, 12, 1, tzinfo=pytz.utc),
datetime.datetime(2016, 12, 12, tzinfo=pytz.utc))
datetime.datetime(2016, 12, 12, tzinfo=pytz.utc)))
assert records == []

2 changes: 1 addition & 1 deletion trailscraper/cloudtrail.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ def filter_records(records,
to_date=datetime.datetime.now(tz=pytz.utc)):
"""Filter records so they match the given condition"""
result = list(pipe(records, filterz(_by_timeframe(from_date, to_date)), filterz(_by_role_arns(arns_to_filter_for))))
if not result and records:
if not result:
logging.warning(ALL_RECORDS_FILTERED)

return result
5 changes: 1 addition & 4 deletions trailscraper/record_sources/cloudtrail_api_record_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@ def load_from_api(self, from_date, to_date):
StartTime=from_date,
EndTime=to_date,
)
records = []
for response in response_iterator:
for event in response['Events']:
records.append(_parse_record(json.loads(event['CloudTrailEvent'])))

return records
yield _parse_record(json.loads(event['CloudTrailEvent']))
5 changes: 1 addition & 4 deletions trailscraper/record_sources/local_directory_record_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,9 @@ def _to_paths(triple):

def load_from_dir(self, from_date, to_date):
"""Loads all CloudTrail Records in a file"""
records = []
for logfile in self._valid_log_files():
if logfile.contains_events_for_timeframe(from_date, to_date):
records.extend(logfile.records())

return records
yield from logfile.records()

def last_event_timestamp_in_dir(self):
"""Return the timestamp of the most recent event in the given directory"""
Expand Down