Skip to content

Commit

Permalink
Improve EPSS collector memory usage (#360)
Browse files Browse the repository at this point in the history
  • Loading branch information
skontar authored Oct 23, 2023
2 parents 90b5a68 + 6ca8291 commit f3f8a17
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 13 deletions.
6 changes: 2 additions & 4 deletions apps/exploits/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@
def load_data():
# Import inside test to avoid using database before
# enable_db_access_for_all_tests fixture is ready
from collectors.epss.tasks import process_data as process_data_epss
from collectors.epss.tasks import store_objects
from collectors.epss.tasks import process_and_store
from collectors.exploits_cisa.tasks import process_data

f1 = FlawFactory(
Expand Down Expand Up @@ -93,8 +92,7 @@ def load_data():
objects = process_data(json.load(f))
store_or_update_exploits(objects)

objects = process_data_epss(EPSS_TEST_FILE)
store_objects(objects)
process_and_store(EPSS_TEST_FILE)


class TestAPI(object):
Expand Down
5 changes: 2 additions & 3 deletions apps/exploits/tests/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,11 @@ def test_parsing_exploit_db(self):
def test_parsing_epss(self):
# Import inside test to avoid using database before
# enable_db_access_for_all_tests fixture is ready
from collectors.epss.tasks import process_data, store_objects
from collectors.epss.tasks import process_and_store

FlawFactory(cve_id="CVE-2222-0001")
FlawFactory(cve_id="CVE-2222-0002")
objects = process_data(EPSS_TEST_FILE)
store_objects(objects)
process_and_store(EPSS_TEST_FILE)

assert EPSS.objects.count() == 3

Expand Down
18 changes: 12 additions & 6 deletions collectors/epss/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
COLUMN_CVE = 0
COLUMN_EPSS = 1
EPSS_URL = "https://epss.cyentia.com/epss_scores-current.csv.gz"
CHUNK_SIZE = 1000


def download():
Expand All @@ -37,7 +38,7 @@ def process_data(compressed_file):
reader = csv.reader(csv_data.split("\n"))
rows = [r for r in reader if r][2:] # Remove header and empty lines

for row in rows:
for i, row in enumerate(rows, start=1):
cve = row[COLUMN_CVE]
epss = float(row[COLUMN_EPSS])
epss_objects.append(
Expand All @@ -47,22 +48,27 @@ def process_data(compressed_file):
epss=epss,
)
)
if i % CHUNK_SIZE == 0:
update_objects_with_flaws(epss_objects) # Make links to flaws if they exist
yield epss_objects
epss_objects = []

update_objects_with_flaws(epss_objects) # Make links to flaws if they exist
return epss_objects
yield epss_objects


def store_objects(epss_objects):
def process_and_store(data):
# Do not use store_objects as it cannot handle chunks
with transaction.atomic(): # Avoid having empty table accessible
EPSS.objects.all().delete() # Always load the data again, as it changes in time
EPSS.objects.bulk_create(epss_objects)
for objects in process_data(data):
EPSS.objects.bulk_create(objects)


def epss_collector_main():
set_exploit_collector_acls()
data = download()
objects = process_data(data)
store_objects(objects)
process_and_store(data)


@collector(
Expand Down
1 change: 1 addition & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
/collectors/api/v1/status endpoint
- fix PS contact model (OSIDB-1445)
- fix schema to reflect that Erratum shipped_dt is nullable
- Improve EPSS collector memory consumption

## [3.5.0] - 2023-10-09
### Added
Expand Down

0 comments on commit f3f8a17

Please sign in to comment.