Improve EPSS collector memory usage (#360)

RedHatProductSecurity · Oct 23, 2023 · f3f8a17
2 parents 90b5a68 + 6ca8291
commit f3f8a17
Show file tree

Hide file tree

Showing 4 changed files with 17 additions and 13 deletions.
diff --git a/apps/exploits/tests/test_api.py b/apps/exploits/tests/test_api.py
@@ -23,8 +23,7 @@
 def load_data():
     # Import inside test to avoid using database before
     # enable_db_access_for_all_tests fixture is ready
-    from collectors.epss.tasks import process_data as process_data_epss
-    from collectors.epss.tasks import store_objects
+    from collectors.epss.tasks import process_and_store
     from collectors.exploits_cisa.tasks import process_data
 
     f1 = FlawFactory(
@@ -93,8 +92,7 @@ def load_data():
         objects = process_data(json.load(f))
     store_or_update_exploits(objects)
 
-    objects = process_data_epss(EPSS_TEST_FILE)
-    store_objects(objects)
+    process_and_store(EPSS_TEST_FILE)
 
 
 class TestAPI(object):

diff --git a/apps/exploits/tests/test_parsing.py b/apps/exploits/tests/test_parsing.py
@@ -108,12 +108,11 @@ def test_parsing_exploit_db(self):
     def test_parsing_epss(self):
         # Import inside test to avoid using database before
         # enable_db_access_for_all_tests fixture is ready
-        from collectors.epss.tasks import process_data, store_objects
+        from collectors.epss.tasks import process_and_store
 
         FlawFactory(cve_id="CVE-2222-0001")
         FlawFactory(cve_id="CVE-2222-0002")
-        objects = process_data(EPSS_TEST_FILE)
-        store_objects(objects)
+        process_and_store(EPSS_TEST_FILE)
 
         assert EPSS.objects.count() == 3
 

diff --git a/collectors/epss/tasks.py b/collectors/epss/tasks.py
@@ -23,6 +23,7 @@
 COLUMN_CVE = 0
 COLUMN_EPSS = 1
 EPSS_URL = "https://epss.cyentia.com/epss_scores-current.csv.gz"
+CHUNK_SIZE = 1000
 
 
 def download():
@@ -37,7 +38,7 @@ def process_data(compressed_file):
     reader = csv.reader(csv_data.split("\n"))
     rows = [r for r in reader if r][2:]  # Remove header and empty lines
 
-    for row in rows:
+    for i, row in enumerate(rows, start=1):
         cve = row[COLUMN_CVE]
         epss = float(row[COLUMN_EPSS])
         epss_objects.append(
@@ -47,22 +48,27 @@ def process_data(compressed_file):
                 epss=epss,
             )
         )
+        if i % CHUNK_SIZE == 0:
+            update_objects_with_flaws(epss_objects)  # Make links to flaws if they exist
+            yield epss_objects
+            epss_objects = []
 
     update_objects_with_flaws(epss_objects)  # Make links to flaws if they exist
-    return epss_objects
+    yield epss_objects
 
 
-def store_objects(epss_objects):
+def process_and_store(data):
+    # Do not use store_objects as it cannot handle chunks
     with transaction.atomic():  # Avoid having empty table accessible
         EPSS.objects.all().delete()  # Always load the data again, as it changes in time
-        EPSS.objects.bulk_create(epss_objects)
+        for objects in process_data(data):
+            EPSS.objects.bulk_create(objects)
 
 
 def epss_collector_main():
     set_exploit_collector_acls()
     data = download()
-    objects = process_data(data)
-    store_objects(objects)
+    process_and_store(data)
 
 
 @collector(

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   /collectors/api/v1/status endpoint
 - fix PS contact model (OSIDB-1445)
 - fix schema to reflect Erratum shipped_dt to be nullable
+- Improve EPSS collector memory consumption
 
 ## [3.5.0] - 2023-10-09
 ### Added