Merge pull request #328 from specifysystems/291-split-occurrences-tool-should-optionally-output-a-species-list

Add output species list to occ splitter
specifysystems · May 23, 2022 · 945eaa6 · 945eaa6
2 parents 4e05a7f + 1a43214
commit 945eaa6
Show file tree

Hide file tree

Showing 4 changed files with 40 additions and 0 deletions.
diff --git a/lmpy/data_preparation/occurrence_splitter.py b/lmpy/data_preparation/occurrence_splitter.py
@@ -97,6 +97,7 @@ def __init__(
         self.writer_fields = write_fields
         self.max_writers = max_writers
         self.writers = {}
+        self.seen_taxa = set()
 
     # .......................
     def __enter__(self):
@@ -176,12 +177,24 @@ def write_points(self, points):
         """
         if points:
             writer_key = self.get_writer_key(points[0])
+            self.seen_taxa.add(points[0].species_name)
             if writer_key not in self.writers.keys():
                 if self.writer_fields is None:
                     self.writer_fields = list(points[0].attributes.keys())
                 self.open_writer(writer_key)
             self.writers[writer_key].write_points(points)
 
+    # .......................
+    def write_species_list(self, species_list_filename):
+        """Write the species seen (after wrangling), one per line, in sorted order.
+
+        Args:
+            species_list_filename (str): File location to write the species list.
+        """
+        with open(species_list_filename, mode='wt') as species_out:
+            for sp in sorted(self.seen_taxa, key=str):
+                species_out.write(f'{sp}\n')
+
 
 # .....................................................................................
 __all__ = [

diff --git a/lmpy/tools/split_occurrence_data.py b/lmpy/tools/split_occurrence_data.py
@@ -83,6 +83,11 @@ def build_parser():
             'a species header key, an x header key, and a y header key.'
         ),
     )
+    parser.add_argument(
+        '--species_list_filename',
+        type=str,
+        help='File location to write list of species seen (after wrangling).'
+    )
 
     parser.add_argument(
         'out_dir',
@@ -131,6 +136,8 @@ def cli():
                 with open(wranglers_fn, mode='rt') as in_json:
                     wranglers = wrangler_factory.get_wranglers(json.load(in_json))
                 occurrence_processor.process_reader(reader, wranglers)
+        if args.species_list_filename:
+            occurrence_processor.write_species_list(args.species_list_filename)
 
 
 # .....................................................................................

diff --git a/tests/test_data_preparation/test_occurrence_splitter.py b/tests/test_data_preparation/test_occurrence_splitter.py
@@ -50,6 +50,7 @@ def test_one_dwca(monkeypatch, generate_temp_filename, temp_directory):
     """
     # Temporary files
     dwca_filename = generate_temp_filename()
+    species_list_filename = generate_temp_filename(suffix='.txt')
 
     # Generate a DWCA and wranglers
     dwca_fields = [
@@ -91,12 +92,17 @@ def test_one_dwca(monkeypatch, generate_temp_filename, temp_directory):
     splitter.process_reader(
         PointDwcaReader(dwca_filename), factory.get_wranglers(wrangler_config)
     )
+    splitter.write_species_list(species_list_filename)
     splitter.close()
 
     # Check output
     assert validate_point_csvs(
         glob.glob(f'{temp_directory}/*.csv'), 'taxonname', 'longitude', 'latitude'
     )
+    # Check that species in species list are accepted
+    with open(species_list_filename, mode='rt') as species_in:
+        for line in species_in:
+            assert line.strip() in list(SPECIES_MAP.values())
 
 
 # .....................................................................................
@@ -179,6 +185,7 @@ def test_complex(monkeypatch, generate_temp_filename, temp_directory):
     dwca_2_filename = generate_temp_filename()
     csv_1_filename = generate_temp_filename()
     csv_2_filename = generate_temp_filename()
+    species_list_filename = generate_temp_filename(suffix='.txt')
 
     # Reader and wrangler configurations
     # DWCA 1
@@ -413,8 +420,13 @@ def test_complex(monkeypatch, generate_temp_filename, temp_directory):
             PointCsvReader(csv_2_filename, 'taxname', 'dec_lon', 'dec_lat'),
             factory.get_wranglers(csv_2_wrangler_conf)
         )
+        splitter.write_species_list(species_list_filename)
 
     # Check output
     assert validate_point_csvs(
         glob.glob(f'{temp_directory}/*.csv'), 'species', 'longitude', 'latitude'
     )
+    # Check that species in species list are accepted
+    with open(species_list_filename, mode='rt') as species_in:
+        for line in species_in:
+            assert line.strip() in list(SPECIES_MAP.values())
diff --git a/tests/test_tools/test_split_occurrence_data.py b/tests/test_tools/test_split_occurrence_data.py
@@ -274,6 +274,8 @@ def test_complex(monkeypatch, generate_temp_filename, temp_directory):
     wrangler_3_filename = generate_temp_filename()
     wrangler_4_filename = generate_temp_filename()
 
+    species_list_filename = generate_temp_filename(suffix='.txt')
+
     # Reader and wrangler configurations
     # DWCA 1
     dwca_1_fields = [
@@ -518,6 +520,8 @@ def test_complex(monkeypatch, generate_temp_filename, temp_directory):
         'taxname',
         'dec_lon',
         'dec_lat',
+        '--species_list_filename',
+        species_list_filename,
         temp_directory
     ]
 
@@ -528,3 +532,7 @@ def test_complex(monkeypatch, generate_temp_filename, temp_directory):
     assert validate_point_csvs(
         glob.glob(f'{temp_directory}/*.csv'), 'species_name', 'x', 'y'
     )
+    # Check that species in species list are accepted
+    with open(species_list_filename, mode='rt') as species_in:
+        for line in species_in:
+            assert line.strip() in list(SPECIES_MAP.values())