Skip to content
This repository has been archived by the owner on Sep 4, 2024. It is now read-only.

Commit

Permalink
Aggregate occurrence data testing
Browse files Browse the repository at this point in the history
  • Loading branch information
cjgrady committed May 24, 2022
1 parent bbc4c4c commit 3a92489
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 16 deletions.
15 changes: 13 additions & 2 deletions lmpy/point.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ def __init__(
y_field,
geopoint=None,
group_field='species_name',
encoding='utf8',
):
"""Constructor for a Point CSV retriever.
Expand All @@ -188,6 +189,7 @@ def __init__(
geopoint (:obj:`str`): The field name of the column containing geopoint
data.
group_field (:obj:`str`): The name of the field to use for grouping points.
encoding (str): The encoding to use when opening the file.
"""
self.filename = filename
self.file = None
Expand All @@ -199,6 +201,7 @@ def __init__(
self.group_field = group_field
self._next_points = []
self._curr_val = None
self.encoding = encoding

# .......................
def __enter__(self):
Expand Down Expand Up @@ -275,7 +278,7 @@ def __next__(self):
# .......................
def open(self):
"""Open the file and initialize."""
self.file = open(self.filename, 'r')
self.file = open(self.filename, 'r', encoding=self.encoding)
temp_lines = []
try:
for _ in range(3):
Expand Down Expand Up @@ -494,8 +497,10 @@ def __next__(self):
Raises:
StopIteration: Raised when there are no additional objects.
"""
for point_row in self.reader:
more_rows = True
while more_rows:
try:
point_row = next(self.reader)
point_dict = {
term: self.fields[term](point_row) for term in self.fields.keys()
}
Expand All @@ -514,10 +519,16 @@ def __next__(self):
return tmp
self._curr_val = test_val
self._next_points.append(pt)
except IndexError:
pass
except KeyError:
pass
except TypeError:
pass
except StopIteration:
more_rows = False
except csv.Error:
pass

if self._next_points:
tmp = self._next_points
Expand Down
4 changes: 4 additions & 0 deletions lmpy/tools/split_occurrence_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@ def cli():
parser = build_parser()
args = _process_arguments(parser, 'config_file')

# Default key field is 'species_name'
if args.key_field is None:
args.key_field = ['species_name']

# Establish functions for getting writer key and filename
writer_key_func = get_writer_key_from_fields_func(*tuple(args.key_field))
writer_filename_func = get_writer_filename_func(args.out_dir)
Expand Down
49 changes: 48 additions & 1 deletion sample_data/name_map/croc_name_map.json
Original file line number Diff line number Diff line change
@@ -1 +1,48 @@
{}
{
"Crocodylus porosus": "Crocodylus porosus",
"Crocodylus johnstoni": "Crocodylus johnstoni",
"Crocodylus": "Crocodylus",
"Crocodylus niloticus cowiei (smith in hewitt, 1937)": "Crocodylus niloticus cowiei",
"Crocodylus niloticus laurenti, 1768": "Crocodylus niloticus",
"Crocodylus acutus (cuvier, 1807)": "Crocodylus acutus",
"Crocodylus siamensis schneider, 1801": "Crocodylus siamensis",
"Crocodylus moreletii (dum\u00e9ril & bibron, 1851)": "Crocodylus moreletii",
"Crocodylus porosus schneider, 1801": "Crocodylus porosus",
"Crocodylus palustris (lesson, 1831)": "Crocodylus palustris",
"Crocodylus suchus geoffroy saint-hilaire, 1807": "Crocodylus suchus",
"Crocodylus johnsoni krefft, 1873": "Crocodylus johnsoni",
"Crocodylus mindorensis schmidt, 1935": "Crocodylus mindorensis",
"Crocodylus niloticus africanus laurenti, 1768": "Crocodylus niloticus",
"Crocodylus johnstoni krefft, 1873": "Crocodylus johnstoni",
"Crocodylus niloticus pauciscutatus deraniyagala, 1948": "Crocodylus niloticus",
"Crocodylus laurenti, 1768": "Crocodylus",
"Crocodylus intermedius (graves, 1819)": "Crocodylus intermedius",
"Crocodylus rhombifer (cuvier, 1807)": "Crocodylus rhombifer",
"Crocodylus niloticus niloticus": "Crocodylus niloticus niloticus",
"Crocodylus niloticus chamses bory de saint vincent, 1824": "Crocodylus niloticus",
"Crocodylus halli murray, russo, zorilla & mcmahan, 2019": "Crocodylus halli",
"Bold:aac4128": "BOLD:AAC4128",
"Bold:aar5109": "BOLD:AAR5109",
"Bold:adc8567": "BOLD:ADC8567",
"Bold:aac4127": "BOLD:AAC4127",
"Crocodylus novaeguineae schmidt, 1928": "Crocodylus novaeguineae",
"Crocodilus antiquus leidy, 1852": "Crocodilus antiquus",
"Crocodilus ziphodon marsh, 1871": "Crocodilus ziphodon",
"Crocodylus greenwoodi suneja et al., 1977": "Crocodylus greenwoodi",
"Crocodylus falconensis scheyer, aguilera, delfino, fortier, carlini, s\u00e1nchez, carrillo-brice\u00f1o, quiroz & s\u00e1nchez-villagra, 2013": "Crocodylus falconensis",
"Crocodylus checchiai maccagno, 1947": "Crocodylus checchiai",
"Crocodylus megarhinus (andrews, 1905)": "Crocodylus megarhinus",
"Crocodylus anthropophagus brochu, njau, blumenschine & densmore, 2010": "Crocodylus anthropophagus",
"Crocodylus palaeindicus falconer, 1859": "Crocodylus palaeindicus",
"Crocodylus thorbjarnarsoni brochu & storrs, 2012": "Crocodylus thorbjarnarsoni",
"Crocodylus gariepensis pickford, 2003": "Crocodylus gariepensis",
"Crocodylus palustris palustris": "Crocodylus palustris palustris",
"Crocodylus novaeguineae mindorensis schmidt, 1935": "Crocodylus novaeguineae",
"Crocodilus laurenti, 1768": "Crocodilus",
"Crocodylus novaeguineae novaeguineae": "Crocodylus novaeguineae novaeguineae",
"Crocodylus palustris kimbula deraniyagala, 1936": "Crocodylus palustris",
"Crocodilus americanus laurenti, 1768": "Crocodilus americanus",
"Crocodylus raninus ross, 1990": "Crocodylus raninus",
"Crocodylus americanus laurenti, 1768": "Crocodylus americanus",
"Crocodilus porosus schneider, 1801": null
}
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ def get_script_runner(console_script, script_module, args):
cmd_args.extend(args)

# Run command
subprocess.run(cmd_args)
subprocess.run(cmd_args, capture_output=True, check=True)

return get_script_runner

Expand Down
68 changes: 56 additions & 12 deletions tests/test_tutorials/test_aggregate_occurrence_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,7 @@ def wrangler_configs(data_dir):
'gbif': [
dict(
wrangler_type='AcceptedNameOccurrenceWrangler',
name_resolver='gbif',
out_map_filename=os.path.join(
data_dir, 'name_map/croc_name_map.json'
)
name_map=os.path.join(data_dir, 'name_map/croc_name_map.json'),
),
dict(
wrangler_type='AttributeFilterWrangler',
Expand Down Expand Up @@ -56,7 +53,7 @@ def wrangler_configs(data_dir):
'idigbio': [
dict(
wrangler_type='AcceptedNameOccurrenceWrangler',
name_map=os.path.join(data_dir, 'name_map/croc_name_map.json')
name_map=os.path.join(data_dir, 'name_map/croc_name_map.json'),
),
dict(
wrangler_type='AttributeFilterWrangler',
Expand Down Expand Up @@ -98,10 +95,7 @@ def wrangler_configs(data_dir):
'ala': [
dict(
wrangler_type='AcceptedNameOccurrenceWrangler',
out_map_filename=os.path.join(
data_dir,
'name_map/croc_name_map.json'
)
name_map=os.path.join(data_dir, 'name_map/croc_name_map.json'),
),
dict(
wrangler_type='AttributeModifierWrangler',
Expand All @@ -122,11 +116,17 @@ def wrangler_configs(data_dir):


# .....................................................................................
def test_instructions_python(tutorial_data_dir, generate_temp_filename, temp_directory):
def test_instructions_python(
tutorial_data_dir,
generate_temp_filename,
temp_directory
):
"""Test the python instructions.
Args:
tutorial_data_dir (pytest.Fixture): The tutorial data directory.
generate_temp_filename (pytest.Fixture): A fixture to generate filenames.
temp_directory (pytest.Fixture): A temporary directory to write outputs.
"""
gbif_dwca_filename = os.path.join(tutorial_data_dir, 'occurrence/gbif.zip')
idigbio_dwca_filename = os.path.join(tutorial_data_dir, 'occurrence/idigbio.zip')
Expand Down Expand Up @@ -169,10 +169,17 @@ def test_instructions_python(tutorial_data_dir, generate_temp_filename, temp_dir
writer_filename_func,
write_fields=write_fields,
) as occurrence_processor:
for reader, wranglers in readers_and_wranglers[1:2]:
for reader, wranglers in readers_and_wranglers[2:]:
occurrence_processor.process_reader(reader, wranglers)
occurrence_processor.write_species_list(species_list_filename)

# Check the outputs
_validate_outputs(
species_list_filename,
out_dir,
        species_name_map
)


# .....................................................................................
def test_instructions_console_script(
Expand Down Expand Up @@ -216,7 +223,7 @@ def test_instructions_console_script(
)

script_args = [
f'--species_list_fileanme={species_list_filename}',
f'--species_list_filename={species_list_filename}',
'--dwca',
gbif_dwca_filename,
gbif_wranglers_filename,
Expand All @@ -236,3 +243,40 @@ def test_instructions_console_script(
'lmpy.tools.split_occurrence_data',
script_args
)

# Check the outputs
_validate_outputs(
species_list_filename,
output_dir,
os.path.join(tutorial_data_dir, 'name_map/croc_name_map.json')
)


# .....................................................................................
def _validate_outputs(species_list_filename, output_dir, accepted_names_filename):
    """Validate outputs to ensure they are what we expect.

    Args:
        species_list_filename (str): File containing species seen.
        output_dir (str): Directory where outputs are stored.
        accepted_names_filename (str): File containing accepted names mapping.
    """
    # Collect the lower-cased accepted names, dropping unresolved (None) entries.
    with open(accepted_names_filename, mode='rt') as in_species:
        name_map = json.load(in_species)
    accepted_names = [
        accepted.lower() for accepted in name_map.values() if accepted is not None
    ]

    # Every species in the list must have a non-empty CSV of accepted points.
    with open(species_list_filename, mode='rt') as species_list_in:
        for raw_line in species_list_in:
            species = raw_line.strip()
            assert species.lower() not in ['null', 'none']
            species_filename = os.path.join(output_dir, f'{species}.csv')
            assert os.path.exists(species_filename)
            num_points = 0
            with PointCsvReader(species_filename, 'species_name', 'x', 'y') as reader:
                for point_group in reader:
                    for pt in point_group:
                        num_points += 1
                        # Each point's resolved name must be an accepted name.
                        assert pt.species_name.lower() in accepted_names
            assert num_points > 0

0 comments on commit 3a92489

Please sign in to comment.