diff --git a/lmpy/point.py b/lmpy/point.py index 44c1e2a5..692f3120 100644 --- a/lmpy/point.py +++ b/lmpy/point.py @@ -174,6 +174,7 @@ def __init__( y_field, geopoint=None, group_field='species_name', + encoding='utf8', ): """Constructor for a Point CSV retriever. @@ -188,6 +189,7 @@ def __init__( geopoint (:obj:`str`): The field name of the column containing geopoint data. group_field (:obj:`str`): The name of the field to use for grouping points. + encoding (str): The encoding to use when opening the file. """ self.filename = filename self.file = None @@ -199,6 +201,7 @@ def __init__( self.group_field = group_field self._next_points = [] self._curr_val = None + self.encoding = encoding # ....................... def __enter__(self): @@ -275,7 +278,7 @@ def __next__(self): # ....................... def open(self): """Open the file and initialize.""" - self.file = open(self.filename, 'r') + self.file = open(self.filename, 'r', encoding=self.encoding) temp_lines = [] try: for _ in range(3): @@ -494,8 +497,10 @@ def __next__(self): Raises: StopIteration: Raised when there are no additional objects. 
""" - for point_row in self.reader: + more_rows = True + while more_rows: try: + point_row = next(self.reader) point_dict = { term: self.fields[term](point_row) for term in self.fields.keys() } @@ -514,10 +519,16 @@ def __next__(self): return tmp self._curr_val = test_val self._next_points.append(pt) + except IndexError: + pass except KeyError: pass except TypeError: pass + except StopIteration: + more_rows = False + except csv.Error: + pass if self._next_points: tmp = self._next_points diff --git a/lmpy/tools/split_occurrence_data.py b/lmpy/tools/split_occurrence_data.py index 0d187a19..6ed54161 100644 --- a/lmpy/tools/split_occurrence_data.py +++ b/lmpy/tools/split_occurrence_data.py @@ -103,6 +103,10 @@ def cli(): parser = build_parser() args = _process_arguments(parser, 'config_file') + # Default key field is 'species_name' + if args.key_field is None: + args.key_field = ['species_name'] + # Establish functions for getting writer key and filename writer_key_func = get_writer_key_from_fields_func(*tuple(args.key_field)) writer_filename_func = get_writer_filename_func(args.out_dir) diff --git a/sample_data/name_map/croc_name_map.json b/sample_data/name_map/croc_name_map.json index 9e26dfee..170c1c7f 100644 --- a/sample_data/name_map/croc_name_map.json +++ b/sample_data/name_map/croc_name_map.json @@ -1 +1,48 @@ -{} \ No newline at end of file +{ + "Crocodylus porosus": "Crocodylus porosus", + "Crocodylus johnstoni": "Crocodylus johnstoni", + "Crocodylus": "Crocodylus", + "Crocodylus niloticus cowiei (smith in hewitt, 1937)": "Crocodylus niloticus cowiei", + "Crocodylus niloticus laurenti, 1768": "Crocodylus niloticus", + "Crocodylus acutus (cuvier, 1807)": "Crocodylus acutus", + "Crocodylus siamensis schneider, 1801": "Crocodylus siamensis", + "Crocodylus moreletii (dum\u00e9ril & bibron, 1851)": "Crocodylus moreletii", + "Crocodylus porosus schneider, 1801": "Crocodylus porosus", + "Crocodylus palustris (lesson, 1831)": "Crocodylus palustris", + "Crocodylus 
suchus geoffroy saint-hilaire, 1807": "Crocodylus suchus", + "Crocodylus johnsoni krefft, 1873": "Crocodylus johnsoni", + "Crocodylus mindorensis schmidt, 1935": "Crocodylus mindorensis", + "Crocodylus niloticus africanus laurenti, 1768": "Crocodylus niloticus", + "Crocodylus johnstoni krefft, 1873": "Crocodylus johnstoni", + "Crocodylus niloticus pauciscutatus deraniyagala, 1948": "Crocodylus niloticus", + "Crocodylus laurenti, 1768": "Crocodylus", + "Crocodylus intermedius (graves, 1819)": "Crocodylus intermedius", + "Crocodylus rhombifer (cuvier, 1807)": "Crocodylus rhombifer", + "Crocodylus niloticus niloticus": "Crocodylus niloticus niloticus", + "Crocodylus niloticus chamses bory de saint vincent, 1824": "Crocodylus niloticus", + "Crocodylus halli murray, russo, zorilla & mcmahan, 2019": "Crocodylus halli", + "Bold:aac4128": "BOLD:AAC4128", + "Bold:aar5109": "BOLD:AAR5109", + "Bold:adc8567": "BOLD:ADC8567", + "Bold:aac4127": "BOLD:AAC4127", + "Crocodylus novaeguineae schmidt, 1928": "Crocodylus novaeguineae", + "Crocodilus antiquus leidy, 1852": "Crocodilus antiquus", + "Crocodilus ziphodon marsh, 1871": "Crocodilus ziphodon", + "Crocodylus greenwoodi suneja et al., 1977": "Crocodylus greenwoodi", + "Crocodylus falconensis scheyer, aguilera, delfino, fortier, carlini, s\u00e1nchez, carrillo-brice\u00f1o, quiroz & s\u00e1nchez-villagra, 2013": "Crocodylus falconensis", + "Crocodylus checchiai maccagno, 1947": "Crocodylus checchiai", + "Crocodylus megarhinus (andrews, 1905)": "Crocodylus megarhinus", + "Crocodylus anthropophagus brochu, njau, blumenschine & densmore, 2010": "Crocodylus anthropophagus", + "Crocodylus palaeindicus falconer, 1859": "Crocodylus palaeindicus", + "Crocodylus thorbjarnarsoni brochu & storrs, 2012": "Crocodylus thorbjarnarsoni", + "Crocodylus gariepensis pickford, 2003": "Crocodylus gariepensis", + "Crocodylus palustris palustris": "Crocodylus palustris palustris", + "Crocodylus novaeguineae mindorensis schmidt, 1935": "Crocodylus 
novaeguineae", + "Crocodilus laurenti, 1768": "Crocodilus", + "Crocodylus novaeguineae novaeguineae": "Crocodylus novaeguineae novaeguineae", + "Crocodylus palustris kimbula deraniyagala, 1936": "Crocodylus palustris", + "Crocodilus americanus laurenti, 1768": "Crocodilus americanus", + "Crocodylus raninus ross, 1990": "Crocodylus raninus", + "Crocodylus americanus laurenti, 1768": "Crocodylus americanus", + "Crocodilus porosus schneider, 1801": null +} diff --git a/tests/conftest.py b/tests/conftest.py index c3cb19f4..bd3fb959 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -282,7 +282,7 @@ def get_script_runner(console_script, script_module, args): cmd_args.extend(args) # Run command - subprocess.run(cmd_args) + subprocess.run(cmd_args, capture_output=True, check=True) return get_script_runner diff --git a/tests/test_tutorials/test_aggregate_occurrence_data.py b/tests/test_tutorials/test_aggregate_occurrence_data.py index 9ca75d9f..62420d5c 100644 --- a/tests/test_tutorials/test_aggregate_occurrence_data.py +++ b/tests/test_tutorials/test_aggregate_occurrence_data.py @@ -25,10 +25,7 @@ def wrangler_configs(data_dir): 'gbif': [ dict( wrangler_type='AcceptedNameOccurrenceWrangler', - name_resolver='gbif', - out_map_filename=os.path.join( - data_dir, 'name_map/croc_name_map.json' - ) + name_map=os.path.join(data_dir, 'name_map/croc_name_map.json'), ), dict( wrangler_type='AttributeFilterWrangler', @@ -56,7 +53,7 @@ def wrangler_configs(data_dir): 'idigbio': [ dict( wrangler_type='AcceptedNameOccurrenceWrangler', - name_map=os.path.join(data_dir, 'name_map/croc_name_map.json') + name_map=os.path.join(data_dir, 'name_map/croc_name_map.json'), ), dict( wrangler_type='AttributeFilterWrangler', @@ -98,10 +95,7 @@ def wrangler_configs(data_dir): 'ala': [ dict( wrangler_type='AcceptedNameOccurrenceWrangler', - out_map_filename=os.path.join( - data_dir, - 'name_map/croc_name_map.json' - ) + name_map=os.path.join(data_dir, 'name_map/croc_name_map.json'), ), dict( 
wrangler_type='AttributeModifierWrangler', @@ -122,11 +116,17 @@ # ..................................................................................... -def test_instructions_python(tutorial_data_dir, generate_temp_filename, temp_directory): +def test_instructions_python( + tutorial_data_dir, + generate_temp_filename, + temp_directory +): """Test the python instructions. Args: tutorial_data_dir (pytest.Fixture): The tutorial data directory. + generate_temp_filename (pytest.Fixture): A fixture to generate filenames. + temp_directory (pytest.Fixture): A temporary directory to write outputs. """ gbif_dwca_filename = os.path.join(tutorial_data_dir, 'occurrence/gbif.zip') idigbio_dwca_filename = os.path.join(tutorial_data_dir, 'occurrence/idigbio.zip') @@ -169,10 +169,17 @@ def test_instructions_python(tutorial_data_dir, generate_temp_filename, temp_dir writer_filename_func, write_fields=write_fields, ) as occurrence_processor: - for reader, wranglers in readers_and_wranglers[1:2]: + for reader, wranglers in readers_and_wranglers[2:]: occurrence_processor.process_reader(reader, wranglers) occurrence_processor.write_species_list(species_list_filename) + # Check the outputs + _validate_outputs( + species_list_filename, + out_dir, + species_name_map + ) + # .....................................................................................
def test_instructions_console_script( @@ -216,7 +223,7 @@ def test_instructions_console_script( ) script_args = [ - f'--species_list_fileanme={species_list_filename}', + f'--species_list_filename={species_list_filename}', '--dwca', gbif_dwca_filename, gbif_wranglers_filename, @@ -236,3 +243,40 @@ def test_instructions_console_script( 'lmpy.tools.split_occurrence_data', script_args ) + + # Check the outputs + _validate_outputs( + species_list_filename, + output_dir, + os.path.join(tutorial_data_dir, 'name_map/croc_name_map.json') + ) + + +# ..................................................................................... +def _validate_outputs(species_list_filename, output_dir, accepted_names_filename): + """Validate outputs to ensure they are what we expect. + + Args: + species_list_filename (str): File containing species seen. + output_dir (str): Directory where outputs are stored. + accepted_names_filename (str): File containing accepted names mapping. + """ + # Load accepted names + with open(accepted_names_filename, mode='rt') as in_species: + accepted_names = [ + val.lower() for val in json.load(in_species).values() if val is not None + ] + + with open(species_list_filename, mode='rt') as species_list_in: + for line in species_list_in: + species = line.strip() + assert species.lower() not in ['null', 'none'] + species_filename = os.path.join(output_dir, f'{species}.csv') + assert os.path.exists(species_filename) + sp_point_count = 0 + with PointCsvReader(species_filename, 'species_name', 'x', 'y') as reader: + for points in reader: + for point in points: + sp_point_count += 1 + assert point.species_name.lower() in accepted_names + assert sp_point_count > 0