From 80ca08c56d7cf1403c2146c4e6f3cffb496f594b Mon Sep 17 00:00:00 2001
From: blundski
Date: Tue, 9 May 2023 10:03:33 +0200
Subject: [PATCH] v4.3.0 (#146)

---
 VERSION                                           |   2 +-
 .../scripts/load_time_series_from_file.py         |  11 ++
 .../services/csv_time_series_loader.py            |   3 +
 .../services/file_time_series_loader.py           |  23 ++-
 .../services/file_time_series_parser.py           |  11 +-
 .../timeseries_with_mixedcase_columns.csv         |   6 +
 .../scripts/test_load_time_series_from_csv.py     | 153 ++++++++++++++++++
 7 files changed, 198 insertions(+), 11 deletions(-)
 create mode 100644 exabel_data_sdk/tests/resources/data/timeseries_with_mixedcase_columns.csv

diff --git a/VERSION b/VERSION
index 6aba2b2..8089590 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-4.2.0
+4.3.0
diff --git a/exabel_data_sdk/scripts/load_time_series_from_file.py b/exabel_data_sdk/scripts/load_time_series_from_file.py
index 8431741..62ca580 100644
--- a/exabel_data_sdk/scripts/load_time_series_from_file.py
+++ b/exabel_data_sdk/scripts/load_time_series_from_file.py
@@ -119,6 +119,16 @@ def __init__(self, argv: Sequence[str]):
             default=False,
             help="If set, the time series are not validated before uploading.",
         )
+        self.parser.add_argument(
+            "--case-sensitive-signals",
+            required=False,
+            action="store_true",
+            default=False,
+            help="If set, signal names are treated as case sensitive. Note that this also "
+            "disables lowercasing of other column headers, such as entities, 'date', and "
+            "'known_time'. Take care to maintain correct casing in the file when using this "
+            "option.",
+        )
 
     def run_script(self, client: ExabelClient, args: argparse.Namespace) -> None:
         try:
@@ -138,6 +148,7 @@ def run_script(self, client: ExabelClient, args: argparse.Namespace) -> None:
                 retries=args.retries,
                 batch_size=args.batch_size,
                 skip_validation=args.skip_validation,
+                case_sensitive_signals=args.case_sensitive_signals,
             )
         except FileLoadingException as e:
             print(e)
diff --git a/exabel_data_sdk/services/csv_time_series_loader.py b/exabel_data_sdk/services/csv_time_series_loader.py
index 4d5d2c3..13e10e5 100644
--- a/exabel_data_sdk/services/csv_time_series_loader.py
+++ b/exabel_data_sdk/services/csv_time_series_loader.py
@@ -37,6 +37,7 @@ def load_time_series(
         retries: int = DEFAULT_NUMBER_OF_RETRIES,
         abort_threshold: Optional[float] = 0.5,
         skip_validation: bool = False,
+        case_sensitive_signals: bool = False,
         # Deprecated arguments
         namespace: Optional[str] = None,  # pylint: disable=unused-argument
     ) -> FileLoadingResult:
@@ -69,6 +70,7 @@ def load_time_series(
             abort_threshold: the threshold for the proportion of failed requests that will cause
                 the upload to be aborted; if it is `None`, the upload is never aborted
             skip_validation: if True, the time series are not validated before uploading
+            case_sensitive_signals: if True, signals are case sensitive
         """
         results = FileTimeSeriesLoader(self._client).load_time_series(
             filename=filename,
@@ -86,6 +88,7 @@ def load_time_series(
             retries=retries,
             abort_threshold=abort_threshold,
             skip_validation=skip_validation,
+            case_sensitive_signals=case_sensitive_signals,
         )
         if len(results) != 1:
             raise ValueError("Unexpected number of results from time series loading.")
diff --git a/exabel_data_sdk/services/file_time_series_loader.py b/exabel_data_sdk/services/file_time_series_loader.py
index 2a67d26..5cdbb47 100644
--- a/exabel_data_sdk/services/file_time_series_loader.py
+++ b/exabel_data_sdk/services/file_time_series_loader.py
@@ -64,6 +64,7 @@ def load_time_series(
         abort_threshold: Optional[float] = 0.5,
         batch_size: Optional[int] = None,
         skip_validation: bool = False,
+        case_sensitive_signals: bool = False,
         return_results: bool = True,
         # Deprecated arguments
         namespace: Optional[str] = None,  # pylint: disable=unused-argument
@@ -104,6 +105,7 @@ def load_time_series(
             batch_size: the number of rows to read and upload in each batch; if not specified,
                 the entire file will be read into memory and uploaded in a single batch
             skip_validation: if True, the time series are not validated before uploading
+            case_sensitive_signals: if True, signals are case sensitive
         """
         if batch_size is not None:
             logger.info(
@@ -139,6 +141,7 @@ def load_time_series(
                 retries=retries,
                 abort_threshold=abort_threshold,
                 skip_validation=skip_validation,
+                case_sensitive_signals=case_sensitive_signals,
             )
             if return_results:
                 results.append(result)
@@ -163,6 +166,7 @@ def _load_time_series(
         retries: int = DEFAULT_NUMBER_OF_RETRIES,
         abort_threshold: Optional[float] = 0.5,
         skip_validation: bool = False,
+        case_sensitive_signals: bool = False,
     ) -> TimeSeriesFileLoadingResult:
         """
         Load time series from a parser.
@@ -204,6 +208,7 @@ def _load_time_series(
                 self._client.namespace,
                 entity_mapping,
                 entity_type=identifier_type or entity_type,
+                case_sensitive_signals=case_sensitive_signals,
             )
             break
         if parsed_file is None:
@@ -264,22 +269,26 @@ def _load_time_series(
         # Signals are reversed because we want to match the first signal returned by the API
         # lexicographically.
         all_signals = {
-            signal.name.lower(): signal
+            signal.name if case_sensitive_signals else signal.name.lower(): signal
             for signal in reversed(list(self._client.signal_api.get_signal_iterator()))
         }
         missing_signals = []
         signals_to_rename = {}
         for signal in signals:
-            lowered_signal_name = prefix + signal.lower()
-            if lowered_signal_name not in all_signals:
-                missing_signals.append(lowered_signal_name)
-                if isinstance(parsed_file, SignalNamesInRows) and signal != signal.lower():
+            signal_name = prefix + signal if case_sensitive_signals else prefix + signal.lower()
+            if signal_name not in all_signals:
+                missing_signals.append(signal_name)
+                if (
+                    not case_sensitive_signals
+                    and isinstance(parsed_file, SignalNamesInRows)
+                    and signal != signal.lower()
+                ):
                     signals_to_rename[signal] = signal.lower()
             else:
-                signal_match = all_signals[lowered_signal_name]
+                signal_match = all_signals[signal_name]
                 signal_name = signal_match.name.split(".")[-1]
-                if lowered_signal_name != signal_match.name or (
+                if signal_name != signal_match.name or (
                     isinstance(parsed_file, SignalNamesInRows) and signal != signal_name
                 ):
                     signals_to_rename[signal] = signal_name
diff --git a/exabel_data_sdk/services/file_time_series_parser.py b/exabel_data_sdk/services/file_time_series_parser.py
index be20311..ce627e7 100644
--- a/exabel_data_sdk/services/file_time_series_parser.py
+++ b/exabel_data_sdk/services/file_time_series_parser.py
@@ -104,7 +104,10 @@ def sheet_name(self) -> Optional[str]:
         return str(self.worksheet) if self.worksheet is not None else None
 
     def parse_file(
-        self, nrows: Optional[int] = None, header: Optional[Sequence[int]] = None
+        self,
+        nrows: Optional[int] = None,
+        header: Optional[Sequence[int]] = None,
+        case_sensitive_signals: bool = False,
     ) -> pd.DataFrame:
         """Parse the file as a Pandas data frame."""
         extension = Path(self.filename).suffix.lower()
@@ -136,7 +139,7 @@ def parse_file(
             )
         else:
             raise FileLoadingException(f"Unknown file extension '{extension}'")
-        if not df.empty:
+        if not df.empty and not case_sensitive_signals:
             df = df.rename(lambda n: n.lower(), axis="columns", level=0)
         return df
 
@@ -216,9 +219,10 @@ def from_file(
         namespace: str,
         entity_mapping: Optional[Mapping[str, Mapping[str, str]]] = None,
         entity_type: Optional[str] = None,
+        case_sensitive_signals: bool = False,
     ) -> "ParsedTimeSeriesFile":
         """Read a file and construct a parsed file from the contents."""
-        data = file_parser.parse_file()
+        data = file_parser.parse_file(case_sensitive_signals=case_sensitive_signals)
         return cls.from_data_frame(data, entity_api, namespace, entity_mapping, entity_type)
 
     @classmethod
@@ -710,6 +714,7 @@ def from_file(
         namespace: str,
         entity_mapping: Optional[Mapping[str, Mapping[str, str]]] = None,
         entity_type: Optional[str] = None,
+        case_sensitive_signals: bool = False,
     ) -> "ParsedTimeSeriesFile":
         """Read a file and construct a new parser from the contents of that file."""
         if entity_type is not None:
diff --git a/exabel_data_sdk/tests/resources/data/timeseries_with_mixedcase_columns.csv b/exabel_data_sdk/tests/resources/data/timeseries_with_mixedcase_columns.csv
new file mode 100644
index 0000000..9bb0220
--- /dev/null
+++ b/exabel_data_sdk/tests/resources/data/timeseries_with_mixedcase_columns.csv
@@ -0,0 +1,6 @@
+brand;date;known_time;Signal1
+A_brand;2021-01-01;2021-01-01;1
+A_brand;2021-01-02;2021-01-02;2
+A_brand;2021-01-03;2021-01-03;3
+A_brand;2021-01-04;2021-01-04;4
+A_brand;2021-01-05;2021-01-05;5
diff --git a/exabel_data_sdk/tests/scripts/test_load_time_series_from_csv.py b/exabel_data_sdk/tests/scripts/test_load_time_series_from_csv.py
index 98420be..2cd3923 100644
--- a/exabel_data_sdk/tests/scripts/test_load_time_series_from_csv.py
+++ b/exabel_data_sdk/tests/scripts/test_load_time_series_from_csv.py
@@ -698,6 +698,156 @@ def test_load_time_series_with_uppercase_signals_existing_and_uppercase_entity_t
             check_freq=False,
         )
 
+    def test_load_time_series_with_uppercase_signals_not_existing_case_sensitive(self):
+        args = common_args + [
+            "--filename",
+            "./exabel_data_sdk/tests/resources/data/timeseries_with_mixedcase_columns.csv",
+            "--create-missing-signals",
+            "--case-sensitive-signals",
+        ]
+        script = LoadTimeSeriesFromFile(args)
+        self.client.signal_api.get_signal.return_value = None
+        self.client.signal_api.get_signal_iterator.side_effect = lambda *_: PagingResult([], "", 0)
+        self.client.entity_api.get_entity_type_iterator.side_effect = (
+            self._list_entity_types_uppercase
+        )
+
+        script.run_script(self.client, script.parse_arguments())
+
+        call_args_list = self.client.time_series_api.bulk_upsert_time_series.call_args_list
+        self.assertEqual(1, len(call_args_list))
+        series = call_args_list[0][0][0]
+        self.assertEqual(1, len(series))
+        call_args_list_create_signal = self.client.signal_api.create_signal.call_args_list
+        self.assertEqual(1, len(call_args_list_create_signal))
+        signal = call_args_list_create_signal[0][0][0]
+        self.assertEqual("signals/ns.Signal1", signal.name)
+
+        pd.testing.assert_series_equal(
+            pd.Series(
+                [1, 2, 3, 4, 5],
+                pd.MultiIndex.from_arrays(
+                    [
+                        pd.date_range("2021-01-01", periods=5, tz=tz.tzutc()),
+                        pd.date_range("2021-01-01", periods=5, tz=tz.tzutc()),
+                    ]
+                ),
+                name="entityTypes/BRAND/entities/ns.A_brand/signals/ns.Signal1",
+            ),
+            series[0],
+            check_freq=False,
+        )
+
+    def test_load_time_series_with_uppercase_signals_and_lower_case_existing_case_sensitive(self):
+        args = common_args + [
+            "--filename",
+            "./exabel_data_sdk/tests/resources/data/timeseries_with_mixedcase_columns.csv",
+            "--create-missing-signals",
+            "--case-sensitive-signals",
+        ]
+        script = LoadTimeSeriesFromFile(args)
+        self.client.signal_api.get_signal_iterator.side_effect = self._list_signal
+        self.client.signal_api.get_signal.return_value = None
+        self.client.signal_api.get_signal_iterator.side_effect = lambda *_: PagingResult([], "", 0)
+        self.client.entity_api.get_entity_type_iterator.side_effect = (
+            self._list_entity_types_uppercase
+        )
+
+        script.run_script(self.client, script.parse_arguments())
+
+        call_args_list = self.client.time_series_api.bulk_upsert_time_series.call_args_list
+        self.assertEqual(1, len(call_args_list))
+        series = call_args_list[0][0][0]
+        self.assertEqual(1, len(series))
+        call_args_list_create_signal = self.client.signal_api.create_signal.call_args_list
+        self.assertEqual(1, len(call_args_list_create_signal))
+        signal = call_args_list_create_signal[0][0][0]
+        self.assertEqual("signals/ns.Signal1", signal.name)
+
+        pd.testing.assert_series_equal(
+            pd.Series(
+                [1, 2, 3, 4, 5],
+                pd.MultiIndex.from_arrays(
+                    [
+                        pd.date_range("2021-01-01", periods=5, tz=tz.tzutc()),
+                        pd.date_range("2021-01-01", periods=5, tz=tz.tzutc()),
+                    ]
+                ),
+                name="entityTypes/BRAND/entities/ns.A_brand/signals/ns.Signal1",
+            ),
+            series[0],
+            check_freq=False,
+        )
+
+    def test_load_time_series_with_uppercase_signals_existing_case_sensitive(self):
+        args = common_args + [
+            "--filename",
+            "./exabel_data_sdk/tests/resources/data/timeseries_with_mixedcase_columns.csv",
+            "--create-missing-signals",
+            "--case-sensitive-signals",
+        ]
+        script = LoadTimeSeriesFromFile(args)
+        self.client.signal_api.get_signal_iterator.side_effect = self._list_signal_mixedcase
+        self.client.entity_api.get_entity_type_iterator.side_effect = self._list_entity_types
+        script.run_script(self.client, script.parse_arguments())
+
+        call_args_list = self.client.time_series_api.bulk_upsert_time_series.call_args_list
+        self.assertEqual(1, len(call_args_list))
+        self.assertEqual(0, len(self.client.signal_api.create_signal.call_args_list))
+        series = call_args_list[0][0][0]
+        self.assertEqual(1, len(series))
+
+        pd.testing.assert_series_equal(
+            pd.Series(
+                [1, 2, 3, 4, 5],
+                pd.MultiIndex.from_arrays(
+                    [
+                        pd.date_range("2021-01-01", periods=5, tz=tz.tzutc()),
+                        pd.date_range("2021-01-01", periods=5, tz=tz.tzutc()),
+                    ]
+                ),
+                name="entityTypes/brand/entities/ns.A_brand/signals/ns.Signal1",
+            ),
+            series[0],
+            check_freq=False,
+        )
+
+    def test_load_time_series_with_mixedcase_signals_existing_and_entity_type_nonexisting_cs(
+        self,
+    ):
+        args = common_args + [
+            "--filename",
+            "./exabel_data_sdk/tests/resources/data/timeseries_with_mixedcase_columns.csv",
+            "--create-missing-signals",
+            "--case-sensitive-signals",
+        ]
+        script = LoadTimeSeriesFromFile(args)
+        self.client.signal_api.get_signal_iterator.side_effect = self._list_signal_mixedcase
+        self.client.entity_api.get_entity_type_iterator.side_effect = self._list_entity_types
+
+        script.run_script(self.client, script.parse_arguments())
+
+        call_args_list = self.client.time_series_api.bulk_upsert_time_series.call_args_list
+        self.assertEqual(1, len(call_args_list))
+        self.assertEqual(0, len(self.client.signal_api.create_signal.call_args_list))
+        series = call_args_list[0][0][0]
+        self.assertEqual(1, len(series))
+
+        pd.testing.assert_series_equal(
+            pd.Series(
+                [1, 2, 3, 4, 5],
+                pd.MultiIndex.from_arrays(
+                    [
+                        pd.date_range("2021-01-01", periods=5, tz=tz.tzutc()),
+                        pd.date_range("2021-01-01", periods=5, tz=tz.tzutc()),
+                    ]
+                ),
+                name="entityTypes/brand/entities/ns.A_brand/signals/ns.Signal1",
+            ),
+            series[0],
+            check_freq=False,
+        )
+
     def _list_signal(self):
         return iter(
             [
@@ -709,6 +859,9 @@ def _list_signal(self):
 
     def _list_signal_uppercase(self):
         return iter([Signal("signals/ns.SIGNAL1", "The Signal", "A description of the signal")])
 
+    def _list_signal_mixedcase(self):
+        return iter([Signal("signals/ns.Signal1", "The Signal", "A description of the signal")])
+
     def _list_entity_types(self):
         return iter([EntityType("entityTypes/brand", "", "", False)])
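
Usage note (editorial, not part of the patch): a minimal sketch of how the new option might be
used once this change is applied. Only the `case_sensitive_signals` parameter and the
`--case-sensitive-signals` flag are introduced by this patch; the client construction, API key,
and file path below are illustrative assumptions.

    from exabel_data_sdk import ExabelClient
    from exabel_data_sdk.services.csv_time_series_loader import CsvTimeSeriesLoader

    # Hypothetical setup: an authenticated client (the API key is a placeholder).
    client = ExabelClient(api_key="my-api-key")

    # Load a CSV whose signal column is "Signal1". With case_sensitive_signals=True the
    # column headers are not lowercased, so the signal is created/matched as "Signal1"
    # rather than "signal1"; the entity, "date", and "known_time" headers must then also
    # match their casing in the file exactly.
    CsvTimeSeriesLoader(client).load_time_series(
        filename="timeseries_with_mixedcase_columns.csv",
        case_sensitive_signals=True,
    )

The command-line equivalent would pass --case-sensitive-signals to the
load_time_series_from_file script, as the new tests above do.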