From 5ad13d9dd40e593675f941270dbcb2616e2a8114 Mon Sep 17 00:00:00 2001 From: Konstantin Stadler Date: Thu, 29 Aug 2024 17:17:59 +0200 Subject: [PATCH] with first draft of spreading to multiple index --- pymrio/tools/ioutil.py | 44 +++++++++++++++++++++----- tests/test_util.py | 70 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 104 insertions(+), 10 deletions(-) diff --git a/pymrio/tools/ioutil.py b/pymrio/tools/ioutil.py index 6f22af4c..d06918be 100644 --- a/pymrio/tools/ioutil.py +++ b/pymrio/tools/ioutil.py @@ -1110,7 +1110,7 @@ def convert( if isinstance(df_orig, pd.Series): df_orig = pd.DataFrame(df_orig) - # some consitency checks of arguments and restructuring if everything is ok + # some consistency checks of arguments and restructuring if everything is ok if len(bridge_columns) == 0: raise ValueError("No columns with '__' in the mapping DataFrame") for col in bridge_columns: @@ -1149,6 +1149,7 @@ def convert( res_collector = [] # loop over each new impact/characterized value + # and collect entries, multiply and rename for entry in unique_new_index: df_cur_map = df_map.loc[[entry]] collector = [] @@ -1164,32 +1165,59 @@ def convert( df_collected = pd.concat(collector, axis=0) + # renaming part, checks if the old name (bridge.orig) is in the current index + # and renames by the new one (bridge.new) + + already_renamed = dict() for bridge in bridges: + # encountering a bridge with the same orig name but which should + # lead to two new index levels + if bridge.orig in already_renamed.keys(): + # duplicate the index level + df_collected.reset_index(level=already_renamed[bridge.orig].new, inplace=True) + df_collected[bridge.new] = df_cur_map.index.get_level_values(bridge.raw)[0] + + if df_collected.index.name is None: + df_collected.set_index(already_renamed[bridge.orig].new, drop=True, append=False, inplace=True) + else: + df_collected.set_index(already_renamed[bridge.orig].new, drop=True, append=True, inplace=True) + df_collected.set_index(bridge.new, drop=True, append=True, inplace=True) + continue + for idx_old_names in df_collected.index.names: if bridge.orig in idx_old_names: + # rename the index names if isinstance(df_collected.index, pd.MultiIndex): df_collected.index = df_collected.index.set_names( bridge.new, level=idx_old_names) else: df_collected.index = df_collected.index.set_names( bridge.new, level=None) + # rename the actual index values df_collected.reset_index(level=bridge.new, inplace=True) - for row in df_cur_map.reset_index().iterrows(): new_row_name = row[1][bridge.raw] old_row_name = row[1][bridge.orig] df_collected.loc[:, bridge.new] = df_collected.loc[ :, bridge.new ].str.replace(pat=old_row_name, repl=new_row_name, regex=True) - df_collected.set_index( - # CONT: Make test cases for renaming/chacterization of a df without a multiindex # CONT: Make a test case/method where a matching line gets extended into more index columns - bridge.new, drop=True, append=True, inplace=True - ) + # CONT: Ensure that the spread keeps the order as in the original mapping + + # put the index back + if df_collected.index.name is None: + # The case with a single index where the previous reset index + # left only a numerical index + df_collected.set_index( + bridge.new, drop=True, append=False, inplace=True + ) + else: + df_collected.set_index( + bridge.new, drop=True, append=True, inplace=True + ) + already_renamed[bridge.orig] = bridge res_collector.append( - df_collected.groupby(by=df_collected.index.names).agg(agg_func) - ) all_result = pd.concat(res_collector, axis=0) diff --git a/tests/test_util.py b/tests/test_util.py index c0029bd0..6643771c 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -354,9 +354,34 @@ def test_util_regex(): assert len(df_none_match) == 0 assert len(df_none_match_index) == 0 +def test_convert_rename_singleindex(): + """Testing the renaming of one table with a single index""" -def test_convert_rename(): - """Testing the renaming of one table""" + to_char = pd.DataFrame( + data=99.0, + index=["em1", "em2", "em3"], + columns=["r1", "r2", "r3"] + ) + to_char.index.name = "em_type" + to_char.columns.name = "reg" + + rename_bridge_simple = pd.DataFrame( + columns=["em_type", "stressor__em_type"], + data=[ + ["em1", "emission1"], + ["em2", "emission2"], + ["em3", "emission3"], + ], + ) + + renamed = convert(to_char, rename_bridge_simple) + assert all(renamed.columns == renamed.columns) + assert all(renamed.index == rename_bridge_simple["stressor__em_type"]) + + + +def test_convert_rename_multiindex(): + """Testing the renaming of one table with a multiindex""" to_char = pd.DataFrame( data=99.0, @@ -440,6 +465,47 @@ def test_convert_rename(): pdt.assert_frame_equal(char_res_keep_comp_wo_factor, char_res_keep_comp) +def test_convert_rename_spread_index(): + """Testing the renaming of one table from an index to an multiindex + + This is a specific case for the EXIOBASE to GLAM conversion, + where one stressor level need to be spread to multiple flows/classes + """ + + to_char = pd.DataFrame( + data=99.0, + index=["em1", "em2", "em3"], + columns=["r1", "r2", "r3"] + ) + to_char.index.name = "stressor" + to_char.columns.name = "reg" + + rename_bridge = pd.DataFrame( + columns=["stressor", "flow__stressor", "class__stressor", "class2__stressor"], + data=[ + ["em1", "emission1", "to_air", "to_air (unspecified)"], + ["em2", "emission2", "to_air", "to_air (specified)"], + ["em3", "emission3", "to_water", "to_water (unpecified)"],], + ) + + + rename_bridge = pd.DataFrame( + columns=["stressor", "class__stressor", "class2__stressor"], + data=[ + ["em1", "to_air", "to_air (unspecified)"], + ["em2", "to_air", "to_air (specified)"], + ["em3", "to_water", "to_water (unpecified)"],], + ) + + + renamed = convert(to_char, rename_bridge) + + assert all(renamed.columns == renamed.columns) + assert all(renamed.index == rename_bridge_simple["stressor__em_type"]) + + + + def test_convert_characterize(): """Testing the characterization of one table"""