Skip to content

Commit

Permalink
support for non-multiple array
Browse files Browse the repository at this point in the history
  • Loading branch information
Maria Gorodetski committed Sep 3, 2024
1 parent 2e24587 commit ded1adc
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 48 deletions.
22 changes: 11 additions & 11 deletions nbs/01_basic_plots.ipynb

Large diffs are not rendered by default.

38 changes: 11 additions & 27 deletions nbs/13_questionnaire_handler.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"metadata": {},
"outputs": [],
"source": [
"#| default_exp questionnaires_handler\n"
"#| default_exp questionnaires_handler"
]
},
{
Expand Down Expand Up @@ -237,15 +237,18 @@
" df_coding = df.copy()\n",
" if preferred_language == 'coding':\n",
" for column in fields_for_translation:\n",
" data_coding = dict_df.loc[column, 'data_coding'] # This actually should happen\n",
" data_coding = dict_df.loc[column, 'data_coding']\n",
" if isinstance(data_coding, pd.Series): # In case of multiple entries, take the first one\n",
" data_coding = data_coding.iloc[0]\n",
" \n",
" if pd.notna(data_coding):\n",
" if dict_df.loc[column, 'array'] == 'Multiple':\n",
" try:\n",
" df_coding[column] = df[column].apply(lambda x : pd.Series(x).astype('Int16') if isinstance(x, np.ndarray) else x)\n",
" except: \n",
" print(column)\n",
" field_array = dict_df.loc[column, 'array']\n",
" if isinstance(field_array, pd.Series):\n",
" field_array = field_array.iloc[0]\n",
" if field_array == 'Multiple':\n",
" continue\n",
" else: \n",
" df_coding[column] = df[column].astype('Int16')\n",
" df_coding[column] = df[column].astype('Int16', errors='ignore')\n",
" dict_df.loc[column, 'pandas_dtype'] = 'Int16'\n",
" \n",
" return df_coding\n",
Expand All @@ -269,7 +272,6 @@
" df = convert_codings_to_int(df=df, dict_df=dict_df, fields_for_translation=fields_for_translation, \n",
" preferred_language=transform_to)\n",
" \n",
" print('#1', df.info())\n",
" return df\n",
" \n",
" \n",
Expand All @@ -292,33 +294,15 @@
" \n",
" transformed_df = convert_codings_to_int(df=transformed_df, dict_df=dict_df, fields_for_translation=fields_for_translation, \n",
" preferred_language=transform_to)\n",
" print('#2', df.info())\n",
" return transformed_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# dataset = 'sleep'\n",
"# input_path = f'/home/ec2-user/workspace/pheno-utils/nbs/examples/{dataset}/metadata/{dataset}_data_dictionary.csv'\n",
"# df = pd.read_csv(input_path)\n",
"# df['data_coding'] = None\n",
"# df.to_csv(input_path, index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "python3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.5"
}
},
"nbformat": 4,
Expand Down
2 changes: 1 addition & 1 deletion pheno_utils/basic_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def get_gender_indices(df, gender='male', gender_col='sex'):
raise ValueError(f"Column '{gender_col}' does not exist in the DataFrame")

# Check the data type of the gender column
if df[gender_col].dtype == 'int64':
if pd.api.types.is_integer_dtype(df[gender_col]):
if gender == 'male':
indices = df[df[gender_col] == 1].index
else:
Expand Down
19 changes: 10 additions & 9 deletions pheno_utils/questionnaires_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,15 +180,18 @@ def convert_codings_to_int(df: pd.DataFrame, dict_df: pd.DataFrame, fields_for_t
df_coding = df.copy()
if preferred_language == 'coding':
for column in fields_for_translation:
data_coding = dict_df.loc[column, 'data_coding'] # This actually should happen
data_coding = dict_df.loc[column, 'data_coding']
if isinstance(data_coding, pd.Series): # In case of multiple entries, take the first one
data_coding = data_coding.iloc[0]

if pd.notna(data_coding):
if dict_df.loc[column, 'array'] == 'Multiple':
try:
df_coding[column] = df[column].apply(lambda x : pd.Series(x).astype('Int16') if isinstance(x, np.ndarray) else x)
except:
print(column)
field_array = dict_df.loc[column, 'array']
if isinstance(field_array, pd.Series):
field_array = field_array.iloc[0]
if field_array == 'Multiple':
continue
else:
df_coding[column] = df[column].astype('Int16')
df_coding[column] = df[column].astype('Int16', errors='ignore')
dict_df.loc[column, 'pandas_dtype'] = 'Int16'

return df_coding
Expand All @@ -212,7 +215,6 @@ def transform_dataframe(
df = convert_codings_to_int(df=df, dict_df=dict_df, fields_for_translation=fields_for_translation,
preferred_language=transform_to)

print('#1', df.info())
return df


Expand All @@ -235,5 +237,4 @@ def transform_dataframe(

transformed_df = convert_codings_to_int(df=transformed_df, dict_df=dict_df, fields_for_translation=fields_for_translation,
preferred_language=transform_to)
print('#2', df.info())
return transformed_df

0 comments on commit ded1adc

Please sign in to comment.