Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Coding fix #32

Merged
merged 12 commits into from
Nov 12, 2024
22 changes: 11 additions & 11 deletions nbs/01_basic_plots.ipynb

Large diffs are not rendered by default.

91 changes: 55 additions & 36 deletions nbs/13_questionnaire_handler.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"metadata": {},
"outputs": [],
"source": [
"#| default_exp questionnaires_handler\n"
"#| default_exp questionnaires_handler"
]
},
{
Expand Down Expand Up @@ -204,54 +204,77 @@
" else:\n",
" code_string = convert_to_string(dict_df.loc[tab_field_name][\"data_coding\"])\n",
" \n",
" #getting the data coding df from the large data coding csv\n",
" # Getting the data coding df from the large data coding csv\n",
" code_df = mapping_df[mapping_df[\"code_number\"] == code_string].copy()\n",
" \n",
" #Make sure no leading 0s for coding values\n",
" # Make sure no leading 0s for coding values\n",
" code_df[\"coding\"] = code_df[\"coding\"].apply(convert_to_string)\n",
" \n",
" mapping_dict = dict(zip(code_df[code_from].astype(str), code_df[code_to]))\n",
" \n",
" \n",
"    # Adding a fail-safe in case older dictionaries don't have a field type. TODO: potentially remove once older dictionaries are updated\n",
" if 'field_type' in dict_df.columns:\n",
" field_type = dict_df.loc[tab_field_name]['field_type']\n",
" if isinstance(field_type, pd.Series):\n",
" if field_type.nunique() == 1:\n",
" field_type = field_type.iloc[0]\n",
" else:\n",
" warnings.warn(f\"tabular field {tab_field_name} is used in 2 columns and have conflicting field types,please check and update dictionary. This field has not be converted.\")\n",
" return orig_answer\n",
" \n",
" if field_type == 'Categorical (multiple)': \n",
" normalise_answer = normalize_answers(orig_answer, field_type)\n",
" check_invalid_values(normalise_answer , code_df)\n",
" transformed_answer = normalise_answer.apply(replace_values, mapping_dict = mapping_dict)\n",
" field_type = dict_df.loc[tab_field_name]['field_type']\n",
" if isinstance(field_type, pd.Series):\n",
" if field_type.nunique() == 1:\n",
" field_type = field_type.iloc[0]\n",
" else:\n",
"        # If categorical single\n",
" normalized_answer = normalize_answers(orig_answer, field_type)\n",
" check_invalid_values(normalized_answer, code_df)\n",
" transformed_answer = normalized_answer.replace(mapping_dict)\n",
" transformed_answer = transformed_answer.astype(\"category\")\n",
"\n",
" return transformed_answer\n",
" warnings.warn(f\"tabular field {tab_field_name} is used in 2 columns and have conflicting field types,please check and update dictionary. This field has not be converted.\")\n",
" return orig_answer\n",
" \n",
" if field_type == 'Categorical (multiple)': \n",
" normalise_answer = normalize_answers(orig_answer, field_type)\n",
" check_invalid_values(normalise_answer , code_df)\n",
" transformed_answer = normalise_answer.apply(replace_values, mapping_dict = mapping_dict)\n",
" else:\n",
" return orig_answer\n",
"\n",
" normalized_answer = normalize_answers(orig_answer, field_type)\n",
" check_invalid_values(normalized_answer, code_df)\n",
" transformed_answer = normalized_answer.replace(mapping_dict)\n",
" transformed_answer = transformed_answer.astype(\"category\")\n",
"\n",
" return transformed_answer\n",
" \n",
"def convert_codings_to_int(df: pd.DataFrame, dict_df: pd.DataFrame, fields_for_translation: list, preferred_language: str) -> pd.DataFrame:\n",
" df_coding = df.copy()\n",
" if preferred_language == 'coding':\n",
" for column in fields_for_translation:\n",
" data_coding = dict_df.loc[column, 'data_coding']\n",
" if isinstance(data_coding, pd.Series): # In case of multiple entries, take the first one\n",
" data_coding = data_coding.iloc[0]\n",
" \n",
" if pd.notna(data_coding):\n",
" field_array = dict_df.loc[column, 'array']\n",
" if isinstance(field_array, pd.Series):\n",
" field_array = field_array.iloc[0]\n",
" if field_array == 'Multiple':\n",
" continue\n",
" else: \n",
" df_coding[column] = df[column].astype('Int16', errors='ignore')\n",
" dict_df.loc[column, 'pandas_dtype'] = 'Int16'\n",
" \n",
" return df_coding\n",
" \n",
" \n",
" \n",
"def transform_dataframe(\n",
" df: pd.DataFrame,\n",
" transform_from: str,\n",
" transform_to: str,\n",
" dict_df: pd.DataFrame,\n",
" mapping_df: pd.DataFrame,\n",
") -> pd.DataFrame:\n",
" if 'data_coding' not in dict_df.columns or transform_from == transform_to:\n",
" return df\n",
" \n",
" # Only fields with a code in data_coding property will be transformed\n",
" fields_for_translation = dict_df[pd.notna(dict_df.data_coding)].index.intersection(df.columns)\n",
" if len(fields_for_translation) == 0:\n",
" if len(fields_for_translation) == 0: # No fields with data_coding code\n",
" return df\n",
" \n",
" if transform_from == transform_to: \n",
" df = convert_codings_to_int(df=df, dict_df=dict_df, fields_for_translation=fields_for_translation, \n",
" preferred_language=transform_to)\n",
" \n",
" return df\n",
" \n",
" \n",
" transformed_df = df.copy()\n",
" for column in fields_for_translation:\n",
" data_coding = dict_df.loc[column, 'data_coding']\n",
Expand All @@ -268,15 +291,11 @@
" dict_df,\n",
" mapping_df\n",
" )\n",
" \n",
" transformed_df = convert_codings_to_int(df=transformed_df, dict_df=dict_df, fields_for_translation=fields_for_translation, \n",
" preferred_language=transform_to)\n",
" return transformed_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
Loading
Loading