PhenoAI · alondmnt · Nov 12, 2024 · Aug 14, 2024 · Sep 3, 2024 · Nov 8, 2024
diff --git a/nbs/01_basic_plots.ipynb b/nbs/01_basic_plots.ipynb
diff --git a/nbs/13_questionnaire_handler.ipynb b/nbs/13_questionnaire_handler.ipynb
@@ -18,7 +18,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#| default_exp questionnaires_handler\n"
+    "#| default_exp questionnaires_handler"
    ]
   },
   {
@@ -204,54 +204,77 @@
     "    else:\n",
     "        code_string = convert_to_string(dict_df.loc[tab_field_name][\"data_coding\"])\n",
     "    \n",
-    "    #getting the data coding df from the large data coding csv\n",
+    "    # Getting the data coding df from the large data coding csv\n",
     "    code_df = mapping_df[mapping_df[\"code_number\"] == code_string].copy()\n",
     "    \n",
-    "    #Make sure no leading 0s for coding values\n",
+    "    # Make sure no leading 0s for coding values\n",
     "    code_df[\"coding\"] =  code_df[\"coding\"].apply(convert_to_string)\n",
     "    \n",
     "    mapping_dict = dict(zip(code_df[code_from].astype(str), code_df[code_to]))\n",
     "    \n",
     "  \n",
-    "  #adding fail safe incase older dictionaries don't have field type : TODO potentaily remove once older dictionaires are updated\n",
-    "    if 'field_type' in dict_df.columns:\n",
-    "        field_type =  dict_df.loc[tab_field_name]['field_type']\n",
-    "        if isinstance(field_type, pd.Series):\n",
-    "            if field_type.nunique() == 1:\n",
-    "                field_type = field_type.iloc[0]\n",
-    "            else:\n",
-    "                warnings.warn(f\"tabular field {tab_field_name} is used in 2 columns and have conflicting field types,please check and update dictionary. This field has not be converted.\")\n",
-    "                return orig_answer\n",
-    "        \n",
-    "        if field_type == 'Categorical (multiple)': \n",
-    "            normalise_answer = normalize_answers(orig_answer, field_type)\n",
-    "            check_invalid_values(normalise_answer , code_df)\n",
-    "            transformed_answer = normalise_answer.apply(replace_values, mapping_dict = mapping_dict)\n",
+    "    field_type =  dict_df.loc[tab_field_name]['field_type']\n",
+    "    if isinstance(field_type, pd.Series):\n",
+    "        if field_type.nunique() == 1:\n",
+    "            field_type = field_type.iloc[0]\n",
     "        else:\n",
-    "            #if categorical single\n",
-    "            normalized_answer = normalize_answers(orig_answer, field_type)\n",
-    "            check_invalid_values(normalized_answer, code_df)\n",
-    "            transformed_answer = normalized_answer.replace(mapping_dict)\n",
-    "            transformed_answer = transformed_answer.astype(\"category\")\n",
-    "\n",
-    "        return transformed_answer\n",
+    "            warnings.warn(f\"tabular field {tab_field_name} is used in 2 columns and have conflicting field types,please check and update dictionary. This field has not be converted.\")\n",
+    "            return orig_answer\n",
+    "    \n",
+    "    if field_type == 'Categorical (multiple)': \n",
+    "        normalise_answer = normalize_answers(orig_answer, field_type)\n",
+    "        check_invalid_values(normalise_answer , code_df)\n",
+    "        transformed_answer = normalise_answer.apply(replace_values, mapping_dict = mapping_dict)\n",
     "    else:\n",
-    "        return orig_answer\n",
-    "\n",
+    "        normalized_answer = normalize_answers(orig_answer, field_type)\n",
+    "        check_invalid_values(normalized_answer, code_df)\n",
+    "        transformed_answer = normalized_answer.replace(mapping_dict)\n",
+    "        transformed_answer = transformed_answer.astype(\"category\")\n",
     "\n",
+    "    return transformed_answer\n",
+    "    \n",
+    "def convert_codings_to_int(df: pd.DataFrame, dict_df: pd.DataFrame, fields_for_translation: list, preferred_language: str) -> pd.DataFrame:\n",
+    "    \"\"\"Return a copy of df whose coded, non-'Multiple' columns are cast to Int16 when preferred_language is 'coding'; updates dict_df['pandas_dtype'] in place.\"\"\"\n",
+    "    df_coding = df.copy()\n",
+    "    if preferred_language == 'coding':\n",
+    "        for column in fields_for_translation:\n",
+    "            data_coding = dict_df.loc[column, 'data_coding']\n",
+    "            if isinstance(data_coding, pd.Series): # In case of multiple entries, take the first one\n",
+    "                data_coding = data_coding.iloc[0]\n",
+    "            if pd.isna(data_coding): # No coding for this field, nothing to cast\n",
+    "                continue\n",
+    "            field_array = dict_df.loc[column, 'array']\n",
+    "            if isinstance(field_array, pd.Series):\n",
+    "                field_array = field_array.iloc[0]\n",
+    "            if field_array == 'Multiple': # Multiple-answer fields are left untouched\n",
+    "                continue\n",
+    "            df_coding[column] = df[column].astype('Int16', errors='ignore')\n",
+    "            if (df_coding.dtypes.loc[[column]].astype(str) == 'Int16').all(): # errors='ignore' keeps the column as-is on failure, so only record a cast that took effect\n",
+    "                dict_df.loc[column, 'pandas_dtype'] = 'Int16'\n",
+    "    return df_coding\n",
+    "    \n",
+    "        \n",
+    "                    \n",
     "def transform_dataframe(\n",
     "    df: pd.DataFrame,\n",
     "    transform_from: str,\n",
     "    transform_to: str,\n",
     "    dict_df: pd.DataFrame,\n",
     "    mapping_df: pd.DataFrame,\n",
     ") -> pd.DataFrame:\n",
-    "    if 'data_coding' not in dict_df.columns or transform_from == transform_to:\n",
-    "        return df\n",
     "    \n",
+    "    # Only fields with a code in data_coding property will be transformed\n",
     "    fields_for_translation = dict_df[pd.notna(dict_df.data_coding)].index.intersection(df.columns)\n",
-    "    if len(fields_for_translation) == 0:\n",
+    "    if len(fields_for_translation) == 0: # No fields with data_coding code\n",
+    "        return df\n",
+    "    \n",
+    "    if transform_from == transform_to: \n",
+    "        df = convert_codings_to_int(df=df, dict_df=dict_df, fields_for_translation=fields_for_translation, \n",
+    "                                preferred_language=transform_to)\n",
+    "        \n",
     "        return df\n",
+    "    \n",
+    "    \n",
     "    transformed_df = df.copy()\n",
     "    for column in fields_for_translation:\n",
     "        data_coding = dict_df.loc[column, 'data_coding']\n",
@@ -268,15 +291,11 @@
     "                    dict_df,\n",
     "                    mapping_df\n",
     "                )\n",
+    "    \n",
+    "    transformed_df = convert_codings_to_int(df=transformed_df, dict_df=dict_df, fields_for_translation=fields_for_translation, \n",
+    "                                preferred_language=transform_to)\n",
     "    return transformed_df"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {