Skip to content

Commit

Permalink
Coding fix (#32)
Browse files Browse the repository at this point in the history
* convert to int

* support for non-multiple array

* refactored: convert_codings_to_int

* added: ordered categories sorted by codings

* small refactor

* fixed: ensure ordered coding categories remain ordered on get()

* added: PhenoLoader data_codings property indexed by field name

* improved: PhenoLoader: removed load_func deprecation warning

* improved: handle missing data_coding column

* chore: nb cleanup

* fixed: example data dict

---------

Co-authored-by: Maria Gorodetski <[email protected]>
Co-authored-by: alondmnt <[email protected]>
Co-authored-by: Alon Diament <[email protected]>
  • Loading branch information
4 people authored Nov 12, 2024
1 parent ff301ee commit d2fee91
Show file tree
Hide file tree
Showing 11 changed files with 242 additions and 168 deletions.
22 changes: 11 additions & 11 deletions nbs/01_basic_plots.ipynb

Large diffs are not rendered by default.

18 changes: 15 additions & 3 deletions nbs/05_pheno_loader.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -284,9 +284,7 @@
" return None\n",
"\n",
" # load data\n",
" if load_func is not None:\n",
" warnings.warn(\"The 'load_func' is deprecated and will be removed in future versions.\")\n",
" else: \n",
" if load_func is None:\n",
" if 'field_type' not in self.dict:\n",
" field_type = None\n",
" else:\n",
Expand Down Expand Up @@ -740,6 +738,20 @@
" self.fields |= set(self.dfs[table_name].columns.tolist())\n",
" self.fields = sorted(list(self.fields))\n",
"\n",
" # Merge the data_codings dataframe with the dictionary dataframe\n",
" if 'data_coding' in self.dict.columns:\n",
" self.data_codings = self.dict\\\n",
" ['data_coding']\\\n",
" .reset_index()\\\n",
" .rename(columns={'data_coding': 'code_number'})\\\n",
" .astype({'code_number': 'str'})\\\n",
" .merge(\n",
" self.data_codings.astype({'code_number': 'str'}), \n",
" on='code_number',\n",
" how='inner'\n",
" )\\\n",
" .set_index('tabular_field_name')\n",
"\n",
" def __load_one_dataframe__(self, relative_location: str) -> pd.DataFrame:\n",
" \"\"\"\n",
" Load one dataframe.\n",
Expand Down
9 changes: 0 additions & 9 deletions nbs/12_cohort_selector.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -180,15 +180,6 @@
"For example, the following query selects participants who have moderate obstructive sleep apnea (AHI > 15) based on recordings of at least 4 hours of sleep."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ml = MetaLoader()"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
100 changes: 63 additions & 37 deletions nbs/13_questionnaire_handler.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"metadata": {},
"outputs": [],
"source": [
"#| default_exp questionnaires_handler\n"
"#| default_exp questionnaires_handler"
]
},
{
Expand Down Expand Up @@ -204,40 +204,55 @@
" else:\n",
" code_string = convert_to_string(dict_df.loc[tab_field_name][\"data_coding\"])\n",
" \n",
" #getting the data coding df from the large data coding csv\n",
" # Getting the data coding df from the large data coding csv\n",
" code_df = mapping_df[mapping_df[\"code_number\"] == code_string].copy()\n",
" \n",
" #Make sure no leading 0s for coding values\n",
" # Make sure no leading 0s for coding values\n",
" code_df[\"coding\"] = code_df[\"coding\"].apply(convert_to_string)\n",
" cat_ordered = code_df\\\n",
" .astype({'coding': 'int'})\\\n",
" .sort_values('coding')[code_to]\\\n",
" .astype('str')\\\n",
" .drop_duplicates()\n",
" \n",
" mapping_dict = dict(zip(code_df[code_from].astype(str), code_df[code_to]))\n",
" \n",
" \n",
" #adding fail safe incase older dictionaries don't have field type : TODO potentaily remove once older dictionaires are updated\n",
" if 'field_type' in dict_df.columns:\n",
" field_type = dict_df.loc[tab_field_name]['field_type']\n",
" if isinstance(field_type, pd.Series):\n",
" if field_type.nunique() == 1:\n",
" field_type = field_type.iloc[0]\n",
" else:\n",
" warnings.warn(f\"tabular field {tab_field_name} is used in 2 columns and have conflicting field types,please check and update dictionary. This field has not be converted.\")\n",
" return orig_answer\n",
" \n",
" if field_type == 'Categorical (multiple)': \n",
" normalise_answer = normalize_answers(orig_answer, field_type)\n",
" check_invalid_values(normalise_answer , code_df)\n",
" transformed_answer = normalise_answer.apply(replace_values, mapping_dict = mapping_dict)\n",
" field_type = dict_df.loc[tab_field_name]['field_type']\n",
" if isinstance(field_type, pd.Series):\n",
" if field_type.nunique() == 1:\n",
" field_type = field_type.iloc[0]\n",
" else:\n",
" #if categorical single\n",
" normalized_answer = normalize_answers(orig_answer, field_type)\n",
" check_invalid_values(normalized_answer, code_df)\n",
" transformed_answer = normalized_answer.replace(mapping_dict)\n",
" transformed_answer = transformed_answer.astype(\"category\")\n",
"\n",
" return transformed_answer\n",
" warnings.warn(f\"tabular field {tab_field_name} is used in 2 columns and have conflicting field types,please check and update dictionary. This field has not be converted.\")\n",
" return orig_answer\n",
" \n",
" if field_type == 'Categorical (multiple)': \n",
" normalise_answer = normalize_answers(orig_answer, field_type)\n",
" check_invalid_values(normalise_answer , code_df)\n",
" transformed_answer = normalise_answer.apply(replace_values, mapping_dict = mapping_dict)\n",
" else:\n",
" return orig_answer\n",
" normalized_answer = normalize_answers(orig_answer, field_type)\n",
" check_invalid_values(normalized_answer, code_df)\n",
" transformed_answer = normalized_answer.replace(mapping_dict)\n",
" transformed_answer = pd.Categorical(transformed_answer,\n",
" categories=cat_ordered, \n",
" ordered=True\n",
" )\n",
" # We update the dictionary to ensure that the categories are not reset later\n",
" dict_df.loc[tab_field_name, 'pandas_dtype'] = 'category_ordered'\n",
"\n",
" return transformed_answer\n",
"\n",
"def convert_codings_to_int(df: pd.Series, dict_df: pd.DataFrame) -> pd.Series:\n",
" tabular_field_name = df.name\n",
" field_array = dict_df.loc[tabular_field_name, 'array']\n",
" if isinstance(field_array, pd.Series):\n",
" field_array = field_array.iloc[0]\n",
" if field_array == 'Multiple':\n",
" return df\n",
" else: \n",
" dict_df.loc[tabular_field_name, 'pandas_dtype'] = 'Int16'\n",
" return df.astype('Int16', errors='ignore')\n",
"\n",
"def transform_dataframe(\n",
" df: pd.DataFrame,\n",
Expand All @@ -246,20 +261,31 @@
" dict_df: pd.DataFrame,\n",
" mapping_df: pd.DataFrame,\n",
") -> pd.DataFrame:\n",
" if 'data_coding' not in dict_df.columns or transform_from == transform_to:\n",
" if 'data_coding' not in dict_df.columns:\n",
" warnings.warn(\"data_coding column not found in dictionary, skipping transformation\")\n",
" return df\n",
" \n",
" # Validate input parameters\n",
" if transform_from not in valid_codings or transform_to not in valid_codings:\n",
" raise ValueError(f\"transform_from and transform_to must be one of {valid_codings}\")\n",
"\n",
" # Only fields with a code in data_coding property will be transformed\n",
" fields_for_translation = dict_df[pd.notna(dict_df.data_coding)].index.intersection(df.columns)\n",
" if len(fields_for_translation) == 0:\n",
" if len(fields_for_translation) == 0: # No fields with data_coding code\n",
" return df\n",
"\n",
" transformed_df = df.copy()\n",
" for column in fields_for_translation:\n",
" data_coding = dict_df.loc[column, 'data_coding']\n",
" # Handle the case where data_coding is a Series (multiple entries)\n",
" if isinstance(data_coding, pd.Series):\n",
" if data_coding.nunique() > 1:\n",
" warnings.warn(f\"Multiple different data_coding values found for column {column}. Using first value.\")\n",
" data_coding = data_coding.iloc[0]\n",
" \n",
" if pd.notna(data_coding):\n",
"\n",
" if pd.isna(data_coding):\n",
" continue\n",
"\n",
" if transform_from != transform_to:\n",
" transformed_df[column] = transform_answers(\n",
" column,\n",
" transformed_df[column],\n",
Expand All @@ -268,15 +294,15 @@
" dict_df,\n",
" mapping_df\n",
" )\n",
" \n",
" if transform_to == 'coding':\n",
" transformed_df[column] = convert_codings_to_int(\n",
" transformed_df[column], \n",
" dict_df=dict_df\n",
" )\n",
"\n",
" return transformed_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
Loading

0 comments on commit d2fee91

Please sign in to comment.