From 18042d9faed75f0a9cd81285b0a57cfe4c6e713b Mon Sep 17 00:00:00 2001 From: Sundar Krishnan Date: Fri, 13 Dec 2019 00:00:35 -0500 Subject: [PATCH] Add files via upload --- CONTRIBUTING.md | 7 + LICENSE.md | 21 + README.md | 233 +++ Xverse.ipynb | 4289 ++++++++++++++++++++++++++++++++++++++++++++++ example_woe.csv | 17 + requirements.txt | 5 + setup.py | 30 + 7 files changed, 4602 insertions(+) create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE.md create mode 100644 README.md create mode 100644 Xverse.ipynb create mode 100644 example_woe.csv create mode 100644 requirements.txt create mode 100644 setup.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..c5ab3c2 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,7 @@ +# Contributing Code + +This code is under active development. Our goals are to: + +1. Provide a variety of feature engineering techniques +2. Provide a variety of feature transformation techniques +3. Provide a variety of feature selection techniques \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..f51d1db --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019, Sundar Krishnan + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..601cb95 --- /dev/null +++ b/README.md @@ -0,0 +1,233 @@ +# XVerse + +**XVerse**, short for **X**-uni**Verse**, is a Python module for machine learning in the space of feature engineering, feature transformation and feature selection. + +## Installation + +The package requires `numpy, pandas, scikit-learn, scipy` and `statsmodels`. In addition, the package is tested on Python version 3.5 and above. + +To install the package, download this folder and execute: +```sh +python setup.py install +``` +or +```sh +pip install xverse +``` +or +```sh +conda install -c conda-forge xverse +``` + +## Usage + +XVerse modules are fully compatible with sklearn transformers, so they can be used in pipelines or in your existing scripts. Currently, only Pandas dataframes are supported. + +## Example + +### Monotonic Binning (Feature transformation) +```python +from xverse.transformer import MonotonicBinning + +clf = MonotonicBinning() +clf.fit(X, y) + +print(clf.bins) +``` +``` +{'age': array([19., 35., 45., 87.]), + 'balance': array([-3313. , 174. , 979.33333333, 71188. ]), + 'campaign': array([ 1., 3., 50.]), + 'day': array([ 1., 12., 20., 31.]), + 'duration': array([ 4. , 128. , 261.33333333, 3025. 
]), + 'pdays': array([-1.00e+00, -5.00e-01, 1.00e+00, 8.71e+02]), + 'previous': array([ 0., 1., 25.])} +``` + +### Weight of Evidence (WOE) and Information Value (IV) (Feature transformation and Selection) +```python +from xverse.transformer import WOE + +clf = WOE() +clf.fit(X, y) + +print(clf.woe_df.head()) #Weight of Evidence transformation dataset +``` +``` ++---+---------------+--------------------+-------+-------+-----------+---------------------+--------------------+---------------------+------------------------+----------------------+---------------------+ +| | Variable_Name | Category | Count | Event | Non_Event | Event_Rate | Non_Event_Rate | Event_Distribution | Non_Event_Distribution | WOE | Information_Value | ++---+---------------+--------------------+-------+-------+-----------+---------------------+--------------------+---------------------+------------------------+----------------------+---------------------+ +| 0 | age | (18.999, 35.0] | 1652 | 197 | 1455 | 0.11924939467312348 | 0.8807506053268765 | 0.3781190019193858 | 0.36375 | 0.038742147481056366 | 0.02469286279236605 | ++---+---------------+--------------------+-------+-------+-----------+---------------------+--------------------+---------------------+------------------------+----------------------+---------------------+ +| 1 | age | (35.0, 45.0] | 1388 | 129 | 1259 | 0.09293948126801153 | 0.9070605187319885 | 0.2476007677543186 | 0.31475 | -0.2399610313340142 | 0.02469286279236605 | ++---+---------------+--------------------+-------+-------+-----------+---------------------+--------------------+---------------------+------------------------+----------------------+---------------------+ +| 2 | age | (45.0, 87.0] | 1481 | 195 | 1286 | 0.13166779203241052 | 0.8683322079675895 | 0.3742802303262956 | 0.3215 | 0.15200725211484276 | 0.02469286279236605 | 
++---+---------------+--------------------+-------+-------+-----------+---------------------+--------------------+---------------------+------------------------+----------------------+---------------------+ +| 3 | balance | (-3313.001, 174.0] | 1512 | 133 | 1379 | 0.08796296296296297 | 0.9120370370370371 | 0.255278310940499 | 0.34475 | -0.3004651512228873 | 0.06157421302850976 | ++---+---------------+--------------------+-------+-------+-----------+---------------------+--------------------+---------------------+------------------------+----------------------+---------------------+ +| 4 | balance | (174.0, 979.333] | 1502 | 163 | 1339 | 0.1085219707057257 | 0.8914780292942743 | 0.31285988483685223 | 0.33475 | -0.06762854653574929 | 0.06157421302850976 | ++---+---------------+--------------------+-------+-------+-----------+---------------------+--------------------+---------------------+------------------------+----------------------+---------------------+ +``` +```python +print(clf.iv_df) #Information value dataset +``` +``` ++----+---------------+------------------------+ +| | Variable_Name | Information_Value | ++----+---------------+------------------------+ +| 6 | duration | 1.1606798895024775 | ++----+---------------+------------------------+ +| 14 | poutcome | 0.4618899274360784 | ++----+---------------+------------------------+ +| 12 | month | 0.37953277364723703 | ++----+---------------+------------------------+ +| 3 | contact | 0.2477624664660033 | ++----+---------------+------------------------+ +| 13 | pdays | 0.20326698063078097 | ++----+---------------+------------------------+ +| 15 | previous | 0.1770811514357682 | ++----+---------------+------------------------+ +| 9 | job | 0.13251854742728092 | ++----+---------------+------------------------+ +| 8 | housing | 0.10655553101753026 | ++----+---------------+------------------------+ +| 1 | balance | 0.06157421302850976 | ++----+---------------+------------------------+ +| 10 | loan | 
0.06079091829519839 | ++----+---------------+------------------------+ +| 11 | marital | 0.04009032555607127 | ++----+---------------+------------------------+ +| 7 | education | 0.03181211694236827 | ++----+---------------+------------------------+ +| 0 | age | 0.02469286279236605 | ++----+---------------+------------------------+ +| 2 | campaign | 0.019350877455830695 | ++----+---------------+------------------------+ +| 4 | day | 0.0028156288525541884 | ++----+---------------+------------------------+ +| 5 | default | 1.6450124824351054e-05 | ++----+---------------+------------------------+ +``` +#### Apply this handy rule to select variables based on Information value +``` ++-------------------+-----------------------------+ +| Information Value | Variable Predictiveness | ++-------------------+-----------------------------+ +| Less than 0.02 | Not useful for prediction | ++-------------------+-----------------------------+ +| 0.02 to 0.1 | Weak predictive Power | ++-------------------+-----------------------------+ +| 0.1 to 0.3 | Medium predictive Power | ++-------------------+-----------------------------+ +| 0.3 to 0.5 | Strong predictive Power | ++-------------------+-----------------------------+ +| >0.5 | Suspicious Predictive Power | ++-------------------+-----------------------------+ +``` + +```python +clf.transform(X) #apply WOE transformation on the dataset +``` + +### VotingSelector (Feature selection) + +```python +from xverse.ensemble import VotingSelector + +clf = VotingSelector() +clf.fit(X, y) +print(clf.available_techniques) +``` +``` +['WOE', 'RF', 'RFE', 'ETC', 'CS', 'L_ONE'] +``` +```python +clf.feature_importances_ +``` +``` ++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +| | Variable_Name | Information_Value | Random_Forest | Recursive_Feature_Elimination | Extra_Trees | Chi_Square | L_One | 
++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +| 0 | duration | 1.1606798895024775 | 0.29100016518065835 | 0.0 | 0.24336032789230097 | 62.53045588382914 | 0.0009834060765907017 | ++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +| 1 | poutcome | 0.4618899274360784 | 0.05975563617541324 | 0.8149539108454378 | 0.07291945099022576 | 209.1788690088815 | 0.27884071686005385 | ++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +| 2 | month | 0.37953277364723703 | 0.09472524644853274 | 0.6270707318033509 | 0.10303345973615481 | 54.81011477300214 | 0.18763733424335785 | ++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +| 3 | contact | 0.2477624664660033 | 0.018358265986906014 | 0.45594899004325673 | 0.029325952072445132 | 25.357947712611868 | 0.04876094100065351 | ++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +| 4 | pdays | 0.20326698063078097 | 0.04927368012222067 | 0.0 | 0.02738001362078519 | 13.808925800391403 | -0.00026932622581396677 | ++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +| 5 | previous | 0.1770811514357682 | 0.02612886929056733 | 0.0 | 0.027197295919351088 | 13.019278420681164 | 0.0 | 
++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +| 6 | job | 0.13251854742728092 | 0.050024353325485646 | 0.5207956132479409 | 0.05775450997836301 | 13.043319831003855 | 0.11279310830899944 | ++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +| 7 | housing | 0.10655553101753026 | 0.021126744587568032 | 0.28135643347861894 | 0.020830177741565564 | 28.043094016887064 | 0.0 | ++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +| 8 | balance | 0.06157421302850976 | 0.0963543249575152 | 0.0 | 0.08429423739161768 | 0.03720300378031974 | -1.3553979494412002e-06 | ++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +| 9 | loan | 0.06079091829519839 | 0.008783347837152861 | 0.6414812505459246 | 0.013652849211750306 | 3.4361027026756084 | 0.0 | ++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +| 10 | marital | 0.04009032555607127 | 0.02648832289940045 | 0.9140684291962617 | 0.03929791951230852 | 10.889749514307464 | 0.0 | ++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +| 11 | education | 0.03181211694236827 | 0.02757205345952717 | 0.21529148795958114 | 0.03980467391633981 | 4.70588768051867 | 0.0 | 
++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +| 12 | age | 0.02469286279236605 | 0.10164634631051869 | 0.0 | 0.08893247762137796 | 0.6818947945319156 | -0.004414426121909251 | ++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +| 13 | campaign | 0.019350877455830695 | 0.04289312347011537 | 0.0 | 0.05716486374991612 | 1.8596566731099653 | -0.012650844735972498 | ++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +| 14 | day | 0.0028156288525541884 | 0.083859807784465 | 0.0 | 0.09056623672332145 | 0.08687716739873641 | -0.00231307077371602 | ++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +| 15 | default | 1.6450124824351054e-05 | 0.0020097121639531665 | 0.0 | 0.004485553922176626 | 0.007542737902818529 | 0.0 | ++----+---------------+------------------------+-----------------------+-------------------------------+----------------------+----------------------+-------------------------+ +``` +```python +clf.feature_votes_ +``` +``` ++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +| | Variable_Name | Information_Value | Random_Forest | Recursive_Feature_Elimination | Extra_Trees | Chi_Square | L_One | Votes | ++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +| 1 | poutcome | 1 | 1 | 1 | 1 | 1 | 1 | 6 | 
++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +| 2 | month | 1 | 1 | 1 | 1 | 1 | 1 | 6 | ++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +| 6 | job | 1 | 1 | 1 | 1 | 1 | 1 | 6 | ++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +| 0 | duration | 1 | 1 | 0 | 1 | 1 | 1 | 5 | ++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +| 3 | contact | 1 | 0 | 1 | 0 | 1 | 1 | 4 | ++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +| 4 | pdays | 1 | 1 | 0 | 0 | 1 | 0 | 3 | ++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +| 7 | housing | 1 | 0 | 1 | 0 | 1 | 0 | 3 | ++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +| 12 | age | 0 | 1 | 0 | 1 | 0 | 1 | 3 | ++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +| 14 | day | 0 | 1 | 0 | 1 | 0 | 1 | 3 | ++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +| 5 | previous | 1 | 0 | 0 | 0 | 1 | 0 | 2 | ++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +| 8 | balance | 0 | 1 | 0 | 1 | 0 | 0 | 2 | ++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +| 13 | campaign | 0 | 0 | 0 | 1 | 0 | 1 | 2 | 
++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +| 9 | loan | 0 | 0 | 1 | 0 | 0 | 0 | 1 | ++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +| 10 | marital | 0 | 0 | 1 | 0 | 0 | 0 | 1 | ++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +| 11 | education | 0 | 0 | 1 | 0 | 0 | 0 | 1 | ++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +| 15 | default | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ++----+---------------+-------------------+---------------+-------------------------------+-------------+------------+-------+-------+ +``` +## Contributing +XVerse is under active development; if you'd like to be involved, we'd love to have you. Check out the CONTRIBUTING.md file or open an issue on the GitHub project to get started. + +## References +https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html + +https://medium.com/@sundarstyles89/variable-selection-using-python-vote-based-approach-faa42da960f0 + + + diff --git a/Xverse.ipynb b/Xverse.ipynb new file mode 100644 index 0000000..64a5af3 --- /dev/null +++ b/Xverse.ipynb @@ -0,0 +1,4289 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook walks you through the __xverse__ package in detail. It provides the code for each step, so you can use it as a template to apply to your own data." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Import data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "df = pd.read_csv('./data/bank.csv',sep='|')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agejobmaritaleducationdefaultbalancehousingloancontactdaymonthdurationcampaignpdayspreviouspoutcometarget
030unemployedmarriedprimaryno1787nonocellular19oct791-10unknown0
133servicesmarriedsecondaryno4789yesyescellular11may22013394failure0
235managementsingletertiaryno1350yesnocellular16apr18513301failure0
330managementmarriedtertiaryno1476yesyesunknown3jun1994-10unknown0
459blue-collarmarriedsecondaryno0yesnounknown5may2261-10unknown0
\n", + "
" + ], + "text/plain": [ + " age job marital education default balance housing loan \\\n", + "0 30 unemployed married primary no 1787 no no \n", + "1 33 services married secondary no 4789 yes yes \n", + "2 35 management single tertiary no 1350 yes no \n", + "3 30 management married tertiary no 1476 yes yes \n", + "4 59 blue-collar married secondary no 0 yes no \n", + "\n", + " contact day month duration campaign pdays previous poutcome target \n", + "0 cellular 19 oct 79 1 -1 0 unknown 0 \n", + "1 cellular 11 may 220 1 339 4 failure 0 \n", + "2 cellular 16 apr 185 1 330 1 failure 0 \n", + "3 unknown 3 jun 199 4 -1 0 unknown 0 \n", + "4 unknown 5 may 226 1 -1 0 unknown 0 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Feature Subset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This option is used to select a subset of features from the dataset. A list of features should be provided to subset. " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'target']\n" + ] + } + ], + "source": [ + "from xverse.feature_subset import FeatureSubset\n", + "\n", + "numerical_features = list(df._get_numeric_data().columns)\n", + "categorical_features = list(df.columns.difference(numerical_features))\n", + "print(numerical_features)\n", + "\n", + "clf = FeatureSubset(numerical_features) #select only numeric features\n", + "df = clf.fit_transform(df) #returns the dataframe with selected features" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agebalancedaydurationcampaignpdaysprevioustarget
030178719791-100
133478911220133940
235135016185133010
330147631994-100
459052261-100
\n", + "
" + ], + "text/plain": [ + " age balance day duration campaign pdays previous target\n", + "0 30 1787 19 79 1 -1 0 0\n", + "1 33 4789 11 220 1 339 4 0\n", + "2 35 1350 16 185 1 330 1 0\n", + "3 30 1476 3 199 4 -1 0 0\n", + "4 59 0 5 226 1 -1 0 0" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Split X and Y" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This option is used to split the dataset into X and y feature. All it needs is the target column as a list. Using the original dataframe again for this exercise." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df = pd.read_csv('./data/bank.csv',sep='|')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from xverse.feature_subset import SplitXY\n", + "\n", + "clf = SplitXY(['target']) #Split the dataset into X and y\n", + "X, y = clf.fit_transform(df) #returns features (X) dataset and target(Y) as a numpy array" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agebalancecampaigncontactdaydefaultdurationeducationhousingjobloanmaritalmonthpdayspoutcomeprevious
03017871cellular19no79primarynounemployednomarriedoct-1unknown0
13347891cellular11no220secondaryyesservicesyesmarriedmay339failure4
23513501cellular16no185tertiaryyesmanagementnosingleapr330failure1
33014764unknown3no199tertiaryyesmanagementyesmarriedjun-1unknown0
45901unknown5no226secondaryyesblue-collarnomarriedmay-1unknown0
\n", + "
" + ], + "text/plain": [ + " age balance campaign contact day default duration education housing \\\n", + "0 30 1787 1 cellular 19 no 79 primary no \n", + "1 33 4789 1 cellular 11 no 220 secondary yes \n", + "2 35 1350 1 cellular 16 no 185 tertiary yes \n", + "3 30 1476 4 unknown 3 no 199 tertiary yes \n", + "4 59 0 1 unknown 5 no 226 secondary yes \n", + "\n", + " job loan marital month pdays poutcome previous \n", + "0 unemployed no married oct -1 unknown 0 \n", + "1 services yes married may 339 failure 4 \n", + "2 management no single apr 330 failure 1 \n", + "3 management yes married jun -1 unknown 0 \n", + "4 blue-collar no married may -1 unknown 0 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, ..., 0, 0, 0])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Going forward we will running this function iteratively to demonstrate each of the features. So, I created a prep_dataset option which is shown below. " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def prep_dataset():\n", + " \n", + " df = pd.read_csv('./data/bank.csv',sep='|')\n", + "\n", + " from xverse.feature_subset import SplitXY\n", + "\n", + " clf = SplitXY(['target']) #Split the dataset into X and y\n", + " X, y = clf.fit_transform(df) #returns features (X) dataset and target(Y) as a numpy array\n", + " \n", + " return X, y" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# 3. 
Monotonic Binning for numerical variables" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "Monotonically bin numeric variables based on target. The binning operation starts with the \"max_bins\" option. It iterates by reducing the number of bins, until it finds bins with monotonic relationship (either increasing or decreasing) between X and y. If the module is unable to find a monotonic relationship, it forcefully creates bins using the \"force_bins\" option. " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "X, y = prep_dataset()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MonotonicBinning(cardinality_cutoff=5, custom_binning=None,\n", + " feature_names='all', force_bins=4, max_bins=20, prefix=None)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from xverse.transformer import MonotonicBinning\n", + "\n", + "clf = MonotonicBinning()\n", + "clf.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'age': array([19., 35., 45., 87.]),\n", + " 'balance': array([-3313. , 174. , 979.33333333, 71188. ]),\n", + " 'campaign': array([ 1., 3., 50.]),\n", + " 'day': array([ 1., 12., 20., 31.]),\n", + " 'duration': array([ 4. , 128. , 261.33333333, 3025. 
]),\n", + " 'pdays': array([-1.00e+00, -5.00e-01, 1.00e+00, 8.71e+02]),\n", + " 'previous': array([ 0., 1., 25.])}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.bins" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "output_bins = clf.bins #will be used later in this exercise" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "out_X = clf.transform(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agebalancecampaigncontactdaydefaultdurationeducationhousingjobloanmaritalmonthpdayspoutcomeprevious
0(18.999, 35.0](979.333, 71188.0](0.999, 3.0]cellular(12.0, 20.0]no(3.999, 128.0]primarynounemployednomarriedoct(-1.001, -0.5]unknown(-0.001, 1.0]
1(18.999, 35.0](979.333, 71188.0](0.999, 3.0]cellular(0.999, 12.0]no(128.0, 261.333]secondaryyesservicesyesmarriedmay(1.0, 871.0]failure(1.0, 25.0]
2(18.999, 35.0](979.333, 71188.0](0.999, 3.0]cellular(12.0, 20.0]no(128.0, 261.333]tertiaryyesmanagementnosingleapr(1.0, 871.0]failure(-0.001, 1.0]
3(18.999, 35.0](979.333, 71188.0](3.0, 50.0]unknown(0.999, 12.0]no(128.0, 261.333]tertiaryyesmanagementyesmarriedjun(-1.001, -0.5]unknown(-0.001, 1.0]
4(45.0, 87.0](-3313.001, 174.0](0.999, 3.0]unknown(0.999, 12.0]no(128.0, 261.333]secondaryyesblue-collarnomarriedmay(-1.001, -0.5]unknown(-0.001, 1.0]
\n", + "
" + ], + "text/plain": [ + " age balance campaign contact day \\\n", + "0 (18.999, 35.0] (979.333, 71188.0] (0.999, 3.0] cellular (12.0, 20.0] \n", + "1 (18.999, 35.0] (979.333, 71188.0] (0.999, 3.0] cellular (0.999, 12.0] \n", + "2 (18.999, 35.0] (979.333, 71188.0] (0.999, 3.0] cellular (12.0, 20.0] \n", + "3 (18.999, 35.0] (979.333, 71188.0] (3.0, 50.0] unknown (0.999, 12.0] \n", + "4 (45.0, 87.0] (-3313.001, 174.0] (0.999, 3.0] unknown (0.999, 12.0] \n", + "\n", + " default duration education housing job loan marital \\\n", + "0 no (3.999, 128.0] primary no unemployed no married \n", + "1 no (128.0, 261.333] secondary yes services yes married \n", + "2 no (128.0, 261.333] tertiary yes management no single \n", + "3 no (128.0, 261.333] tertiary yes management yes married \n", + "4 no (128.0, 261.333] secondary yes blue-collar no married \n", + "\n", + " month pdays poutcome previous \n", + "0 oct (-1.001, -0.5] unknown (-0.001, 1.0] \n", + "1 may (1.0, 871.0] failure (1.0, 25.0] \n", + "2 apr (1.0, 871.0] failure (-0.001, 1.0] \n", + "3 jun (-1.001, -0.5] unknown (-0.001, 1.0] \n", + "4 may (-1.001, -0.5] unknown (-0.001, 1.0] " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out_X.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.1 Available options in the package for Monotonic binning" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " Parameters\n", + " ----------\n", + " feature_names: 'all' or list (default='all')\n", + " list of features to perform monotonic binning operation. \n", + " - 'all' (default): All features in the dataset will be used\n", + " - list of features: ['age', 'income',......]\n", + " \n", + " max_bins: int (default=20)\n", + " Maximum number of bins that can be created for any given variable. 
The final number of bins created will be less than or equal to this number.\n", + " \n", + " force_bins: int (default=3)\n", + " It forces the module to create bins for a variable, when it cannot find a monotonic relationship using the \"max_bins\" option. The final number of bins created will be equal to the number specified.\n", + " \n", + " cardinality_cutoff: int (default=5)\n", + " Cutoff to determine if a variable is eligible for monotonic binning operation. Any variable with fewer unique levels than this number will be treated as a character variable. At this point no binning operation will be performed on the variable and it will return the unique levels as bins for these variables.\n", + " \n", + " prefix: string (default=None)\n", + " Variable prefix to be used for the column created by monotonic binning. \n", + " \n", + " custom_binning: dict (default=None)\n", + " Dictionary structure - {'feature_name': float list}\n", + " Example - {'age': [0., 1., 2., 3.]}\n", + " Using this parameter, the user can perform custom binning on variables. This parameter is also used to apply previously computed bins for each feature (Score new data). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.2 Using the custom binning option in the future to score new data - Monotonic binning" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to apply the bins to new data, then simply use the transform function with the custom binning option." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "X, y = prep_dataset()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agebalancecampaigncontactdaydefaultdurationeducationhousingjobloanmaritalmonthpdayspoutcomeprevious
0(18.999, 35.0](979.333, 71188.0](0.999, 3.0]cellular(12.0, 20.0]no(3.999, 128.0]primarynounemployednomarriedoct(-1.001, -0.5]unknown(-0.001, 1.0]
1(18.999, 35.0](979.333, 71188.0](0.999, 3.0]cellular(0.999, 12.0]no(128.0, 261.333]secondaryyesservicesyesmarriedmay(1.0, 871.0]failure(1.0, 25.0]
2(18.999, 35.0](979.333, 71188.0](0.999, 3.0]cellular(12.0, 20.0]no(128.0, 261.333]tertiaryyesmanagementnosingleapr(1.0, 871.0]failure(-0.001, 1.0]
3(18.999, 35.0](979.333, 71188.0](3.0, 50.0]unknown(0.999, 12.0]no(128.0, 261.333]tertiaryyesmanagementyesmarriedjun(-1.001, -0.5]unknown(-0.001, 1.0]
4(45.0, 87.0](-3313.001, 174.0](0.999, 3.0]unknown(0.999, 12.0]no(128.0, 261.333]secondaryyesblue-collarnomarriedmay(-1.001, -0.5]unknown(-0.001, 1.0]
\n", + "
" + ], + "text/plain": [ + " age balance campaign contact day \\\n", + "0 (18.999, 35.0] (979.333, 71188.0] (0.999, 3.0] cellular (12.0, 20.0] \n", + "1 (18.999, 35.0] (979.333, 71188.0] (0.999, 3.0] cellular (0.999, 12.0] \n", + "2 (18.999, 35.0] (979.333, 71188.0] (0.999, 3.0] cellular (12.0, 20.0] \n", + "3 (18.999, 35.0] (979.333, 71188.0] (3.0, 50.0] unknown (0.999, 12.0] \n", + "4 (45.0, 87.0] (-3313.001, 174.0] (0.999, 3.0] unknown (0.999, 12.0] \n", + "\n", + " default duration education housing job loan marital \\\n", + "0 no (3.999, 128.0] primary no unemployed no married \n", + "1 no (128.0, 261.333] secondary yes services yes married \n", + "2 no (128.0, 261.333] tertiary yes management no single \n", + "3 no (128.0, 261.333] tertiary yes management yes married \n", + "4 no (128.0, 261.333] secondary yes blue-collar no married \n", + "\n", + " month pdays poutcome previous \n", + "0 oct (-1.001, -0.5] unknown (-0.001, 1.0] \n", + "1 may (1.0, 871.0] failure (1.0, 25.0] \n", + "2 apr (1.0, 871.0] failure (-0.001, 1.0] \n", + "3 jun (-1.001, -0.5] unknown (-0.001, 1.0] \n", + "4 may (-1.001, -0.5] unknown (-0.001, 1.0] " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf = MonotonicBinning(custom_binning=output_bins) #output_bins was created earlier\n", + "\n", + "out_X = clf.transform(X)\n", + "out_X.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.3 What happens if my data has missing values?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "X, y = prep_dataset()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "X = X.mask(np.random.random(X.shape) < .1) #introduce some missing values randomly" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agebalancecampaigncontactdaydefaultdurationeducationhousingjobloanmaritalmonthpdayspoutcomeprevious
0(18.999, 35.0](979.333, 71188.0](0.999, 3.0]cellularNaNno(3.999, 128.0]primarynounemployednomarriedoct(-1.001, -0.5]NaN(-0.001, 1.0]
1(18.999, 35.0](979.333, 71188.0]NaNcellular(0.999, 12.0]no(128.0, 261.333]secondaryyesservicesyesmarriedNaN(1.0, 871.0]failure(1.0, 25.0]
2(18.999, 35.0](979.333, 71188.0](0.999, 3.0]cellularNaNno(128.0, 261.333]tertiaryyesmanagementnosingleapr(1.0, 871.0]failure(-0.001, 1.0]
3NaN(979.333, 71188.0](3.0, 50.0]unknownNaNno(128.0, 261.333]tertiaryyesmanagementyesNaNjun(-1.001, -0.5]unknown(-0.001, 1.0]
4(45.0, 87.0](-3313.001, 174.0](0.999, 3.0]unknown(0.999, 12.0]noNaNsecondaryyesblue-collarnomarriedmay(-1.001, -0.5]unknown(-0.001, 1.0]
\n", + "
" + ], + "text/plain": [ + " age balance campaign contact day \\\n", + "0 (18.999, 35.0] (979.333, 71188.0] (0.999, 3.0] cellular NaN \n", + "1 (18.999, 35.0] (979.333, 71188.0] NaN cellular (0.999, 12.0] \n", + "2 (18.999, 35.0] (979.333, 71188.0] (0.999, 3.0] cellular NaN \n", + "3 NaN (979.333, 71188.0] (3.0, 50.0] unknown NaN \n", + "4 (45.0, 87.0] (-3313.001, 174.0] (0.999, 3.0] unknown (0.999, 12.0] \n", + "\n", + " default duration education housing job loan marital \\\n", + "0 no (3.999, 128.0] primary no unemployed no married \n", + "1 no (128.0, 261.333] secondary yes services yes married \n", + "2 no (128.0, 261.333] tertiary yes management no single \n", + "3 no (128.0, 261.333] tertiary yes management yes NaN \n", + "4 no NaN secondary yes blue-collar no married \n", + "\n", + " month pdays poutcome previous \n", + "0 oct (-1.001, -0.5] NaN (-0.001, 1.0] \n", + "1 NaN (1.0, 871.0] failure (1.0, 25.0] \n", + "2 apr (1.0, 871.0] failure (-0.001, 1.0] \n", + "3 jun (-1.001, -0.5] unknown (-0.001, 1.0] \n", + "4 may (-1.001, -0.5] unknown (-0.001, 1.0] " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out_X = clf.transform(X)\n", + "out_X.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "As you see above, for the 'balance' feature, the missing value is not considered for binning operation. So, the output dataset will still have missing values. It is advised to impute missing values before you use this operation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# 4. 
Weight of Evidence" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "X, y = prep_dataset()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "WOE(exclude_features=None, feature_names='all', mono_cardinality_cutoff=5,\n", + " mono_custom_binning={'pdays': array([-1.00e+00, -5.00e-01, 1.00e+00, 8.71e+02]), 'previous': array([ 0., 1., 25.]), 'age': array([19., 35., 45., 87.]), 'day': array([ 1., 12., 20., 31.]), 'duration': array([ 4. , 128. , 261.33333, 3025. ]), 'balance': array([-3313. , 174. , 979.33333, 71188. ]), 'campaign': array([ 1., 3., 50.])},\n", + " mono_feature_names='all', mono_force_bins=3, mono_max_bins=20,\n", + " mono_prefix=None, monotonic_binning=True, treat_missing='separate',\n", + " woe_bins={'contact': {'cellular': 0.2529710194508961, 'telephone': 0.273413147371702, 'unknown': -0.992071659828519}, 'loan': {'no': 0.09059786923814311, 'yes': -0.6743909823100516}, 'default': {'no': -0.0005335490436409652, 'yes': 0.030831556293914045}, 'marital': {'divorced': 0.27063768069966615, ..., closed='right'): 0.024262753694244996, Interval(0.999, 12.0, closed='right'): 0.0477170092417549}},\n", + " woe_prefix=None)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from xverse.transformer import WOE\n", + "\n", + "clf = WOE()\n", + "clf.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Variable_NameCategoryCountEventNon_EventEvent_RateNon_Event_RateEvent_DistributionNon_Event_DistributionWOEInformation_Value
0age(18.999, 35.0]165219714550.1192490.8807510.3781190.363750.0387420.024693
1age(35.0, 45.0]138812912590.0929390.9070610.2476010.31475-0.2399610.024693
2age(45.0, 87.0]148119512860.1316680.8683320.3742800.321500.1520070.024693
3balance(-3313.001, 174.0]151213313790.0879630.9120370.2552780.34475-0.3004650.061574
4balance(174.0, 979.333]150216313390.1085220.8914780.3128600.33475-0.0676290.061574
5balance(979.333, 71188.0]150722512820.1493030.8506970.4318620.320500.2982230.061574
6campaign(0.999, 3.0]355643531210.1223280.8776720.8349330.780250.0677370.019351
7campaign(3.0, 50.0]965868790.0891190.9108810.1650670.21975-0.2861380.019351
8contactcellular289641624800.1436460.8563540.7984640.620000.2529710.247762
9contacttelephone301442570.1461790.8538210.0844530.064250.2734130.247762
\n", + "
" + ], + "text/plain": [ + " Variable_Name Category Count Event Non_Event Event_Rate \\\n", + "0 age (18.999, 35.0] 1652 197 1455 0.119249 \n", + "1 age (35.0, 45.0] 1388 129 1259 0.092939 \n", + "2 age (45.0, 87.0] 1481 195 1286 0.131668 \n", + "3 balance (-3313.001, 174.0] 1512 133 1379 0.087963 \n", + "4 balance (174.0, 979.333] 1502 163 1339 0.108522 \n", + "5 balance (979.333, 71188.0] 1507 225 1282 0.149303 \n", + "6 campaign (0.999, 3.0] 3556 435 3121 0.122328 \n", + "7 campaign (3.0, 50.0] 965 86 879 0.089119 \n", + "8 contact cellular 2896 416 2480 0.143646 \n", + "9 contact telephone 301 44 257 0.146179 \n", + "\n", + " Non_Event_Rate Event_Distribution Non_Event_Distribution WOE \\\n", + "0 0.880751 0.378119 0.36375 0.038742 \n", + "1 0.907061 0.247601 0.31475 -0.239961 \n", + "2 0.868332 0.374280 0.32150 0.152007 \n", + "3 0.912037 0.255278 0.34475 -0.300465 \n", + "4 0.891478 0.312860 0.33475 -0.067629 \n", + "5 0.850697 0.431862 0.32050 0.298223 \n", + "6 0.877672 0.834933 0.78025 0.067737 \n", + "7 0.910881 0.165067 0.21975 -0.286138 \n", + "8 0.856354 0.798464 0.62000 0.252971 \n", + "9 0.853821 0.084453 0.06425 0.273413 \n", + "\n", + " Information_Value \n", + "0 0.024693 \n", + "1 0.024693 \n", + "2 0.024693 \n", + "3 0.061574 \n", + "4 0.061574 \n", + "5 0.061574 \n", + "6 0.019351 \n", + "7 0.019351 \n", + "8 0.247762 \n", + "9 0.247762 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.woe_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Variable_NameInformation_Value
6duration1.160680
14poutcome0.461890
12month0.379533
3contact0.247762
13pdays0.203267
15previous0.177081
9job0.132519
8housing0.106556
1balance0.061574
10loan0.060791
11marital0.040090
7education0.031812
0age0.024693
2campaign0.019351
4day0.002816
5default0.000016
\n", + "
" + ], + "text/plain": [ + " Variable_Name Information_Value\n", + "6 duration 1.160680\n", + "14 poutcome 0.461890\n", + "12 month 0.379533\n", + "3 contact 0.247762\n", + "13 pdays 0.203267\n", + "15 previous 0.177081\n", + "9 job 0.132519\n", + "8 housing 0.106556\n", + "1 balance 0.061574\n", + "10 loan 0.060791\n", + "11 marital 0.040090\n", + "7 education 0.031812\n", + "0 age 0.024693\n", + "2 campaign 0.019351\n", + "4 day 0.002816\n", + "5 default 0.000016" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.iv_df" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "output_woe_bins = clf.woe_bins" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'age': {Interval(18.999, 35.0, closed='right'): 0.038742147481056366,\n", + " Interval(35.0, 45.0, closed='right'): -0.2399610313340142,\n", + " Interval(45.0, 87.0, closed='right'): 0.15200725211484276},\n", + " 'balance': {Interval(-3313.001, 174.0, closed='right'): -0.3004651512228873,\n", + " Interval(174.0, 979.333, closed='right'): -0.06762854653574929,\n", + " Interval(979.333, 71188.0, closed='right'): 0.2982233630724655},\n", + " 'campaign': {Interval(0.999, 3.0, closed='right'): 0.06773688716656528,\n", + " Interval(3.0, 50.0, closed='right'): -0.28613800308300846},\n", + " 'contact': {'cellular': 0.2529710194508961,\n", + " 'telephone': 0.273413147371702,\n", + " 'unknown': -0.992071659828519},\n", + " 'day': {Interval(0.999, 12.0, closed='right'): 0.0477170092417549,\n", + " Interval(12.0, 20.0, closed='right'): -0.07298254918683383,\n", + " Interval(20.0, 31.0, closed='right'): 0.024262753694244996},\n", + " 'default': {'no': -0.0005335490436409652, 'yes': 0.030831556293914045},\n", + " 'duration': {Interval(3.999, 128.0, closed='right'): -1.9750759013397727,\n", + " 
Interval(128.0, 261.333, closed='right'): -0.5230487469392816,\n", + " Interval(261.333, 3025.0, closed='right'): 0.9756403270893023},\n", + " 'education': {'primary': -0.22281224643880992,\n", + " 'secondary': -0.0913887719970712,\n", + " 'tertiary': 0.24740406005986976,\n", + " 'unknown': -0.14122540188815785},\n", + " 'housing': {'no': 0.3302347534842417, 'yes': -0.32555162188315667},\n", + " 'job': {'admin.': 0.05848789761766609,\n", + " 'blue-collar': -0.5041008894262629,\n", + " 'entrepreneur': -0.2840881219415645,\n", + " 'housemaid': 0.09238944929334744,\n", + " 'management': 0.1824788210677292,\n", + " 'retired': 0.8567996498747835,\n", + " 'self-employed': -0.05971832890411075,\n", + " 'services': -0.2616504470073798,\n", + " 'student': 0.808351307619464,\n", + " 'technician': -0.07227863211696668,\n", + " 'unemployed': -0.14168317255305263,\n", + " 'unknown': 0.5502225429188278},\n", + " 'loan': {'no': 0.09059786923814311, 'yes': -0.6743909823100516},\n", + " 'marital': {'divorced': 0.27063768069966615,\n", + " 'married': -0.16969707596946979,\n", + " 'single': 0.21995067493136639},\n", + " 'month': {'apr': 0.5955911479486788,\n", + " 'aug': 0.09058276406839828,\n", + " 'dec': 1.8376289028865098,\n", + " 'feb': 0.4609500004660608,\n", + " 'jan': -0.071913601997929,\n", + " 'jul': -0.32007685427380045,\n", + " 'jun': -0.11978507065028823,\n", + " 'mar': 1.75061752589688,\n", + " 'may': -0.6030592282548771,\n", + " 'nov': -0.15607191000515222,\n", + " 'oct': 1.8880173952993229,\n", + " 'sep': 1.316164880915463},\n", + " 'pdays': {Interval(-1.001, -0.5, closed='right'): -0.26369184666119494,\n", + " Interval(-0.5, 1.0, closed='right'): 0.0,\n", + " Interval(1.0, 871.0, closed='right'): 0.7934168912785986},\n", + " 'poutcome': {'failure': 0.12465031151156883,\n", + " 'other': 0.606981555854815,\n", + " 'success': 2.628498809656164,\n", + " 'unknown': -0.26369184666119494},\n", + " 'previous': {Interval(-0.001, 1.0, closed='right'): -0.19021717277615902,\n", 
+ " Interval(1.0, 25.0, closed='right'): 0.944712445883224}}" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_woe_bins" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "output_mono_bins = clf.mono_custom_binning" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'age': array([19., 35., 45., 87.]),\n", + " 'balance': array([-3313. , 174. , 979.33333333, 71188. ]),\n", + " 'campaign': array([ 1., 3., 50.]),\n", + " 'day': array([ 1., 12., 20., 31.]),\n", + " 'duration': array([ 4. , 128. , 261.33333333, 3025. ]),\n", + " 'pdays': array([-1.00e+00, -5.00e-01, 1.00e+00, 8.71e+02]),\n", + " 'previous': array([ 0., 1., 25.])}" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_mono_bins" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agebalancecampaigncontactdaydefaultdurationeducationhousingjobloanmaritalmonthpdayspoutcomeprevious
00.0387420.2982230.0677370.252971-0.072983-0.000534-1.975076-0.2228120.330235-0.1416830.090598-0.1696971.888017-0.263692-0.263692-0.190217
10.0387420.2982230.0677370.2529710.047717-0.000534-0.523049-0.091389-0.325552-0.261650-0.674391-0.169697-0.6030590.7934170.1246500.944712
20.0387420.2982230.0677370.252971-0.072983-0.000534-0.5230490.247404-0.3255520.1824790.0905980.2199510.5955910.7934170.124650-0.190217
30.0387420.298223-0.286138-0.9920720.047717-0.000534-0.5230490.247404-0.3255520.182479-0.674391-0.169697-0.119785-0.263692-0.263692-0.190217
40.152007-0.3004650.067737-0.9920720.047717-0.000534-0.523049-0.091389-0.325552-0.5041010.090598-0.169697-0.603059-0.263692-0.263692-0.190217
\n", + "
" + ], + "text/plain": [ + " age balance campaign contact day default duration \\\n", + "0 0.038742 0.298223 0.067737 0.252971 -0.072983 -0.000534 -1.975076 \n", + "1 0.038742 0.298223 0.067737 0.252971 0.047717 -0.000534 -0.523049 \n", + "2 0.038742 0.298223 0.067737 0.252971 -0.072983 -0.000534 -0.523049 \n", + "3 0.038742 0.298223 -0.286138 -0.992072 0.047717 -0.000534 -0.523049 \n", + "4 0.152007 -0.300465 0.067737 -0.992072 0.047717 -0.000534 -0.523049 \n", + "\n", + " education housing job loan marital month pdays \\\n", + "0 -0.222812 0.330235 -0.141683 0.090598 -0.169697 1.888017 -0.263692 \n", + "1 -0.091389 -0.325552 -0.261650 -0.674391 -0.169697 -0.603059 0.793417 \n", + "2 0.247404 -0.325552 0.182479 0.090598 0.219951 0.595591 0.793417 \n", + "3 0.247404 -0.325552 0.182479 -0.674391 -0.169697 -0.119785 -0.263692 \n", + "4 -0.091389 -0.325552 -0.504101 0.090598 -0.169697 -0.603059 -0.263692 \n", + "\n", + " poutcome previous \n", + "0 -0.263692 -0.190217 \n", + "1 0.124650 0.944712 \n", + "2 0.124650 -0.190217 \n", + "3 -0.263692 -0.190217 \n", + "4 -0.263692 -0.190217 " + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.transform(X).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "## 4.1 Available options in the package for WOE" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + " Parameters\n", + " ----------\n", + " feature_names: 'all' or list (default='all')\n", + " list of features to perform WOE transformation. 
\n", + " - 'all' (default): All categorical features in the dataset will be used\n", + " - list of features: ['age', 'income',......]\n", + " \n", + " exclude_features: list (default=None)\n", + " list of features to be excluded from WOE transformation.\n", + " - Example - ['age', 'income', .......]\n", + " \n", + " woe_prefix: string (default=None)\n", + " Variable prefix to be used for the column created by WOE transformer. The default value is set 'None'. \n", + " \n", + " treat_missing: {'separate', 'mode', 'least_frequent'} (default='separate')\n", + " This parameter setting is used to handle missing values in the dataset.\n", + " 'separate' - Missing values are treated as a own group (category)\n", + " 'mode' - Missing values are combined with the highest frequent item in the dataset\n", + " 'least_frequent' - Missing values are combined with the least frequent item in the dataset\n", + " \n", + " woe_bins: dict of dicts(default=None)\n", + " This feature is added as part of future WOE transformations or scoring. If this value is set, then WOE values provided for each of the features here will be used for transformation. Applicable only in the transform method. \n", + " Dictionary structure - {'feature_name': float list}\n", + " Example - {'education': {'primary' : 0.1, 'tertiary' : 0.5, 'secondary', 0.7}}\n", + " \n", + " monotonic_binning: bool (default=True)\n", + " This parameter is used to perform monotonic binning on numeric variables. If set to False, numeric variables would be ignored.\n", + " \n", + " mono_feature_names: 'all' or list (default='all')\n", + " list of features to perform monotonic binning operation. \n", + " - 'all' (default): All features in the dataset will be used\n", + " - list of features: ['age', 'income',......]\n", + " \n", + " mono_max_bins: int (default=20)\n", + " Maximum number of bins that can be created for any given variable. 
The final number of bins created will be less than or equal to this number.\n", + " \n", + " mono_force_bins: int (default=3)\n", + " It forces the module to create bins for a variable, when it cannot find a monotonic relationship using the \"mono_max_bins\" option. The final number of bins created will be equal to the number specified.\n", + " \n", + " mono_cardinality_cutoff: int (default=5)\n", + " Cutoff to determine if a variable is eligible for monotonic binning operation. Any variable with fewer unique levels than this number will be treated as a character variable. At this point no binning operation will be performed on the variable and it will return the unique levels as bins for these variables.\n", + " \n", + " mono_prefix: string (default=None)\n", + " Variable prefix to be used for the column created by monotonic binning. \n", + " \n", + " mono_custom_binning: dict (default=None)\n", + " Using this parameter, the user can perform custom binning on variables. This parameter is also used to apply previously computed bins for each feature (Score new data).\n", + " Dictionary structure - {'feature_name': float list}\n", + " Example - {'age': [0., 1., 2., 3.]}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "## 4.2 Using the custom binning option in the future to score new data - WOE" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "If you want to apply the bins to new data, then simply use the transform function with the binning option available in WOE." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "X, y = prep_dataset()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agebalancecampaigncontactdaydefaultdurationeducationhousingjobloanmaritalmonthpdayspoutcomeprevious
00.0387420.2982230.0677370.252971-0.072983-0.000534-1.975076-0.2228120.330235-0.1416830.090598-0.1696971.888017-0.263692-0.263692-0.190217
10.0387420.2982230.0677370.2529710.047717-0.000534-0.523049-0.091389-0.325552-0.261650-0.674391-0.169697-0.6030590.7934170.1246500.944712
20.0387420.2982230.0677370.252971-0.072983-0.000534-0.5230490.247404-0.3255520.1824790.0905980.2199510.5955910.7934170.124650-0.190217
30.0387420.298223-0.286138-0.9920720.047717-0.000534-0.5230490.247404-0.3255520.182479-0.674391-0.169697-0.119785-0.263692-0.263692-0.190217
40.152007-0.3004650.067737-0.9920720.047717-0.000534-0.523049-0.091389-0.325552-0.5041010.090598-0.169697-0.603059-0.263692-0.263692-0.190217
\n", + "
" + ], + "text/plain": [ + " age balance campaign contact day default duration \\\n", + "0 0.038742 0.298223 0.067737 0.252971 -0.072983 -0.000534 -1.975076 \n", + "1 0.038742 0.298223 0.067737 0.252971 0.047717 -0.000534 -0.523049 \n", + "2 0.038742 0.298223 0.067737 0.252971 -0.072983 -0.000534 -0.523049 \n", + "3 0.038742 0.298223 -0.286138 -0.992072 0.047717 -0.000534 -0.523049 \n", + "4 0.152007 -0.300465 0.067737 -0.992072 0.047717 -0.000534 -0.523049 \n", + "\n", + " education housing job loan marital month pdays \\\n", + "0 -0.222812 0.330235 -0.141683 0.090598 -0.169697 1.888017 -0.263692 \n", + "1 -0.091389 -0.325552 -0.261650 -0.674391 -0.169697 -0.603059 0.793417 \n", + "2 0.247404 -0.325552 0.182479 0.090598 0.219951 0.595591 0.793417 \n", + "3 0.247404 -0.325552 0.182479 -0.674391 -0.169697 -0.119785 -0.263692 \n", + "4 -0.091389 -0.325552 -0.504101 0.090598 -0.169697 -0.603059 -0.263692 \n", + "\n", + " poutcome previous \n", + "0 -0.263692 -0.190217 \n", + "1 0.124650 0.944712 \n", + "2 0.124650 -0.190217 \n", + "3 -0.263692 -0.190217 \n", + "4 -0.263692 -0.190217 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf = WOE(woe_bins=output_woe_bins, mono_custom_binning=output_mono_bins) #output_bins was created earlier\n", + "\n", + "out_X = clf.transform(X)\n", + "out_X.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "## 4.3 What happens if my data has missing values?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use one of the options below to handle missing values." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + " Parameters\n", + " ----------\n", + " treat_missing: {'separate', 'mode', 'least_frequent'} (default='separate')\n", + " This parameter setting is used to handle missing values in the dataset.\n", + " 'separate' - Missing values are treated as their own group (category)\n", + " 'mode' - Missing values are combined with the most frequent item in the dataset\n", + " 'least_frequent' - Missing values are combined with the least frequent item in the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "X, y = prep_dataset()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "clf = WOE(treat_missing='mode')" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "WOE(exclude_features=None, feature_names='all', mono_cardinality_cutoff=5,\n", + " mono_custom_binning={'pdays': array([-1.00e+00, -5.00e-01, 1.00e+00, 8.71e+02]), 'previous': array([ 0., 1., 25.]), 'age': array([19., 35., 45., 87.]), 'day': array([ 1., 12., 20., 31.]), 'duration': array([ 4. , 128. , 261.33333, 3025. ]), 'balance': array([-3313. , 174. , 979.33333, 71188. 
]), 'campaign': array([ 1., 3., 50.])},\n", + " mono_feature_names='all', mono_force_bins=3, mono_max_bins=20,\n", + " mono_prefix=None, monotonic_binning=True, treat_missing='mode',\n", + " woe_bins={'contact': {'cellular': 0.2529710194508961, 'telephone': 0.273413147371702, 'unknown': -0.992071659828519}, 'loan': {'no': 0.09059786923814311, 'yes': -0.6743909823100516}, 'default': {'no': -0.0005335490436409652, 'yes': 0.030831556293914045}, 'marital': {'divorced': 0.27063768069966615, ..., closed='right'): 0.024262753694244996, Interval(0.999, 12.0, closed='right'): 0.0477170092417549}},\n", + " woe_prefix=None)" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agebalancecampaigncontactdaydefaultdurationeducationhousingjobloanmaritalmonthpdayspoutcomeprevious
00.0387420.2982230.0677370.252971-0.072983-0.000534-1.975076-0.2228120.330235-0.1416830.090598-0.1696971.888017-0.263692-0.263692-0.190217
10.0387420.2982230.0677370.2529710.047717-0.000534-0.523049-0.091389-0.325552-0.261650-0.674391-0.169697-0.6030590.7934170.1246500.944712
20.0387420.2982230.0677370.252971-0.072983-0.000534-0.5230490.247404-0.3255520.1824790.0905980.2199510.5955910.7934170.124650-0.190217
30.0387420.298223-0.286138-0.9920720.047717-0.000534-0.5230490.247404-0.3255520.182479-0.674391-0.169697-0.119785-0.263692-0.263692-0.190217
40.152007-0.3004650.067737-0.9920720.047717-0.000534-0.523049-0.091389-0.325552-0.5041010.090598-0.169697-0.603059-0.263692-0.263692-0.190217
\n", + "
" + ], + "text/plain": [ + " age balance campaign contact day default duration \\\n", + "0 0.038742 0.298223 0.067737 0.252971 -0.072983 -0.000534 -1.975076 \n", + "1 0.038742 0.298223 0.067737 0.252971 0.047717 -0.000534 -0.523049 \n", + "2 0.038742 0.298223 0.067737 0.252971 -0.072983 -0.000534 -0.523049 \n", + "3 0.038742 0.298223 -0.286138 -0.992072 0.047717 -0.000534 -0.523049 \n", + "4 0.152007 -0.300465 0.067737 -0.992072 0.047717 -0.000534 -0.523049 \n", + "\n", + " education housing job loan marital month pdays \\\n", + "0 -0.222812 0.330235 -0.141683 0.090598 -0.169697 1.888017 -0.263692 \n", + "1 -0.091389 -0.325552 -0.261650 -0.674391 -0.169697 -0.603059 0.793417 \n", + "2 0.247404 -0.325552 0.182479 0.090598 0.219951 0.595591 0.793417 \n", + "3 0.247404 -0.325552 0.182479 -0.674391 -0.169697 -0.119785 -0.263692 \n", + "4 -0.091389 -0.325552 -0.504101 0.090598 -0.169697 -0.603059 -0.263692 \n", + "\n", + " poutcome previous \n", + "0 -0.263692 -0.190217 \n", + "1 0.124650 0.944712 \n", + "2 0.124650 -0.190217 \n", + "3 -0.263692 -0.190217 \n", + "4 -0.263692 -0.190217 " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.transform(X).head()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "clf.woe_prefix = 'woe' #use this if you want to create a new column instead of replacing the existing column" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agebalancecampaigncontactdaydefaultdurationeducationhousingjob...woe_agewoe_durationwoe_balancewoe_previouswoe_jobwoe_housingwoe_poutcomewoe_campaignwoe_monthwoe_day
0(18.999, 35.0](979.333, 71188.0](0.999, 3.0]cellular(12.0, 20.0]no(3.999, 128.0]primarynounemployed...0.038742-1.9750760.298223-0.190217-0.1416830.330235-0.2636920.0677371.888017-0.072983
1(18.999, 35.0](979.333, 71188.0](0.999, 3.0]cellular(0.999, 12.0]no(128.0, 261.333]secondaryyesservices...0.038742-0.5230490.2982230.944712-0.261650-0.3255520.1246500.067737-0.6030590.047717
2(18.999, 35.0](979.333, 71188.0](0.999, 3.0]cellular(12.0, 20.0]no(128.0, 261.333]tertiaryyesmanagement...0.038742-0.5230490.298223-0.1902170.182479-0.3255520.1246500.0677370.595591-0.072983
3(18.999, 35.0](979.333, 71188.0](3.0, 50.0]unknown(0.999, 12.0]no(128.0, 261.333]tertiaryyesmanagement...0.038742-0.5230490.298223-0.1902170.182479-0.325552-0.263692-0.286138-0.1197850.047717
4(45.0, 87.0](-3313.001, 174.0](0.999, 3.0]unknown(0.999, 12.0]no(128.0, 261.333]secondaryyesblue-collar...0.152007-0.523049-0.300465-0.190217-0.504101-0.325552-0.2636920.067737-0.6030590.047717
\n", + "

5 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " age balance campaign contact day \\\n", + "0 (18.999, 35.0] (979.333, 71188.0] (0.999, 3.0] cellular (12.0, 20.0] \n", + "1 (18.999, 35.0] (979.333, 71188.0] (0.999, 3.0] cellular (0.999, 12.0] \n", + "2 (18.999, 35.0] (979.333, 71188.0] (0.999, 3.0] cellular (12.0, 20.0] \n", + "3 (18.999, 35.0] (979.333, 71188.0] (3.0, 50.0] unknown (0.999, 12.0] \n", + "4 (45.0, 87.0] (-3313.001, 174.0] (0.999, 3.0] unknown (0.999, 12.0] \n", + "\n", + " default duration education housing job ... woe_age \\\n", + "0 no (3.999, 128.0] primary no unemployed ... 0.038742 \n", + "1 no (128.0, 261.333] secondary yes services ... 0.038742 \n", + "2 no (128.0, 261.333] tertiary yes management ... 0.038742 \n", + "3 no (128.0, 261.333] tertiary yes management ... 0.038742 \n", + "4 no (128.0, 261.333] secondary yes blue-collar ... 0.152007 \n", + "\n", + " woe_duration woe_balance woe_previous woe_job woe_housing woe_poutcome \\\n", + "0 -1.975076 0.298223 -0.190217 -0.141683 0.330235 -0.263692 \n", + "1 -0.523049 0.298223 0.944712 -0.261650 -0.325552 0.124650 \n", + "2 -0.523049 0.298223 -0.190217 0.182479 -0.325552 0.124650 \n", + "3 -0.523049 0.298223 -0.190217 0.182479 -0.325552 -0.263692 \n", + "4 -0.523049 -0.300465 -0.190217 -0.504101 -0.325552 -0.263692 \n", + "\n", + " woe_campaign woe_month woe_day \n", + "0 0.067737 1.888017 -0.072983 \n", + "1 0.067737 -0.603059 0.047717 \n", + "2 0.067737 0.595591 -0.072983 \n", + "3 -0.286138 -0.119785 0.047717 \n", + "4 0.067737 -0.603059 0.047717 \n", + "\n", + "[5 rows x 32 columns]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.transform(X).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# 5. VotingSelector" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Select the input features for a binary model prediction using voting technique. 
Apply multiple feature selection techniques (linear and non-linear) on the dataset and calculate the votes secured by each input feature for a given binary target." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "X, y = prep_dataset()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "VotingSelector(exclude_features=None, feature_names='all',\n", + " handle_category='woe', minimum_votes=0, no_of_features=8,\n", + " numerical_missing_values='median',\n", + " selection_techniques=['WOE', 'RF', 'RFE', 'ETC', 'CS', 'L_ONE'])" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from xverse.ensemble import VotingSelector\n", + "\n", + "clf = VotingSelector()\n", + "clf.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['WOE', 'RF', 'RFE', 'ETC', 'CS', 'L_ONE']" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.available_techniques" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Variable_NameInformation_ValueRandom_ForestRecursive_Feature_EliminationExtra_TreesChi_SquareL_One
0duration1.1606800.2906760.0000000.23976762.5304560.000983
1poutcome0.4618900.0603820.8149540.084037209.1788690.278842
2month0.3795330.0921740.6270710.10317954.8101150.187638
3contact0.2477620.0235810.4559490.02690325.3579480.048762
4pdays0.2032670.0494450.0000000.03401513.808926-0.000269
5previous0.1770810.0171230.0000000.02247313.0192780.000000
6job0.1325190.0483040.5207960.06227713.0433200.112797
7housing0.1065560.0212620.2813560.02288428.0430940.000000
8balance0.0615740.1061340.0000000.0811350.037203-0.000001
9loan0.0607910.0086420.6414810.0117213.4361030.000000
10marital0.0400900.0245700.9140680.03611710.8897500.000000
11education0.0318120.0277270.2152910.0418874.7058880.000000
12age0.0246930.1005300.0000000.0815150.681895-0.004415
13campaign0.0193510.0380360.0000000.0562141.859657-0.012652
14day0.0028160.0877840.0000000.0911160.086877-0.002313
15default0.0000160.0036300.0000000.0047610.0075430.000000
\n", + "
" + ], + "text/plain": [ + " Variable_Name Information_Value Random_Forest \\\n", + "0 duration 1.160680 0.290676 \n", + "1 poutcome 0.461890 0.060382 \n", + "2 month 0.379533 0.092174 \n", + "3 contact 0.247762 0.023581 \n", + "4 pdays 0.203267 0.049445 \n", + "5 previous 0.177081 0.017123 \n", + "6 job 0.132519 0.048304 \n", + "7 housing 0.106556 0.021262 \n", + "8 balance 0.061574 0.106134 \n", + "9 loan 0.060791 0.008642 \n", + "10 marital 0.040090 0.024570 \n", + "11 education 0.031812 0.027727 \n", + "12 age 0.024693 0.100530 \n", + "13 campaign 0.019351 0.038036 \n", + "14 day 0.002816 0.087784 \n", + "15 default 0.000016 0.003630 \n", + "\n", + " Recursive_Feature_Elimination Extra_Trees Chi_Square L_One \n", + "0 0.000000 0.239767 62.530456 0.000983 \n", + "1 0.814954 0.084037 209.178869 0.278842 \n", + "2 0.627071 0.103179 54.810115 0.187638 \n", + "3 0.455949 0.026903 25.357948 0.048762 \n", + "4 0.000000 0.034015 13.808926 -0.000269 \n", + "5 0.000000 0.022473 13.019278 0.000000 \n", + "6 0.520796 0.062277 13.043320 0.112797 \n", + "7 0.281356 0.022884 28.043094 0.000000 \n", + "8 0.000000 0.081135 0.037203 -0.000001 \n", + "9 0.641481 0.011721 3.436103 0.000000 \n", + "10 0.914068 0.036117 10.889750 0.000000 \n", + "11 0.215291 0.041887 4.705888 0.000000 \n", + "12 0.000000 0.081515 0.681895 -0.004415 \n", + "13 0.000000 0.056214 1.859657 -0.012652 \n", + "14 0.000000 0.091116 0.086877 -0.002313 \n", + "15 0.000000 0.004761 0.007543 0.000000 " + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.feature_importances_" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Variable_NameInformation_ValueRandom_ForestRecursive_Feature_EliminationExtra_TreesChi_SquareL_OneVotes
1poutcome1111116
2month1111116
6job1111116
0duration1101115
3contact1010114
4pdays1100103
7housing1010103
12age0101013
14day0101013
5previous1000102
8balance0101002
13campaign0001012
9loan0010001
10marital0010001
11education0010001
15default0000000
\n", + "
" + ], + "text/plain": [ + " Variable_Name Information_Value Random_Forest \\\n", + "1 poutcome 1 1 \n", + "2 month 1 1 \n", + "6 job 1 1 \n", + "0 duration 1 1 \n", + "3 contact 1 0 \n", + "4 pdays 1 1 \n", + "7 housing 1 0 \n", + "12 age 0 1 \n", + "14 day 0 1 \n", + "5 previous 1 0 \n", + "8 balance 0 1 \n", + "13 campaign 0 0 \n", + "9 loan 0 0 \n", + "10 marital 0 0 \n", + "11 education 0 0 \n", + "15 default 0 0 \n", + "\n", + " Recursive_Feature_Elimination Extra_Trees Chi_Square L_One Votes \n", + "1 1 1 1 1 6 \n", + "2 1 1 1 1 6 \n", + "6 1 1 1 1 6 \n", + "0 0 1 1 1 5 \n", + "3 1 0 1 1 4 \n", + "4 0 0 1 0 3 \n", + "7 1 0 1 0 3 \n", + "12 0 1 0 1 3 \n", + "14 0 1 0 1 3 \n", + "5 0 0 1 0 2 \n", + "8 0 1 0 0 2 \n", + "13 0 1 0 1 2 \n", + "9 1 0 0 0 1 \n", + "10 1 0 0 0 1 \n", + "11 1 0 0 0 1 \n", + "15 0 0 0 0 0 " + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.feature_votes_" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
poutcomemonthjobdurationcontactpdayshousingagedaypreviousbalancecampaignloanmaritaleducationdefault
0-0.2636921.888017-0.141683790.252971-10.33023530190178710.090598-0.169697-0.222812-0.000534
10.124650-0.603059-0.2616502200.252971339-0.3255523311447891-0.674391-0.169697-0.091389-0.000534
20.1246500.5955910.1824791850.252971330-0.32555235161135010.0905980.2199510.247404-0.000534
3-0.263692-0.1197850.182479199-0.992072-1-0.325552303014764-0.674391-0.1696970.247404-0.000534
4-0.263692-0.603059-0.504101226-0.992072-1-0.3255525950010.090598-0.169697-0.091389-0.000534
\n", + "
" + ], + "text/plain": [ + " poutcome month job duration contact pdays housing age \\\n", + "0 -0.263692 1.888017 -0.141683 79 0.252971 -1 0.330235 30 \n", + "1 0.124650 -0.603059 -0.261650 220 0.252971 339 -0.325552 33 \n", + "2 0.124650 0.595591 0.182479 185 0.252971 330 -0.325552 35 \n", + "3 -0.263692 -0.119785 0.182479 199 -0.992072 -1 -0.325552 30 \n", + "4 -0.263692 -0.603059 -0.504101 226 -0.992072 -1 -0.325552 59 \n", + "\n", + " day previous balance campaign loan marital education default \n", + "0 19 0 1787 1 0.090598 -0.169697 -0.222812 -0.000534 \n", + "1 11 4 4789 1 -0.674391 -0.169697 -0.091389 -0.000534 \n", + "2 16 1 1350 1 0.090598 0.219951 0.247404 -0.000534 \n", + "3 3 0 1476 4 -0.674391 -0.169697 0.247404 -0.000534 \n", + "4 5 0 0 1 0.090598 -0.169697 -0.091389 -0.000534 " + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.transform(X).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5.1 Available options in the package for VotingSelector" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " Parameters\n", + " ----------\n", + " \n", + " feature_names: 'all' or list (default='all')\n", + " list of features to perform WOE transformation. \n", + " 'all' (default) - All categorical features in the dataset will be used\n", + " list of features - ['age', 'income',......]\n", + " \n", + " exclude_features: list (default=None)\n", + " list of features to be excluded from WOE transformation.\n", + " - Example - ['age', 'income', .......]\n", + " \n", + " selection_techniques: 'all', 'quick' or list(default='all')\n", + " List of selection techniques to be applied on the data. 
Available techniques - Weight of evidence ('WOE'), Random Forest ('RF'), Recursive Feature Elimination ('RFE'), Extra Trees Classifier ('ETC'), Chi Square ('CS'), L1 feature selection ('L_ONE').\n", + " \n", + " 'all' - Apply all selection techniques ['WOE', 'RF', 'RFE', 'ETC', 'CS', 'L_ONE']\n", + " 'quick' - ['WOE','RF','ETC']\n", + " list - user provided list of feature selection techniques from available techniques \n", + " \n", + " no_of_features: 'auto', 'sqrt' or int(default='auto')\n", + " Number of features to be selected by each selection technique.\n", + " 'auto' - len(features)/2\n", + " 'sqrt' - sqrt(len(features)) rounded down to the nearest integer\n", + " int - user provided number in integer format\n", + " \n", + " handle_category= 'woe' or 'le' (default='woe')\n", + " Handle category values transformation using Label encoder or Weight of Evidence option. Takes care of missing values too. It treats missing values as a separate level.\n", + " 'woe' - use weight of evidence transformation\n", + " 'le' - use label encoder transformation\n", + " \n", + " numerical_missing_values= 'median', 'mean' or 0 (default='median')\n", + " Handle numerical variable missing values.\n", + " 'median' - use median of the column\n", + " 'mean' - use mean of the column\n", + " 0 - use 0 to impute the missing values\n", + " \n", + " minimum_votes = int (default=0)\n", + " Minimum number of votes needed to select a variable after feature selection. Only used in the transform process. Default value is set to 0 to select all variables." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5.2 Future transformation and select variables with minimum number of votes" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "clf.minimum_votes = 3 #select variables which got at least 3 votes" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
poutcomemonthjobdurationcontactpdayshousingageday
0-0.2636921.888017-0.141683790.252971-10.3302353019
10.124650-0.603059-0.2616502200.252971339-0.3255523311
20.1246500.5955910.1824791850.252971330-0.3255523516
3-0.263692-0.1197850.182479199-0.992072-1-0.325552303
4-0.263692-0.603059-0.504101226-0.992072-1-0.325552595
\n", + "
" + ], + "text/plain": [ + " poutcome month job duration contact pdays housing age day\n", + "0 -0.263692 1.888017 -0.141683 79 0.252971 -1 0.330235 30 19\n", + "1 0.124650 -0.603059 -0.261650 220 0.252971 339 -0.325552 33 11\n", + "2 0.124650 0.595591 0.182479 185 0.252971 330 -0.325552 35 16\n", + "3 -0.263692 -0.119785 0.182479 199 -0.992072 -1 -0.325552 30 3\n", + "4 -0.263692 -0.603059 -0.504101 226 -0.992072 -1 -0.325552 59 5" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.transform(X).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5.3 Subset feature selection option" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "VotingSelector(exclude_features=None, feature_names='all',\n", + " handle_category='woe', minimum_votes=0, no_of_features=8,\n", + " numerical_missing_values='median',\n", + " selection_techniques=['WOE', 'RF', 'RFE', 'ETC'])" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf = VotingSelector(selection_techniques=['WOE', 'RF', 'RFE', 'ETC'])\n", + "clf.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Variable_NameInformation_ValueRandom_ForestRecursive_Feature_EliminationExtra_TreesVotes
1poutcome11114
2month11114
6job11114
0duration11013
3contact10102
4pdays11002
7housing10102
8balance01012
12age01012
14day01012
5previous10001
9loan00101
10marital00101
11education00101
13campaign00011
15default00000
\n", + "
" + ], + "text/plain": [ + " Variable_Name Information_Value Random_Forest \\\n", + "1 poutcome 1 1 \n", + "2 month 1 1 \n", + "6 job 1 1 \n", + "0 duration 1 1 \n", + "3 contact 1 0 \n", + "4 pdays 1 1 \n", + "7 housing 1 0 \n", + "8 balance 0 1 \n", + "12 age 0 1 \n", + "14 day 0 1 \n", + "5 previous 1 0 \n", + "9 loan 0 0 \n", + "10 marital 0 0 \n", + "11 education 0 0 \n", + "13 campaign 0 0 \n", + "15 default 0 0 \n", + "\n", + " Recursive_Feature_Elimination Extra_Trees Votes \n", + "1 1 1 4 \n", + "2 1 1 4 \n", + "6 1 1 4 \n", + "0 0 1 3 \n", + "3 1 0 2 \n", + "4 0 0 2 \n", + "7 1 0 2 \n", + "8 0 1 2 \n", + "12 0 1 2 \n", + "14 0 1 2 \n", + "5 0 0 1 \n", + "9 1 0 1 \n", + "10 1 0 1 \n", + "11 1 0 1 \n", + "13 0 1 1 \n", + "15 0 0 0 " + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.feature_votes_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 6. Pipeline feature" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "\n", + "clf = Pipeline(steps=[('split_x_y', SplitXY(['target'])),('feature_votes', VotingSelector())])" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('split_x_y', SplitXY(feature_names=['target'])), ('feature_votes', VotingSelector(exclude_features=None, feature_names='all',\n", + " handle_category='woe', minimum_votes=0, no_of_features=8,\n", + " numerical_missing_values='median',\n", + " selection_techniques=['WOE', 'RF', 'RFE', 'ETC', 'CS', 'L_ONE']))])" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.fit(df, df['target'])" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
poutcomemonthjobdurationcontactpdayshousingagedaypreviousbalancecampaignloanmaritaleducationdefault
0-0.2636921.888017-0.141683790.252971-10.33023530190178710.090598-0.169697-0.222812-0.000534
10.124650-0.603059-0.2616502200.252971339-0.3255523311447891-0.674391-0.169697-0.091389-0.000534
20.1246500.5955910.1824791850.252971330-0.32555235161135010.0905980.2199510.247404-0.000534
3-0.263692-0.1197850.182479199-0.992072-1-0.325552303014764-0.674391-0.1696970.247404-0.000534
4-0.263692-0.603059-0.504101226-0.992072-1-0.3255525950010.090598-0.169697-0.091389-0.000534
\n", + "
" + ], + "text/plain": [ + " poutcome month job duration contact pdays housing age \\\n", + "0 -0.263692 1.888017 -0.141683 79 0.252971 -1 0.330235 30 \n", + "1 0.124650 -0.603059 -0.261650 220 0.252971 339 -0.325552 33 \n", + "2 0.124650 0.595591 0.182479 185 0.252971 330 -0.325552 35 \n", + "3 -0.263692 -0.119785 0.182479 199 -0.992072 -1 -0.325552 30 \n", + "4 -0.263692 -0.603059 -0.504101 226 -0.992072 -1 -0.325552 59 \n", + "\n", + " day previous balance campaign loan marital education default \n", + "0 19 0 1787 1 0.090598 -0.169697 -0.222812 -0.000534 \n", + "1 11 4 4789 1 -0.674391 -0.169697 -0.091389 -0.000534 \n", + "2 16 1 1350 1 0.090598 0.219951 0.247404 -0.000534 \n", + "3 3 0 1476 4 -0.674391 -0.169697 0.247404 -0.000534 \n", + "4 5 0 0 1 0.090598 -0.169697 -0.091389 -0.000534 " + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.transform(df).head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/example_woe.csv b/example_woe.csv new file mode 100644 index 0000000..08d3940 --- /dev/null +++ b/example_woe.csv @@ -0,0 +1,17 @@ +,Variable_Name,Information_Value,Random_Forest,Recursive_Feature_Elimination,Extra_Trees,Chi_Square,L_One,Votes +1,poutcome,1,1,1,1,1,1,6 +2,month,1,1,1,1,1,1,6 +6,job,1,1,1,1,1,1,6 +0,duration,1,1,0,1,1,1,5 
+3,contact,1,0,1,0,1,1,4
+4,pdays,1,1,0,0,1,0,3
+7,housing,1,0,1,0,1,0,3
+12,age,0,1,0,1,0,1,3
+14,day,0,1,0,1,0,1,3
+5,previous,1,0,0,0,1,0,2
+8,balance,0,1,0,1,0,0,2
+13,campaign,0,0,0,1,0,1,2
+9,loan,0,0,1,0,0,0,1
+10,marital,0,0,1,0,0,0,1
+11,education,0,0,1,0,0,0,1
+15,default,0,0,0,0,0,0,0
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..a157981
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+numpy>=1.11.3
+scikit-learn>=0.19.0
+scipy>=0.19.0
+statsmodels>=0.6.1
+pandas>=0.21.1
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..6a3e4f3
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,30 @@
+import setuptools
+
+with open("README.md", "r", encoding="utf-8") as f:  # explicit encoding: do not rely on the platform default
+    long_description = f.read()
+
+setuptools.setup(
+    name="xverse",  # distribution name (the one used by `pip install xverse`)
+    version="1.0.0",
+    author="Sundar Krishnan",
+    author_email="sundarstyles89@gmail.com",
+    description="Xverse (X-uniVerse) is collection of transformers for feature engineering and feature selection",
+    long_description=long_description,  # README.md is reused verbatim as the long description
+    long_description_content_type="text/markdown",
+    url="https://github.com/Sundar0989/xverse",
+    packages=setuptools.find_packages(),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires='>=3.5',  # PEP 440 allows the ".*" suffix only with == and !=, so ">=3.5.*" is invalid
+    license='MIT',
+    install_requires=[  # same pins as requirements.txt; keep the two lists in sync
+        'numpy>=1.11.3',
+        'scikit-learn>=0.19.0',
+        'scipy>=0.19.0',
+        'statsmodels>=0.6.1',
+        'pandas>=0.21.1'
+    ]
+)
\ No newline at end of file