#!/usr/bin/env python
# coding: utf-8

# # Feature Selection for Uplift Modeling
#   
#     
# This notebook includes two sections:  
# - **Feature selection**: demonstrate how to use Filter methods to select the most important numeric features
# - **Performance evaluation**: evaluate the AUUC performance of uplift models trained on the top-ranked features only
#   
# *(Paper reference: [Zhao, Zhenyu, et al. "Feature Selection Methods for Uplift Modeling." arXiv preprint arXiv:2005.03447 (2020).](https://arxiv.org/abs/2005.03447))*

# In[1]:


import numpy as np
import pandas as pd


# In[2]:


from causalml.dataset import make_uplift_classification


# #### Import FilterSelect class for Filter methods

# In[3]:


from causalml.feature_selection.filters import FilterSelect


# In[4]:


from causalml.inference.tree import UpliftRandomForestClassifier
from causalml.inference.meta import BaseXRegressor, BaseRRegressor, BaseSRegressor, BaseTRegressor
from causalml.metrics import plot_gain, auuc_score


# In[5]:


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor


# In[6]:


import logging

# Configure root logging at INFO level, then keep a handle on the
# 'causalml' logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('causalml')


# ### Generate dataset
# 
# Generate synthetic data using the built-in function.

# In[7]:


# define parameters for simulation

# Outcome column and the experiment arms (control listed first)
y_name = 'conversion'
treatment_group_keys = ['control', 'treatment1']

# Number of samples requested from the generator and the make-up of the
# classification features
n = 100_000
n_classification_features = 50
n_classification_informative = 10
n_classification_repeated = 0

# Uplift settings, keyed by the non-control arm declared above
n_uplift_increase_dict = {treatment_group_keys[1]: 8}
n_uplift_decrease_dict = {treatment_group_keys[1]: 4}
delta_uplift_increase_dict = {treatment_group_keys[1]: 0.1}
delta_uplift_decrease_dict = {treatment_group_keys[1]: -0.1}

# Seed handed to the data generator
random_seed = 20200808


# In[8]:


# Simulate the experiment with the parameters defined above. The call is
# unpacked into the data frame and the list of feature column names.
df, X_names = make_uplift_classification(
    n_samples=n,
    treatment_name=treatment_group_keys,
    y_name=y_name,
    random_seed=random_seed,
    # classification features
    n_classification_features=n_classification_features,
    n_classification_informative=n_classification_informative,
    n_classification_repeated=n_classification_repeated,
    # uplift features
    n_uplift_increase_dict=n_uplift_increase_dict,
    n_uplift_decrease_dict=n_uplift_decrease_dict,
    delta_uplift_increase_dict=delta_uplift_increase_dict,
    delta_uplift_decrease_dict=delta_uplift_decrease_dict,
)


# In[9]:


# Preview the first rows of the simulated data
df.head()


# In[10]:


# Look at the conversion rate and sample size in each group.
# The mean is requested by name: pandas >= 2.1 emits a FutureWarning when a
# NumPy reduction such as np.mean is passed as the aggregation, and the
# string keeps the current behaviour. The outcome column is referenced through
# y_name so this cell stays in sync with the simulation parameters.
df.pivot_table(values=y_name,
               index='treatment_group_key',
               aggfunc=['mean', np.size],
               margins=True)


# In[11]:


# Feature column names returned by the data generator
X_names


# ## Feature selection with Filter methods

# ### method = F (F statistics)

# In[12]:


# One FilterSelect instance is reused for every filter method below
filter_f = FilterSelect()


# In[13]:


# Score every feature with the F filter, comparing 'treatment1' to control
method = 'F'
f_imp = filter_f.get_importance(
    df, X_names, y_name, method,
    treatment_group='treatment1',
)
print(f_imp)


# ### method = LR (likelihood ratio test)

# In[14]:


# Same call as for the F filter, switching the method to the LR filter
method = 'LR'
lr_imp = filter_f.get_importance(
    df, X_names, y_name, method,
    treatment_group='treatment1',
)
print(lr_imp)


# ### method = KL (KL divergence)

# In[15]:


# KL filter; n_bins is passed explicitly for this method
method = 'KL'
kl_imp = filter_f.get_importance(
    df, X_names, y_name, method,
    treatment_group='treatment1',
    n_bins=10,
)
print(kl_imp)


# We found that all three filter methods were able to rank most of the **informative** and **uplift increase** features at the top.

# ## Performance evaluation  
# 
# Evaluate the AUUC (Area Under the Uplift Curve) score of several uplift models when they are trained on the top-ranked features only

# In[16]:


# Hold out 20% of the rows for evaluation; the seed is fixed for the split
df_train, df_test = train_test_split(df, random_state=111, test_size=0.2)


# In[17]:


# Binary treatment indicator for the test rows:
# 1 where the group key is 'treatment1', 0 for everything else
treatments = np.where(df_test['treatment_group_key'].eq('treatment1'), 1, 0)
print(treatments[:10])
print(df_test['treatment_group_key'].head(10))


# ### Uplift RandomForest Classifier

# In[18]:


# Uplift random forest with depth-8 trees; 'control' is the control group label
uplift_model = UpliftRandomForestClassifier(max_depth=8, control_name='control')


# In[19]:


# Baseline: train on every feature and predict for the held-out rows
features = X_names
uplift_model.fit(
    X=df_train[features].values,
    treatment=df_train['treatment_group_key'].values,
    y=df_train[y_name].values,
)
y_preds = uplift_model.predict(df_test[features].values)


# ### Select top N features based on KL filter

# In[20]:


# Take the first N entries of the KL importance table's 'feature' column
# (presumably already ordered by importance - confirm against the printed table)
top_n = 10
top_10_features = kl_imp['feature'].head(top_n)
print(top_10_features)


# In[21]:


top_n = 15
top_15_features = kl_imp['feature'].head(top_n)
print(top_15_features)


# In[22]:


top_n = 20
top_20_features = kl_imp['feature'].head(top_n)
print(top_20_features)


# #### Train the Uplift model again with top N features

# In[24]:


# Refit the uplift forest on the top 10, top 15 and top 20 feature subsets in
# turn, predicting for the test rows after each fit. Predictions are collected
# in that order and unpacked into the per-subset names used further down.
uplift_top_n_preds = []
for features in (top_10_features, top_15_features, top_20_features):
    uplift_model.fit(
        X=df_train[features].values,
        treatment=df_train['treatment_group_key'].values,
        y=df_train[y_name].values,
    )
    uplift_top_n_preds.append(uplift_model.predict(df_test[features].values))

y_preds_t10, y_preds_t15, y_preds_t20 = uplift_top_n_preds


# ### Print results for Uplift model

# In[27]:


# Collect one row per model variant plus the treatment indicator and the
# observed outcome, then transpose so each of them becomes a column.
# The outcome is extracted with to_numpy(): Series.ravel() is deprecated in
# pandas >= 2.2 and the Series is already one-dimensional.
df_preds = pd.DataFrame([y_preds.ravel(),
                         y_preds_t10.ravel(),
                         y_preds_t15.ravel(),
                         y_preds_t20.ravel(),
                         treatments,
                         df_test[y_name].to_numpy()],
                        index=['All', 'Top 10', 'Top 15', 'Top 20', 'is_treated', y_name]).T

# Gain chart for the four model variants
plot_gain(df_preds, outcome_col=y_name, treatment_col='is_treated')


# In[28]:


# AUUC score for each model variant
auuc_score(df_preds, outcome_col=y_name, treatment_col='is_treated')


# ### R Learner as base and feed in Random Forest Regressor

# In[29]:


# R-learner wrapped around a random-forest regressor
# (100 trees, depth 8, at least 100 samples per leaf)
r_rf_learner = BaseRRegressor(
    RandomForestRegressor(
        max_depth=8,
        min_samples_leaf=100,
        n_estimators=100,
    ),
    control_name='control',
)


# In[30]:


# Fit the R-learner on all features, then on the top 10, top 15 and top 20
# subsets, predicting for the test rows after each fit. Predictions are
# collected in that order and unpacked into the names used by the results cell.
r_learner_preds = []
for features in (X_names, top_10_features, top_15_features, top_20_features):
    r_rf_learner.fit(
        X=df_train[features].values,
        treatment=df_train['treatment_group_key'].values,
        y=df_train[y_name].values,
    )
    r_learner_preds.append(r_rf_learner.predict(df_test[features].values))

y_preds, y_preds_t10, y_preds_t15, y_preds_t20 = r_learner_preds


# ### Print results for R Learner

# In[34]:


# Collect one row per R-learner variant plus the treatment indicator and the
# observed outcome, then transpose so each of them becomes a column.
# The outcome is extracted with to_numpy(): Series.ravel() is deprecated in
# pandas >= 2.2 and the Series is already one-dimensional.
df_preds = pd.DataFrame([y_preds.ravel(),
                         y_preds_t10.ravel(),
                         y_preds_t15.ravel(),
                         y_preds_t20.ravel(),
                         treatments,
                         df_test[y_name].to_numpy()],
                        index=['All', 'Top 10', 'Top 15', 'Top 20', 'is_treated', y_name]).T

# Gain chart for the four model variants
plot_gain(df_preds, outcome_col=y_name, treatment_col='is_treated')


# In[35]:


# print out AUUC score
auuc_score(df_preds, outcome_col=y_name, treatment_col='is_treated')


# (a relatively smaller enhancement on the AUUC is observed in this R Learner case)

# ### S Learner as base and feed in Random Forest Regressor

# In[36]:


# S-learner wrapped around a random-forest regressor
# (100 trees, depth 8, at least 100 samples per leaf)
slearner_rf = BaseSRegressor(
    RandomForestRegressor(
        max_depth=8,
        min_samples_leaf=100,
        n_estimators=100,
    ),
    control_name='control',
)


# In[37]:


# Fit the S-learner on all features, then on the top 10, top 15 and top 20
# subsets, predicting for the test rows after each fit. Predictions are
# collected in that order and unpacked into the names used by the results cell.
s_learner_preds = []
for features in (X_names, top_10_features, top_15_features, top_20_features):
    slearner_rf.fit(
        X=df_train[features].values,
        treatment=df_train['treatment_group_key'].values,
        y=df_train[y_name].values,
    )
    s_learner_preds.append(slearner_rf.predict(df_test[features].values))

y_preds, y_preds_t10, y_preds_t15, y_preds_t20 = s_learner_preds


# ### Print results for S Learner

# In[42]:


# Collect one row per S-learner variant plus the treatment indicator and the
# observed outcome, then transpose so each of them becomes a column.
# The outcome is extracted with to_numpy(): Series.ravel() is deprecated in
# pandas >= 2.2 and the Series is already one-dimensional.
df_preds = pd.DataFrame([y_preds.ravel(),
                         y_preds_t10.ravel(),
                         y_preds_t15.ravel(),
                         y_preds_t20.ravel(),
                         treatments,
                         df_test[y_name].to_numpy()],
                        index=['All', 'Top 10', 'Top 15', 'Top 20', 'is_treated', y_name]).T

# Gain chart for the four model variants
plot_gain(df_preds, outcome_col=y_name, treatment_col='is_treated')


# In[43]:


# print out AUUC score
auuc_score(df_preds, outcome_col=y_name, treatment_col='is_treated')


# In this notebook, we demonstrated how our Filter method functions are able to select important features and enhance the AUUC performance (while the results might vary among different datasets, models and hyper-parameters).