#!/usr/bin/env python
# coding: utf-8

# # Conditional Average Treatment Effects (CATE) with DoWhy and EconML
#
# Experimental integration of EconML (https://github.com/microsoft/econml)
# estimators into DoWhy, enabling CATE estimation with several methods.
# The four DoWhy steps -- model, identify, estimate, refute -- are unchanged;
# only the estimation step delegates to EconML. A plain linear-regression
# example is included first to build intuition for the CATE estimators.

# In[1]:

import os, sys
sys.path.insert(1, os.path.abspath("../../../"))  # for dowhy source code

# In[2]:

import numpy as np
import pandas as pd
import logging

import dowhy
from dowhy import CausalModel
import dowhy.datasets
import econml

import warnings
warnings.filterwarnings('ignore')  # tutorial output: suppress library warnings

# In[3]:

# Synthetic linear dataset (known ground-truth effect magnitude 10) with
# instruments and effect modifiers so that CATE estimation is non-trivial.
data = dowhy.datasets.linear_dataset(10, num_common_causes=4, num_samples=10000,
                                     num_instruments=2, num_effect_modifiers=2,
                                     num_treatments=1,
                                     treatment_is_binary=False,
                                     num_discrete_common_causes=2,
                                     num_discrete_effect_modifiers=0,
                                     one_hot_encode=False)
df = data['df']
df.head()

# In[4]:

# Step 1: model the problem as a causal graph.
model = CausalModel(data=data["df"],
                    treatment=data["treatment_name"],
                    outcome=data["outcome_name"],
                    graph=data["gml_graph"])

# In[5]:

# Render the graph to disk and display it inline.
model.view_model()
from IPython.display import Image, display
display(Image(filename="causal_model.png"))

# In[6]:

# Step 2: identify the target estimand from the graph.
identified_estimand = model.identify_effect()
print(identified_estimand)
# ## Linear Model
#
# Intuition first: in a linear outcome model, effect modifiers enter as
# treatment-interaction terms, so their values modulate the treatment effect.
# Below, the estimated effect of changing treatment from 0 to 1.

# In[7]:

linear_estimate = model.estimate_effect(identified_estimand,
                                        method_name="backdoor.linear_regression",
                                        control_value=0,
                                        treatment_value=1)
print(linear_estimate)

# ## EconML methods
#
# method_name is the fully qualified EconML class to use; for double machine
# learning it is "econml.dml.DMLCateEstimator".
# target_units selects the units the estimate is computed over: a lambda
# filter on the original dataframe, a new Pandas dataframe, or one of the
# strings "ate" / "att" / "atc".
# method_params is forwarded verbatim to EconML (see the EconML docs for
# allowed parameters).

# In[8]:

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LassoCV
from sklearn.ensemble import GradientBoostingRegressor

dml_estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.econml.dml.DMLCateEstimator",
    control_value=0,
    treatment_value=1,
    target_units=lambda df: df["X0"] > 1,  # condition used for CATE
    confidence_intervals=False,
    method_params={
        "init_params": {'model_y': GradientBoostingRegressor(),
                        'model_t': GradientBoostingRegressor(),
                        "model_final": LassoCV(),
                        'featurizer': PolynomialFeatures(degree=1, include_bias=True)},
        "fit_params": {}})
print(dml_estimate)

# In[9]:

print("True causal estimate is", data["ate"])

# In[10]:

# NOTE(review): target_units=1 is not one of the documented forms
# (lambda filter, dataframe, or "ate"/"att"/"atc") -- looks like a
# copy-paste slip from the cell above; confirm intent.
dml_estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.econml.dml.DMLCateEstimator",
    control_value=0,
    treatment_value=1,
    target_units=1,  # condition used for CATE
    confidence_intervals=False,
    method_params={
        "init_params": {'model_y': GradientBoostingRegressor(),
                        'model_t': GradientBoostingRegressor(),
                        "model_final": LassoCV(),
                        'featurizer': PolynomialFeatures(degree=1, include_bias=True)},
        "fit_params": {}})
print(dml_estimate)

# ### CATE Object and Confidence Intervals

# In[ ]:

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LassoCV
from sklearn.ensemble import GradientBoostingRegressor

# Requesting confidence_intervals=True with bootstrap inference at fit time
# yields per-unit interval estimates alongside the point estimates.
dml_estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.econml.dml.DMLCateEstimator",
    target_units=lambda df: df["X0"] > 1,
    confidence_intervals=True,
    method_params={
        "init_params": {'model_y': GradientBoostingRegressor(),
                        'model_t': GradientBoostingRegressor(),
                        "model_final": LassoCV(),
                        'featurizer': PolynomialFeatures(degree=1, include_bias=True)},
        "fit_params": {
            'inference': 'bootstrap',
        }})
print(dml_estimate)
print(dml_estimate.cate_estimates[:10])
print(dml_estimate.effect_intervals)

# ### Can provide a new inputs as target units and estimate CATE on them.

# In[ ]:

test_cols = data['effect_modifier_names']  # only need effect modifiers' values
# all variables are sampled uniformly, sample of 10
test_arr = [np.random.uniform(0, 1, 10) for _ in range(len(test_cols))]
test_df = pd.DataFrame(np.array(test_arr).transpose(), columns=test_cols)
dml_estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.econml.dml.DMLCateEstimator",
    target_units=test_df,
    confidence_intervals=False,
    method_params={
        "init_params": {'model_y': GradientBoostingRegressor(),
                        'model_t': GradientBoostingRegressor(),
                        "model_final": LassoCV(),
                        'featurizer': PolynomialFeatures(degree=1, include_bias=True)},
        "fit_params": {}})
print(dml_estimate.cate_estimates)

# ### Can also retrieve the raw EconML estimator object for any further operations

# In[ ]:

print(dml_estimate._estimator_object)
dml_estimate

# ## Works with any EconML method
#
# In addition to double machine learning, below are example analyses using
# orthogonal forests, DRLearner (bug to fix), and neural network-based
# instrumental variables.
# ### Continuous treatment, Continuous outcome

# In[ ]:

from sklearn.linear_model import LogisticRegression

orthoforest_estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.econml.ortho_forest.ContinuousTreatmentOrthoForest",
    target_units=lambda df: df["X0"] > 1,
    confidence_intervals=False,
    method_params={
        "init_params": {
            'n_trees': 2,  # not ideal, just as an example to speed up computation
        },
        "fit_params": {}})
print(orthoforest_estimate)

# ### Binary treatment, Binary outcome

# In[ ]:

data_binary = dowhy.datasets.linear_dataset(10, num_common_causes=4, num_samples=10000,
                                            num_instruments=2, num_effect_modifiers=2,
                                            treatment_is_binary=True, outcome_is_binary=True)
# convert boolean values to {0,1} numeric
data_binary['df'].v0 = data_binary['df'].v0.astype(int)
data_binary['df'].y = data_binary['df'].y.astype(int)
print(data_binary['df'])

model_binary = CausalModel(data=data_binary["df"],
                           treatment=data_binary["treatment_name"],
                           outcome=data_binary["outcome_name"],
                           graph=data_binary["gml_graph"])
identified_estimand_binary = model_binary.identify_effect(proceed_when_unidentifiable=True)

# #### Using DRLearner estimator

# In[ ]:

from sklearn.linear_model import LogisticRegressionCV

# todo needs binary y
drlearner_estimate = model_binary.estimate_effect(
    identified_estimand_binary,
    method_name="backdoor.econml.drlearner.LinearDRLearner",
    target_units=lambda df: df["X0"] > 1,
    confidence_intervals=False,
    method_params={
        "init_params": {
            'model_propensity': LogisticRegressionCV(cv=3, solver='lbfgs', multi_class='auto')
        },
        "fit_params": {}})
print(drlearner_estimate)

# ### Instrumental Variable Method

# In[ ]:

import keras
from econml.deepiv import DeepIVEstimator

dims_zx = len(model._instruments) + len(model._effect_modifiers)
dims_tx = len(model._treatment) + len(model._effect_modifiers)

# First-stage (treatment) network; input is the concatenation of Z and X.
treatment_model = keras.Sequential([keras.layers.Dense(128, activation='relu', input_shape=(dims_zx,)),  # sum of dims of Z and X
                                    keras.layers.Dropout(0.17),
                                    keras.layers.Dense(64, activation='relu'),
                                    keras.layers.Dropout(0.17),
                                    keras.layers.Dense(32, activation='relu'),
                                    keras.layers.Dropout(0.17)])
# Second-stage (response) network; input is the concatenation of T and X.
response_model = keras.Sequential([keras.layers.Dense(128, activation='relu', input_shape=(dims_tx,)),  # sum of dims of T and X
                                   keras.layers.Dropout(0.17),
                                   keras.layers.Dense(64, activation='relu'),
                                   keras.layers.Dropout(0.17),
                                   keras.layers.Dense(32, activation='relu'),
                                   keras.layers.Dropout(0.17),
                                   keras.layers.Dense(1)])

deepiv_estimate = model.estimate_effect(
    identified_estimand,
    method_name="iv.econml.deepiv.DeepIVEstimator",
    target_units=lambda df: df["X0"] > -1,
    confidence_intervals=False,
    method_params={
        "init_params": {
            'n_components': 10,  # Number of gaussians in the mixture density networks
            'm': lambda z, x: treatment_model(keras.layers.concatenate([z, x])),  # Treatment model
            "h": lambda t, x: response_model(keras.layers.concatenate([t, x])),  # Response model
            'n_samples': 1,  # Number of samples used to estimate the response
            'first_stage_options': {'epochs': 25},
            'second_stage_options': {'epochs': 25}
        },
        "fit_params": {}})
print(deepiv_estimate)

# ### Metalearners

# In[ ]:

data_experiment = dowhy.datasets.linear_dataset(10, num_common_causes=0, num_samples=10000,
                                                num_instruments=2, num_effect_modifiers=4,
                                                treatment_is_binary=True, outcome_is_binary=True)
# convert boolean values to {0,1} numeric
data_experiment['df'].v0 = data_experiment['df'].v0.astype(int)
data_experiment['df'].y = data_experiment['df'].y.astype(int)
print(data_experiment['df'])

model_experiment = CausalModel(data=data_experiment["df"],
                               treatment=data_experiment["treatment_name"],
                               outcome=data_experiment["outcome_name"],
                               graph=data_experiment["gml_graph"])
identified_estimand_experiment = model_experiment.identify_effect(proceed_when_unidentifiable=True)

# In[ ]:

from sklearn.linear_model import LogisticRegressionCV

metalearner_estimate = model_experiment.estimate_effect(
    identified_estimand_experiment,
    method_name="backdoor.econml.metalearners.TLearner",
    target_units=lambda df: df["X0"] > 1,
    confidence_intervals=False,
    method_params={
        "init_params": {
            'models': LogisticRegressionCV(cv=3, solver='lbfgs', multi_class='auto')
        },
        "fit_params": {}})
print(metalearner_estimate)

# ## Refuting the estimate

# ### Random

# In[ ]:

res_random = model.refute_estimate(identified_estimand, dml_estimate,
                                   method_name="random_common_cause")
print(res_random)

# ### Adding an unobserved common cause variable

# In[ ]:

res_unobserved = model.refute_estimate(identified_estimand, dml_estimate,
                                       method_name="add_unobserved_common_cause",
                                       confounders_effect_on_treatment="linear",
                                       confounders_effect_on_outcome="linear",
                                       effect_strength_on_treatment=0.01,
                                       effect_strength_on_outcome=0.02)
print(res_unobserved)

# ### Replacing treatment with a random (placebo) variable

# In[ ]:

res_placebo = model.refute_estimate(identified_estimand, dml_estimate,
                                    method_name="placebo_treatment_refuter",
                                    placebo_type="permute")
print(res_placebo)

# ### Removing a random subset of the data

# In[ ]:

res_subset = model.refute_estimate(identified_estimand, dml_estimate,
                                   method_name="data_subset_refuter",
                                   subset_fraction=0.8)
print(res_subset)

# More refutation methods to come, especially specific to the CATE estimators.