from dowhy import CausalModel
import dowhy.datasets
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import logging
#logging.getLogger("dowhy").setLevel(logging.INFO)
np.random.seed(25)
dowhy.__version__
'0.4'
data = dowhy.datasets.linear_dataset(10, num_common_causes=2, num_samples=100000,
num_instruments=0, num_effect_modifiers=2,
num_treatments=1,
treatment_is_binary=True,
outcome_is_binary=False)
df = data['df']
print(df.head())
print(data)
X0 X1 W0 W1 v0 y
0 -0.690365 -1.110144 -0.099337 -0.426628 False -0.491223
1 0.417015 -0.511456 -0.216640 -0.057772 True 8.814807
2 0.223920 0.426649 0.120333 2.002458 True 12.894260
3 0.854081 -0.545430 -1.312982 1.033137 True 6.092524
4 0.008900 0.277964 -0.180486 -0.067758 True 10.150629
{'df': X0 X1 W0 W1 v0 y
0 -0.690365 -1.110144 -0.099337 -0.426628 False -0.491223
1 0.417015 -0.511456 -0.216640 -0.057772 True 8.814807
2 0.223920 0.426649 0.120333 2.002458 True 12.894260
3 0.854081 -0.545430 -1.312982 1.033137 True 6.092524
4 0.008900 0.277964 -0.180486 -0.067758 True 10.150629
... ... ... ... ... ... ...
99995 1.496265 0.613054 0.713475 -0.788403 False 2.482054
99996 1.555262 0.013880 1.995508 0.566159 True 22.096263
99997 -0.814917 0.254763 0.919172 -0.694613 False 3.284822
99998 0.554770 1.331817 0.001694 0.521115 True 15.661477
99999 0.396213 0.100599 2.093892 -1.172198 False 7.594258
[100000 rows x 6 columns], 'treatment_name': ['v0'], 'outcome_name': 'y', 'common_causes_names': ['W0', 'W1'], 'instrument_names': [], 'effect_modifier_names': ['X0', 'X1'], 'dot_graph': 'digraph { U[label="Unobserved Confounders"]; U->y;v0->y; U->v0;W0-> v0; W1-> v0;W0-> y; W1-> y;X0-> y; X1-> y;}', 'gml_graph': 'graph[directed 1node[ id "y" label "y"]node[ id "Unobserved Confounders" label "Unobserved Confounders"]edge[source "Unobserved Confounders" target "y"]node[ id "W0" label "W0"] node[ id "W1" label "W1"]node[ id "v0" label "v0"]edge[source "v0" target "y"]edge[source "Unobserved Confounders" target "v0"]edge[ source "W0" target "v0"] edge[ source "W1" target "v0"]edge[ source "W0" target "y"] edge[ source "W1" target "y"]node[ id "X0" label "X0"] edge[ source "X0" target "y"] node[ id "X1" label "X1"] edge[ source "X1" target "y"]]', 'ate': 11.759597003791514}
model = CausalModel(df,
data["treatment_name"],data["outcome_name"],
data["dot_graph"],
missing_nodes_as_confounders=True,
logging_level=logging.INFO)
print("CC", model._common_causes)
print("EM", model._effect_modifiers)
model.view_model()
from IPython.display import Image, display
display(Image(filename="causal_model.png"))
INFO:dowhy.causal_model:Model to find the causal effect of treatment ['v0'] on outcome ['y']
CC ['U', 'W1', 'W0'] EM ['X1', 'X0']
logging.getLogger("causal_estimar")
<Logger causal_estimar (INFO)>
identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
import time
start=time.time()
causal_estimate = model.estimate_effect(identified_estimand,
method_name="backdoor.linear_regression",
confidence_intervals=False,
test_significance=False,
method_params = {
'num_null_simulations':10,
'num_simulations':10,
'num_quantiles_to_discretize_cont_cols':10,
'fit_method': "statsmodels",
'need_conditional_estimates':False
},
)
end=time.time()
print(end-start)
#print(causal_estimate.estimator._linear_model.summary())
#print(causal_estimate)
INFO:dowhy.causal_identifier:Common causes of treatment and outcome:['U', 'W1', 'W0'] WARNING:dowhy.causal_identifier:If this is observed data (not from a randomized experiment), there might always be missing confounders. Causal effect cannot be identified perfectly. INFO:dowhy.causal_identifier:Continuing by ignoring these unobserved confounders because proceed_when_unidentifiable flag is True. INFO:dowhy.causal_identifier:Instrumental variables for treatment and outcome:[] INFO:dowhy.causal_estimator:INFO: Using Linear Regression Estimator INFO:dowhy.causal_estimator:b: y~v0+W1+W0+v0*X1+v0*X0
5.589203357696533
causal_estimate.test_stat_significance(num_simulations=2)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-6-a8010fa4491b> in <module> ----> 1 causal_estimate.test_stat_significance(num_simulations=2) /mnt/c/Users/amit_/code/dowhy/dowhy/causal_estimator.py in test_stat_significance(self, method, **kwargs) 693 signif_results = self.estimator.test_significance(self.value, 694 method=method, --> 695 **kwargs) 696 return {'p_value': signif_results["p_value"]} 697 /mnt/c/Users/amit_/code/dowhy/dowhy/causal_estimator.py in test_significance(self, estimate_value, method, **kwargs) 536 if method == "default" or method is True: # user has not provided any method 537 try: --> 538 signif_dict = self._test_significance(estimate_value, method, **kwargs) 539 except NotImplementedError: 540 signif_dict = self._test_significance_with_bootstrap(estimate_value, **kwargs) TypeError: _test_significance() got an unexpected keyword argument 'num_simulations'
print(causal_estimate.estimate_conditional_effects(effect_modifiers=["X1" ]).head(20))
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LassoCV
from sklearn.ensemble import GradientBoostingRegressor
dml_estimate = model.estimate_effect(identified_estimand, method_name="backdoor.econml.dml.DMLCateEstimator",
control_value = 0,
treatment_value = 1,
target_units = "ate", # condition used for CATE
confidence_intervals=False,
method_params={"init_params":{'model_y':GradientBoostingRegressor(),
'model_t': GradientBoostingRegressor(),
"model_final":LassoCV(),
'featurizer':PolynomialFeatures(degree=1, include_bias=True)},
"fit_params":{}})
print(dml_estimate)
print(causal_estimate.test_stat_significance(method="bootstrap", num_null_simulations=2))
print(causal_estimate.test_stat_significance())
causal_estimate.test_stat_significance()
print(causal_estimate.get_standard_error(method="bootstrap", num_ci_simulations=10, sample_size_fraction=0.9))
print(causal_estimate.get_confidence_intervals(confidence_level=0.99))
print(causal_estimate.get_confidence_intervals(method="bootstrap", confidence_level=0.95, num_ci_simulations=10))
print(causal_estimate.get_confidence_intervals(method="psm", confidence_level=0.95, num_ci_simulations=10))
str(model.refute_estimate(identified_estimand, causal_estimate, method_name="placebo_treatment_refuter", num_simulations=1))
INFO:dowhy.causal_refuters.placebo_treatment_refuter:Refutation over 1 simulated datasets of Random Data treatment
INFO:dowhy.causal_refuters.placebo_treatment_refuter:Using a Binomial Distribution with 1 trials and 0.5 probability of success
INFO:dowhy.causal_estimator:INFO: Using Linear Regression Estimator
INFO:dowhy.causal_estimator:b: y~placebo+W1+W0+placebo*X1+placebo*X0
WARNING:dowhy.causal_refuters.placebo_treatment_refuter:We assume a Normal Distribution as the sample has less than 100 examples.
Note: The underlying distribution may not be Normal. We assume that it approaches normal with the increase in sample size.
'Refute: Use a Placebo Treatment\nEstimated effect:11.759597003788148\nNew effect:0.0185251357744729\np value:0.0\n'
import statsmodels.formula.api as smf
res=smf.ols(formula="y~v0+W0+W1+W2+W3", data=df).fit()
res.summary()
lm = causal_estimate.estimator._linear_model
print(lm.pvalues)
print(type(lm.conf_int()))
print(lm.bse)
lm.summary()
print(causal_estimate.get_confidence_intervals(confidence_level =0.9))
print(causal_estimate.get_standard_error())
print(causal_estimate.test_stat_significance())