#!/usr/bin/env python
# coding: utf-8

# # Demo for the DoWhy causal API
# We show a simple example of adding a causal extension to any dataframe.
#
# This file is a notebook export: the `# In[n]:` markers delimit the original
# cells, and bare expressions (e.g. `df`) are cell outputs that are displayed
# in a notebook but have no visible effect when run as a plain script.

# In[1]:

import os
import sys

# Put the directory three levels up on the import path (presumably the
# repository root, so `dowhy` can be imported from a source checkout).
sys.path.append(os.path.abspath("../../../"))

# In[2]:

import dowhy.datasets
import dowhy.api  # NOTE(review): presumably registers the `df.causal` accessor used below -- confirm
import numpy as np
import pandas as pd
from statsmodels.api import OLS

# In[3]:

data = dowhy.datasets.linear_dataset(
    beta=5,
    num_common_causes=1,
    num_instruments=0,
    num_samples=1000,
    treatment_is_binary=True,
)
df = data['df']

# Resolve the column names from the dataset metadata first, so that every
# later step refers to them instead of hard-coding the outcome column name.
treatment = data["treatment_name"][0]
outcome = data["outcome_name"][0]
common_cause = data["common_causes_names"][0]

# Adding noise to data. Without noise, the variance in Y|X, Z is zero, and mcmc fails.
df[outcome] = df[outcome] + np.random.normal(size=len(df))
#data['dot_graph'] = 'digraph { v ->y;X0-> v;X0-> y;}'

df

# In[4]:

# data['df'] is just a regular pandas.DataFrame
# NOTE(review): unlike the calls below, this one does not pass
# proceed_when_unidentifiable=True -- confirm whether that is intentional.
df.causal.do(
    x=treatment,
    variable_types={treatment: 'b', outcome: 'c', common_cause: 'c'},
    outcome=outcome,
    common_causes=[common_cause],
).groupby(treatment).mean().plot(y=outcome, kind='bar')

# In[5]:

# Same plot, but fixing the treatment to 1 and using the 'weighting' method.
df.causal.do(
    x={treatment: 1},
    variable_types={treatment: 'b', outcome: 'c', common_cause: 'c'},
    outcome=outcome,
    method='weighting',
    common_causes=[common_cause],
    proceed_when_unidentifiable=True,
).groupby(treatment).mean().plot(y=outcome, kind='bar')

# In[6]:

# One frame per treatment value, this time passing the dataset's own graph
# (data['dot_graph']) instead of an explicit list of common causes.
cdf_1 = df.causal.do(
    x={treatment: 1},
    variable_types={treatment: 'b', outcome: 'c', common_cause: 'c'},
    outcome=outcome,
    dot_graph=data['dot_graph'],
    proceed_when_unidentifiable=True,
)
cdf_0 = df.causal.do(
    x={treatment: 0},
    variable_types={treatment: 'b', outcome: 'c', common_cause: 'c'},
    outcome=outcome,
    dot_graph=data['dot_graph'],
    proceed_when_unidentifiable=True,
)

# In[7]:

cdf_0

# In[8]:

cdf_1

# ## Comparing the estimate to Linear Regression
# First, estimating the effect using the causal data frame, and the 95% confidence interval.

# In[9]:

# Row-wise difference between the treated and untreated frames, computed once
# and reused for both the point estimate and the interval half-width.
# pandas aligns the two Series on their index before subtracting.
outcome_diff = cdf_1[outcome] - cdf_0[outcome]
outcome_diff.mean()

# In[10]:

# Half-width of the 95% interval: 1.96 * std / sqrt(n).
# NOTE(review): n is len(df); this assumes cdf_0 / cdf_1 have as many rows
# as df -- confirm.
1.96 * outcome_diff.std() / np.sqrt(len(df))

# Comparing to the estimate from OLS.

# In[11]:

# The design matrix holds exactly two columns (common cause, treatment);
# no other column is added here.
model = OLS(
    np.asarray(df[outcome]),
    np.asarray(df[[common_cause, treatment]], dtype=np.float64),
)
result = model.fit()
result.summary()