#!/usr/bin/env python
# coding: utf-8

# # Demo for the DoWhy causal API
# We show a simple example of adding a causal extension to any dataframe. 

# In[1]:


import os
import sys

# Extend the module search path with the directory three levels above the
# current working directory (presumably the repository root, so that the local
# `dowhy` checkout is importable -- confirm against the repo layout).
sys.path.append(os.path.abspath(os.path.join("..", "..", "..")))


# In[2]:


import dowhy.datasets
import dowhy.api

import numpy as np
import pandas as pd

from statsmodels.api import OLS


# In[3]:


# Seed NumPy's global RNG so that "Restart & Run All" reproduces the same numbers.
# The noise added below draws from this RNG; dowhy's dataset generator is
# assumed to use it as well -- TODO confirm.
np.random.seed(42)

# Synthetic linear dataset: one common cause, no instruments, binary treatment,
# with beta=5 passed to the generator.
data = dowhy.datasets.linear_dataset(
    beta=5,
    num_common_causes=1,
    num_instruments=0,
    num_samples=1000,
    treatment_is_binary=True,
)
df = data['df']

# Column names are taken from the dataset metadata so that later cells never
# need to hard-code them.
treatment = data["treatment_name"][0]
outcome = data["outcome_name"][0]
common_cause = data["common_causes_names"][0]

# Adding noise to the outcome. Without noise, the variance in Y|X, Z is zero, and mcmc fails.
df[outcome] = df[outcome] + np.random.normal(size=len(df))

# Preview the first rows only instead of rendering the whole frame.
df.head()


# In[4]:


# `df` (i.e. data['df']) is just a regular pandas.DataFrame; the `causal`
# accessor used here is the causal extension this notebook demonstrates.
# variable_types codes: presumably 'b' = binary, 'c' = continuous -- confirm in the dowhy docs.
interventional_df = df.causal.do(
    x=treatment,
    variable_types={treatment: 'b', outcome: 'c', common_cause: 'c'},
    outcome=outcome,
    common_causes=[common_cause],
)
# Bar chart of the mean outcome for each treatment value.
interventional_df.groupby(treatment).mean().plot(y=outcome, kind='bar')


# In[5]:


# Intervene with the treatment fixed to 1 using the 'weighting' method, then
# chart the mean outcome of the resulting frame grouped by treatment value.
treated_df = df.causal.do(
    x={treatment: 1},
    variable_types={treatment: 'b', outcome: 'c', common_cause: 'c'},
    outcome=outcome,
    method='weighting',
    common_causes=[common_cause],
    proceed_when_unidentifiable=True,
)
treated_df.groupby(treatment).mean().plot(y=outcome, kind='bar')


# In[6]:


def _do_with_treatment(treatment_value):
    """Call `df.causal.do` with the treatment fixed to `treatment_value`.

    Uses the causal graph shipped with the dataset (data['dot_graph']) and the
    same variable types and outcome for every call; returns whatever
    `df.causal.do` returns.
    """
    return df.causal.do(
        x={treatment: treatment_value},
        variable_types={treatment: 'b', outcome: 'c', common_cause: 'c'},
        outcome=outcome,
        dot_graph=data['dot_graph'],
        proceed_when_unidentifiable=True,
    )


# Treated arm first, then control -- the call order is kept as-is in case the
# sampler consumes shared random state.
cdf_1 = _do_with_treatment(1)
cdf_0 = _do_with_treatment(0)


# In[7]:


# Preview the first rows of the frame obtained with the treatment fixed to 0.
cdf_0.head()


# In[8]:


# Preview the first rows of the frame obtained with the treatment fixed to 1.
cdf_1.head()


# ## Comparing the estimate to Linear Regression
# First, we estimate the effect using the causal data frames, along with the half-width of its 95% confidence interval.

# In[9]:


# Estimated average effect: mean difference between the outcome with the
# treatment fixed to 1 and with it fixed to 0. The column is looked up through
# `outcome` rather than the hard-coded name 'y', matching the other cells.
(cdf_1[outcome] - cdf_0[outcome]).mean()


# In[10]:


# Half-width of the 95% confidence interval of the estimate above:
# 1.96 (two-sided 95% quantile of the standard normal) times the standard error
# of the mean difference.
outcome_diff = cdf_1[outcome] - cdf_0[outcome]
# Divide by the number of non-missing differences, i.e. exactly the values that
# enter .std(), instead of assuming it equals len(df).
1.96 * outcome_diff.std() / np.sqrt(outcome_diff.count())


# Comparing to the estimate from OLS.

# In[11]:


# Regress the outcome on the common cause and the treatment. The regressors are
# cast to float64 (as in the original array-based call) but kept as a
# DataFrame, so the summary labels each coefficient with its column name
# instead of the anonymous x1/x2.
# NOTE: no intercept column is added; statsmodels' OLS does not include one by
# default. Add one with statsmodels.api.add_constant if the model needs it.
ols_exog = df[[common_cause, treatment]].astype(np.float64)
model = OLS(df[outcome], ols_exog)
result = model.fit()
result.summary()