#!/usr/bin/env python
# coding: utf-8

# ### Lifetime value example
#
# Suppose we have a subscription business with monthly churn, and we'd like to estimate LTV (lifetime value) and build confidence intervals for it.
#
# Subscription businesses have a very predictable churn profile (it looks piecewise), but the rates are unknown.
#
# We'll use a piecewise-constant hazard model with known breakpoints, $\tau$:
# $$
# h(t) = \begin{cases}
# \lambda_0 & \text{if $t \le \tau_0$} \\
# \lambda_1 & \text{if $\tau_0 < t \le \tau_1$} \\
# \lambda_2 & \text{if $\tau_1 < t \le \tau_2$} \\
# \vdots
# \end{cases}
# $$

# In[31]:


get_ipython().run_line_magic('matplotlib', 'inline')

import pandas as pd                    # needed for read_csv below
from matplotlib import pyplot as plt  # needed for the plots below
from autograd import numpy as np
from autograd import elementwise_grad, value_and_grad, hessian
from scipy.optimize import minimize

df = pd.read_csv("../churn_data.csv")
T = df['T'].values  # observed durations
E = df['E'].values  # event indicators (1 = churned, 0 = censored)


# In[2]:


breakpoints = np.array([28, 33, 58, 63, 88, 93, 117, 122, 148, 153])

def cumulative_hazard(params, times):
    # This is NumPy black magic to get piecewise hazards — let's chat after.
    # M[i, j] is the time subject i spent in the j-th interval between
    # breakpoints, so M @ params integrates the piecewise-constant hazard.
    times = np.atleast_1d(times)
    n = times.shape[0]
    times = times.reshape((n, 1))
    M = np.minimum(np.tile(breakpoints, (n, 1)), times)
    M = np.hstack([M[:, tuple([0])], np.diff(M, axis=1)])
    return np.dot(M, params)

hazard = elementwise_grad(cumulative_hazard, argnum=1)

def survival_function(params, t):
    return np.exp(-cumulative_hazard(params, t))

def log_hazard(params, t):
    return np.log(np.clip(hazard(params, t), 1e-25, np.inf))

def log_likelihood(params, t, e):
    return np.sum(e * log_hazard(params, t)) - np.sum(cumulative_hazard(params, t))

def negative_log_likelihood(params, t, e):
    return -log_likelihood(params, t, e)

results = minimize(
    value_and_grad(negative_log_likelihood),
    x0=np.ones(len(breakpoints)),
    method=None,  # SciPy picks L-BFGS-B since we provide bounds
    args=(T, E),
    jac=True,
    bounds=[(0.0001, None)] * len(breakpoints)
)

print(results)
estimates_ = results.x

# Invert the Hessian at the MLE to get the asymptotic variance matrix.
H = hessian(negative_log_likelihood)(estimates_, T, E)
variance_matrix_ = np.linalg.inv(H)


# In[3]:


t = np.linspace(.001, 150, 100)
plt.plot(t, survival_function(estimates_, t))
plt.ylim(0.5, 1)
plt.title("""Estimated survival function using \npiecewise hazards""");


# On day 30, we charge users \$10, and every 30 days after that we charge \$20. What's the LTV, with confidence intervals, at the end of day 120?

# In[4]:


def LTV_120(params):
    # Think about how complicated the gradient of this function is. Now
    # imagine an even more complicated function.
    ltv = 0
    ltv += 10 * survival_function(params, 30)
    for t in [60, 90, 120]:
        ltv += 20 * survival_function(params, t)
    return ltv


# In[5]:


ltv_ = LTV_120(estimates_)
print("LTV estimate: ", ltv_)


# In[6]:


from autograd import grad

# Delta method: Var(f(params)) ≈ ∇f(params)ᵀ · Σ · ∇f(params), at the MLE.
var_ltv_ = grad(LTV_120)(estimates_) @ variance_matrix_ @ grad(LTV_120)(estimates_)
print("Variance LTV estimate:", var_ltv_)


# In[7]:


std_ltv = np.sqrt(var_ltv_)
print("Estimated LTV at day 120: ($%.2f, $%.2f)" % (ltv_ - 1.96 * std_ltv, ltv_ + 1.96 * std_ltv))


# From here, we can compute p-values, run scenario analyses, sensitivity analyses, etc.
#
# Let's continue this analytical train to Part 8.

# ### Bonus, if time permits and Cameron is talking too fast.
#
# In the above model, we are not giving the model much a priori information about this "predictable" process. For example, suppose we want a model where the "in-between" period rates are close to each other, and likewise for the "jump" rates.
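# With the breakpoints above, the intervals alternate between long "in-between" periods and short "jump" periods around each billing date, so the even-indexed parameters are the in-between rates and the odd-indexed parameters are the jump rates. One way to encode the prior belief is to penalize the variance within each group of rates. Here's a minimal sketch of the even/odd slicing the penalty relies on (toy numbers, not fitted estimates; this cell is illustrative, not part of the original analysis):

# In[ ]:


# Hypothetical rates, just to show what the slicing in the penalty picks out.
toy_params = np.array([0.01, 0.20, 0.01, 0.25, 0.02, 0.22, 0.01, 0.21, 0.02, 0.24])
print(toy_params[::2])   # "in-between" rates (even indices)
print(toy_params[1::2])  # "jump" rates (odd indices)
print(np.var(toy_params[::2]) + np.var(toy_params[1::2]))  # the quantity we'll penalize
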
# In[28]:


def negative_log_likelihood(params, t, e):
    # Heavily penalize the variance within each group of rates, shrinking the
    # in-between rates (even indices) and the jump rates (odd indices) toward
    # their respective group means.
    return -log_likelihood(params, t, e) + 1e12 * (np.var(params[::2]) + np.var(params[1::2]))

results = minimize(
    value_and_grad(negative_log_likelihood),
    x0=np.ones(len(breakpoints)),
    method=None,
    args=(T, E),
    jac=True,
    bounds=[(0.0001, None)] * len(breakpoints)
)

print(results)
estimates_ = results.x


# In[30]:


t = np.linspace(.001, 150, 100)
plt.plot(t, hazard(estimates_, t))
plt.title("""Estimated hazard function using \npiecewise hazards \nand penalizing variance""");


# ### Why do this?
#
# 1) From a Bayesian perspective, this is a way to add prior information to a model (see the sketch after this list). Note that if we have _lots_ of observations, the prior becomes less relevant (just like in a traditional Bayesian model). That's good. *We are again using the likelihood to "add" information to our system.*
#
# 2) When we have small sample sizes, we can "borrow" information between periods. That is, deaths in the earlier "in-between" periods can inform the rate in the later "in-between" periods - thus we can do better inference.
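# As a quick illustration of point 1), here's a minimal sketch (not part of the original notebook; the penalty weights below are arbitrary) of how the penalty coefficient controls shrinkage: with no penalty we recover the unpooled estimates from before, and as the coefficient grows the in-between rates (and the jump rates) are pulled toward their group means.

# In[ ]:


def penalized_negative_log_likelihood(params, t, e, coef):
    penalty = np.var(params[::2]) + np.var(params[1::2])
    return -log_likelihood(params, t, e) + coef * penalty

# Sweep a few (arbitrary) penalty strengths and compare the fitted rates.
for coef in [0.0, 1e2, 1e12]:
    res = minimize(
        value_and_grad(lambda params, t, e: penalized_negative_log_likelihood(params, t, e, coef)),
        x0=np.ones(len(breakpoints)),
        args=(T, E),
        jac=True,
        bounds=[(0.0001, None)] * len(breakpoints)
    )
    print(coef, np.round(res.x, 4))


# In[ ]: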