#!/usr/bin/env python
# coding: utf-8

# # Chapter 14: Statistical modelling
# Robert Johansson
#
# Source code listings for [Numerical Python - Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib](https://link.springer.com/book/10.1007/979-8-8688-0413-7) (ISBN 979-8-8688-0412-0).

# In[1]:

import statsmodels.api as sm

# In[2]:

import statsmodels.formula.api as smf

# In[3]:

import statsmodels.graphics.api as smg

# In[4]:

import patsy

# In[5]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

# In[6]:

import matplotlib as mpl
mpl.rcParams['mathtext.fontset'] = 'stix'
mpl.rcParams['font.family'] = 'serif'
mpl.rcParams['font.sans-serif'] = 'stix'

# In[7]:

import numpy as np

# In[8]:

import pandas as pd

# In[9]:

from scipy import stats

# In[10]:

import seaborn as sns

# In[11]:

sns.set(style="whitegrid")

# ## Statistical models and patsy formula

# In[12]:

np.random.seed(123456789)

# In[13]:

y = np.array([1, 2, 3, 4, 5])

# In[14]:

x1 = np.array([6, 7, 8, 9, 10])

# In[15]:

x2 = np.array([11, 12, 13, 14, 15])

# In[16]:

# Hand-built design matrix: intercept, both regressors, and their interaction.
X = np.vstack([np.ones(5), x1, x2, x1*x2]).T

# In[17]:

X

# In[18]:

# rcond=None opts into the machine-precision-based singular-value cutoff and
# silences the FutureWarning numpy >= 1.14 emits when rcond is omitted.
beta, res, rank, sval = np.linalg.lstsq(X, y, rcond=None)

# In[19]:

beta

# In[20]:

data = {"y": y, "x1": x1, "x2": x2}

# In[21]:

# Same design matrix built from a patsy formula instead of by hand.
y, X = patsy.dmatrices("y ~ 1 + x1 + x2 + x1*x2", data)

# In[22]:

y

# In[23]:

X

# In[24]:

type(X)

# In[25]:

np.array(X)

# In[26]:

df_data = pd.DataFrame(data)

# In[27]:

y, X = patsy.dmatrices("y ~ 1 + x1 + x2 + x1:x2", df_data, return_type="dataframe")

# In[28]:

X

# In[29]:

model = sm.OLS(y, X)

# In[30]:

result = model.fit()

# In[31]:

result.params

# In[32]:

# Formula interface: smf.ols builds the design matrices internally.
model = smf.ols("y ~ 1 + x1 + x2 + x1:x2", df_data)

# In[33]:

result = model.fit()

# In[34]:

result.params

# In[35]:

print(result.summary())

# In[36]:

beta

# In[37]:

from collections import defaultdict

# In[38]:

# defaultdict supplies a dummy array for any variable name a formula mentions,
# which lets us inspect term expansion without constructing real data.
data = defaultdict(lambda: np.array([1,2,3]))

# In[39]:

patsy.dmatrices("y ~ a", data=data)[1].design_info.term_names

# In[40]:

patsy.dmatrices("y ~ 1 + a + b", data=data)[1].design_info.term_names

# In[41]:

# -1 removes the implicit intercept.
patsy.dmatrices("y ~ -1 + a + b", data=data)[1].design_info.term_names

# In[42]:

# a * b expands to a + b + a:b.
patsy.dmatrices("y ~ a * b", data=data)[1].design_info.term_names

# In[43]:

patsy.dmatrices("y ~ a * b * c", data=data)[1].design_info.term_names

# In[44]:

patsy.dmatrices("y ~ a * b * c - a:b:c", data=data)[1].design_info.term_names

# In[45]:

data = {k: np.array([]) for k in ["y", "a", "b", "c"]}

# In[46]:

patsy.dmatrices("y ~ a + b", data=data)[1].design_info.term_names

# In[47]:

# I(...) protects the expression so + means arithmetic, not term union.
patsy.dmatrices("y ~ I(a + b)", data=data)[1].design_info.term_names

# In[48]:

patsy.dmatrices("y ~ a*a", data=data)[1].design_info.term_names

# In[49]:

patsy.dmatrices("y ~ I(a**2)", data=data)[1].design_info.term_names

# In[50]:

# Arbitrary Python callables (here np.log) may appear in formulas.
patsy.dmatrices("y ~ np.log(a) + b", data=data)[1].design_info.term_names

# In[51]:

z = lambda x1, x2: x1+x2

# In[52]:

patsy.dmatrices("y ~ z(a, b)", data=data)[1].design_info.term_names

# ### Categorical variables

# In[53]:

data = {"y": [1, 2, 3], "a": [1, 2, 3]}

# In[54]:

patsy.dmatrices("y ~ - 1 + a", data=data, return_type="dataframe")[1]

# In[55]:

# C(a) forces treatment of the integer column as categorical.
patsy.dmatrices("y ~ - 1 + C(a)", data=data, return_type="dataframe")[1]

# In[56]:

data = {"y": [1, 2, 3], "a": ["type A", "type B", "type C"]}

# In[57]:

patsy.dmatrices("y ~ - 1 + a", data=data, return_type="dataframe")[1]

# In[58]:

# Poly selects orthogonal-polynomial contrast coding.
patsy.dmatrices("y ~ - 1 + C(a, Poly)", data=data, return_type="dataframe")[1]

# # Linear regression

# In[59]:

np.random.seed(123456789)

# In[60]:

N = 100

# In[61]:

x1 = np.random.randn(N)

# In[62]:

x2 = np.random.randn(N)

# In[63]:

data = pd.DataFrame({"x1": x1, "x2": x2})

# In[64]:

def y_true(x1, x2):
    """Ground-truth relation used to generate the synthetic regression data."""
    return 1 + 2 * x1 + 3 * x2 + 4 * x1 * x2

# In[65]:

data["y_true"] = y_true(x1, x2)

# In[66]:

# Additive standard-normal noise.
e = np.random.randn(N)

# In[67]:

data["y"] = data["y_true"] + e

# In[68]:

data.head()

# In[69]:

fig, axes = plt.subplots(1, 2, figsize=(8, 4))
axes[0].scatter(data["x1"], data["y"])
axes[1].scatter(data["x2"], data["y"])
fig.tight_layout()

# In[70]:
data.shape # In[71]: model = smf.ols("y ~ x1 + x2", data) # In[72]: result = model.fit() # In[73]: print(result.summary()) # In[74]: result.rsquared # In[75]: result.resid.head() # In[76]: z, p = stats.normaltest(result.resid.values) # In[77]: p # In[78]: result.params # In[79]: fig, ax = plt.subplots(figsize=(8, 4)) smg.qqplot(result.resid, ax=ax) fig.tight_layout() fig.savefig("ch14-qqplot-model-1.pdf") # In[80]: model = smf.ols("y ~ x1 + x2 + x1*x2", data) # In[81]: result = model.fit() # In[82]: print(result.summary()) # In[83]: result.params # In[84]: result.rsquared # In[85]: z, p = stats.normaltest(result.resid.values) # In[86]: p # In[87]: fig, ax = plt.subplots(figsize=(8, 4)) smg.qqplot(result.resid, ax=ax) fig.tight_layout() fig.savefig("ch14-qqplot-model-2.pdf") # In[88]: x = np.linspace(-1, 1, 50) # In[89]: X1, X2 = np.meshgrid(x, x) # In[90]: new_data = pd.DataFrame({"x1": X1.ravel(), "x2": X2.ravel()}) # In[91]: y_pred = result.predict(new_data) # In[92]: y_pred.shape # In[93]: y_pred = y_pred.values.reshape(50, 50) # In[94]: fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True) def plot_y_contour(ax, Y, title): c = ax.contourf(X1, X2, Y, 15, cmap=plt.cm.RdBu) ax.set_xlabel(r"$x_1$", fontsize=20) ax.set_ylabel(r"$x_2$", fontsize=20) ax.set_title(title) cb = fig.colorbar(c, ax=ax) cb.set_label(r"$y$", fontsize=20) plot_y_contour(axes[0], y_true(X1, X2), "true relation") plot_y_contour(axes[1], y_pred, "fitted model") fig.tight_layout() fig.savefig("ch14-comparison-model-true.pdf") # ### Datasets from R # In[95]: dataset = sm.datasets.get_rdataset("Icecream", "Ecdat") # In[96]: dataset.title # In[97]: print(dataset.__doc__) # In[98]: dataset.data.info() # In[99]: model = smf.ols("cons ~ -1 + price + temp", data=dataset.data) # In[100]: result = model.fit() # In[101]: print(result.summary()) # In[102]: fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4)) smg.plot_fit(result, 0, ax=ax1) smg.plot_fit(result, 1, ax=ax2) fig.tight_layout() 
fig.savefig("ch14-regressionplots.pdf") # In[103]: fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4)) sns.regplot(dataset.data, x="price", y="cons", ax=ax1); sns.regplot(dataset.data, x="temp", y="cons", ax=ax2); fig.tight_layout() fig.savefig("ch14-regressionplots-seaborn.pdf") # ## Discrete regression, logistic regression # In[104]: df = sm.datasets.get_rdataset("iris").data # In[105]: df.info() # In[106]: df.Species.unique() # In[107]: df_subset = df[(df.Species == "versicolor") | (df.Species == "virginica" )].copy() # In[108]: df_subset = df[df.Species.isin(["versicolor", "virginica"])].copy() # In[109]: df_subset.Species = df_subset.Species.map({"versicolor": 1, "virginica": 0}) # In[110]: df_subset.rename(columns={"Sepal.Length": "Sepal_Length", "Sepal.Width": "Sepal_Width", "Petal.Length": "Petal_Length", "Petal.Width": "Petal_Width"}, inplace=True) # In[111]: df_subset.head(3) # In[112]: model = smf.logit("Species ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width", data=df_subset) # In[113]: result = model.fit() # In[114]: print(result.summary()) # In[115]: print(result.get_margeff().summary()) # **Note:** Sepal_Length and Sepal_Width do not seem to contribute much to predictiveness of the model. 
# In[116]: model = smf.logit("Species ~ Petal_Length + Petal_Width", data=df_subset) # In[117]: result = model.fit() # In[118]: print(result.summary()) # In[119]: print(result.get_margeff().summary()) # In[120]: params = result.params beta0 = -params['Intercept']/params['Petal_Width'] beta1 = -params['Petal_Length']/params['Petal_Width'] # In[121]: df_new = pd.DataFrame({"Petal_Length": np.random.randn(20)*0.5 + 5, "Petal_Width": np.random.randn(20)*0.5 + 1.7}) # In[122]: df_new["P-Species"] = result.predict(df_new) # In[123]: df_new["P-Species"].head(3) # In[124]: df_new["Species"] = (df_new["P-Species"] > 0.5).astype(int) # In[125]: df_new.head() # In[126]: fig, ax = plt.subplots(1, 1, figsize=(8, 4)) ax.plot(df_subset[df_subset.Species == 0].Petal_Length.values, df_subset[df_subset.Species == 0].Petal_Width.values, 's', label='virginica') ax.plot(df_new[df_new.Species == 0].Petal_Length.values, df_new[df_new.Species == 0].Petal_Width.values, 'o', markersize=10, color="steelblue", label='virginica (pred.)') ax.plot(df_subset[df_subset.Species == 1].Petal_Length.values, df_subset[df_subset.Species == 1].Petal_Width.values, 's', label='versicolor') ax.plot(df_new[df_new.Species == 1].Petal_Length.values, df_new[df_new.Species == 1].Petal_Width.values, 'o', markersize=10, color="green", label='versicolor (pred.)') _x = np.array([4.0, 6.1]) ax.plot(_x, beta0 + beta1 * _x, 'k') ax.set_xlabel('Petal length') ax.set_ylabel('Petal width') ax.legend(loc=2) fig.tight_layout() fig.savefig("ch14-logit.pdf") # ### Poisson distribution # In[127]: dataset = sm.datasets.get_rdataset("discoveries") # In[128]: df = dataset.data.set_index("time").rename(columns={"value": "discoveries"}) # In[129]: df.head(10).T # In[130]: fig, ax = plt.subplots(1, 1, figsize=(16, 4)) df.plot(kind='bar', ax=ax) fig.tight_layout() fig.savefig("ch14-discoveries.pdf") # In[131]: model = smf.poisson("discoveries ~ 1", data=df) # In[132]: result = model.fit() # In[133]: print(result.summary()) # In[134]: 
# The Poisson model is fit on the log scale, so exponentiate to get the rate.
lmbda = np.exp(result.params)

# In[135]:

X = stats.poisson(lmbda)

# In[136]:

result.conf_int()

# In[137]:

# Poisson distributions at the lower/upper confidence-interval bounds.
X_ci_l = stats.poisson(np.exp(result.conf_int().values)[0, 0])

# In[138]:

X_ci_u = stats.poisson(np.exp(result.conf_int().values)[0, 1])

# In[139]:

v, k = np.histogram(df.values, bins=12, range=(0, 12), density=True)

# In[140]:

fig, ax = plt.subplots(1, 1, figsize=(12, 4))
# Fixed legend label typo: "Dicoveries" -> "Discoveries".
ax.bar(k[:-1], v, color="steelblue", align='center', label='Discoveries per year')
ax.bar(k-0.125, X_ci_l.pmf(k), color="red", alpha=0.5, align='center', width=0.25, label='Poisson fit (CI, lower)')
ax.bar(k, X.pmf(k), color="green", align='center', width=0.5, label='Poisson fit')
ax.bar(k+0.125, X_ci_u.pmf(k), color="red", alpha=0.5, align='center', width=0.25, label='Poisson fit (CI, upper)')
ax.legend()
fig.tight_layout()
fig.savefig("ch14-discoveries-per-year.pdf")

# ## Time series

# In[141]:

# Unix-timestamped outdoor temperatures, resampled to hourly means.
df = pd.read_csv("temperature_outdoor_2014.tsv", header=None, delimiter="\t", names=["time", "temp"])
df.time = pd.to_datetime(df.time, unit="s")
df = df.set_index("time").resample("H").mean()

# In[142]:

df_march = df[df.index.month == 3]

# In[143]:

df_april = df[df.index.month == 4]

# In[144]:

df_march.plot(figsize=(12, 4));

# In[145]:

# Autocorrelation of the series and of successive differences; differencing
# removes trend and should reduce long-range correlation.
fig, axes = plt.subplots(1, 4, figsize=(12, 3))
smg.tsa.plot_acf(df_march.temp, lags=72, ax=axes[0])
smg.tsa.plot_acf(df_march.temp.diff().dropna(), lags=72, ax=axes[1])
smg.tsa.plot_acf(df_march.temp.diff().diff().dropna(), lags=72, ax=axes[2])
smg.tsa.plot_acf(df_march.temp.diff().diff().diff().dropna(), lags=72, ax=axes[3])
fig.tight_layout()
fig.savefig("ch14-timeseries-autocorrelation.pdf")

# In[146]:

# NOTE(review): the legacy statsmodels.tsa.arima_model module (old ARMA/ARIMA
# classes) was removed in statsmodels 0.14, and this notebook already uses the
# modern replacements (AutoReg and statsmodels.tsa.arima.model.ARIMA), so only
# ar_model is imported here.
from statsmodels.tsa import ar_model

# In[147]:

# sm.tsa.AR was removed in modern statsmodels; AutoReg (used below) replaces it.
#sm.tsa.AR

# In[148]:

#help(sm.tsa)

# In[149]:

# Autoregressive model with 72 hourly lags (three days of history).
model = ar_model.AutoReg(df_march.temp, lags=72)

# In[150]:

#model = sm.tsa.AR(df_march.temp)

# In[151]:

result = model.fit()

# In[152]:

# Durbin-Watson statistic near 2 indicates little remaining autocorrelation.
sm.stats.durbin_watson(result.resid)

# In[153]:

fig, ax = plt.subplots(1, 1, figsize=(8, 3))
smg.tsa.plot_acf(result.resid, lags=72, ax=ax)
fig.tight_layout()
fig.savefig("ch14-timeseries-resid-acf.pdf")

# In[154]:

# Forecast the first 72 hours of April and compare with the actual outcome.
fig, ax = plt.subplots(1, 1, figsize=(12, 4))
ax.plot(df_march.index.values[-72:], df_march.temp.values[-72:], label="train data")
ax.plot(df_april.index.values[:72], df_april.temp.values[:72], label="actual outcome")
ax.plot(pd.date_range("2014-04-01", "2014-04-4", freq="H").values,
        result.predict("2014-04-01", "2014-04-4"), label="predicted outcome")
ax.legend()
fig.tight_layout()
fig.savefig("ch14-timeseries-prediction.pdf")

# In[155]:

# Using ARMA model on daily average temperatures

# In[156]:

df_march = df_march.resample("D").mean()

# In[157]:

df_april = df_april.resample("D").mean()

# In[158]:

import statsmodels.tsa.arima

# In[159]:

statsmodels.tsa.arima.model.ARIMA

# In[160]:

#model = sm.tsa.ARMA(df_march, (4, 1))
# ARIMA(p, 0, q) is the modern equivalent of the removed ARMA(p, q) class.
model = statsmodels.tsa.arima.model.ARIMA(df_march.temp, order=(4, 0, 1))

# In[161]:

result = model.fit()

# In[162]:

# One-week daily forecast compared against the actual April temperatures.
fig, ax = plt.subplots(1, 1, figsize=(12, 4))
ax.plot(df_march.index.values[-7:], df_march.temp.values[-7:], 's-', label="train data")
ax.plot(df_april.index.values[:7], df_april.temp.values[:7], 's-', label="actual outcome")
ax.plot(pd.date_range("2014-04-01", "2014-04-7").values,
        result.predict("2014-04-01", "2014-04-7"), 's-', label="predicted outcome")
ax.legend()
fig.tight_layout()

# # Versions

# In[163]:

get_ipython().run_line_magic('reload_ext', 'version_information')

# In[164]:

get_ipython().run_line_magic('version_information', 'numpy, matplotlib, pandas, scipy, statsmodels, patsy')