#!/usr/bin/env python # coding: utf-8 # # Chapter 12: Data processing and analysis with `pandas` # Robert Johansson # # Source code listings for [Numerical Python - Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib](https://link.springer.com/book/10.1007/979-8-8688-0413-7) (ISBN 979-8-8688-0412-0). # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt # In[2]: import numpy as np # In[3]: import pandas as pd # In[4]: # pd.set_option('display.mpl_style', 'default') # In[5]: import matplotlib as mpl mpl.style.use('ggplot') # In[6]: import seaborn as sns # ## Series object # In[7]: s = pd.Series([909976, 8615246, 2872086, 2273305]) # In[8]: s # In[9]: type(s) # In[10]: s.dtype # In[11]: s.index # In[12]: s.values # In[13]: s.index = ["Stockholm", "London", "Rome", "Paris"] # In[14]: s.name = "Population" # In[15]: s # In[16]: s = pd.Series([909976, 8615246, 2872086, 2273305], index=["Stockholm", "London", "Rome", "Paris"], name="Population") # In[17]: s["London"] # In[18]: s.Stockholm # In[19]: s[["Paris", "Rome"]] # In[20]: s.median(), s.mean(), s.std() # In[21]: s.min(), s.max() # In[22]: s.quantile(q=0.25), s.quantile(q=0.5), s.quantile(q=0.75) # In[23]: s.describe() # In[24]: fig, axes = plt.subplots(1, 4, figsize=(12, 3.5)) s.plot(ax=axes[0], kind='line', title="line") s.plot(ax=axes[1], kind='bar', title="bar") s.plot(ax=axes[2], kind='box', title="box") s.plot(ax=axes[3], kind='pie', title="pie") fig.tight_layout() fig.savefig("ch12-series-plot.pdf") fig.savefig("ch12-series-plot.png") # ## DataFrame object # In[25]: df = pd.DataFrame([[909976, 8615246, 2872086, 2273305], ["Sweden", "United kingdom", "Italy", "France"]]) # In[26]: df # In[27]: df = pd.DataFrame([[909976, "Sweden"], [8615246, "United kingdom"], [2872086, "Italy"], [2273305, "France"]]) # In[28]: df # In[29]: df.index = ["Stockholm", "London", "Rome", "Paris"] # In[30]: df.columns = ["Population", "State"] # In[31]: df # In[32]: df = pd.DataFrame([[909976, "Sweden"], [8615246, "United kingdom"], [2872086, "Italy"], [2273305, "France"]], index=["Stockholm", "London", "Rome", "Paris"], columns=["Population", "State"]) # In[33]: df # In[34]: df = pd.DataFrame({"Population": [909976, 8615246, 2872086, 2273305], "State": ["Sweden", "United kingdom", "Italy", "France"]}, index=["Stockholm", "London", "Rome", "Paris"]) # In[35]: df # In[36]: df.index # In[37]: df.columns # In[38]: df.values # In[39]: df.Population # In[40]: df["Population"] # In[41]: type(df.Population) # In[42]: df.Population.Stockholm # In[43]: type(df.index) # In[44]: df.loc["Stockholm"] # In[45]: type(df.loc["Stockholm"]) # In[46]: df.loc[["Paris", "Rome"]] # In[47]: df.loc[["Paris", "Rome"], "Population"] # In[48]: df.loc["Paris", "Population"] # In[49]: df[["Population"]].mean() # In[50]: df.info() # In[51]: df.dtypes # In[52]: df.head() # In[58]: get_ipython().system('head -n5 european_cities.csv') # ## Larger dataset # In[59]: df_pop = pd.read_csv("european_cities.csv") # In[60]: df_pop.head() # In[61]: df_pop = pd.read_csv("european_cities.csv", delimiter=",", encoding="utf-8", header=0) # In[62]: df_pop.info() # In[63]: df_pop.head() # In[64]: df_pop["NumericPopulation"] = df_pop.Population.apply(lambda x: int(x.replace(",", ""))) # In[65]: df_pop["State"].values[:3] # In[66]: df_pop["State"] = df_pop["State"].apply(lambda x: x.strip()) # In[67]: df_pop.head() # In[68]: df_pop.dtypes # In[69]: df_pop2 = df_pop.set_index("City") # In[70]: df_pop2 = df_pop2.sort_index() # In[71]: df_pop2.head() # In[72]: df_pop2.head() # In[73]: df_pop3 = df_pop.set_index(["State", "City"]).sort_index(level=0) # In[74]: df_pop3.head(7) # In[75]: df_pop3.loc["Sweden"] # In[76]: df_pop3.loc[("Sweden", "Gothenburg")] # In[77]: df_pop.set_index("City").sort_values(["State", "NumericPopulation"], ascending=[False, True]).head() # In[78]: city_counts = df_pop.State.value_counts() # In[79]: city_counts.name = "# cities in top 105" # In[80]: df_pop3 = df_pop[["State", "City", "NumericPopulation"]].set_index(["State", "City"]) df_pop3 # In[81]: #df_pop3.sum(level="State") #df_pop4 = df_pop3.sum(level="State").sort_values("NumericPopulation", ascending=False) #df_pop4 # In[82]: df_pop4 = df_pop3.groupby(level="State").sum().sort_values("NumericPopulation", ascending=False) # In[83]: df_pop4.head() # In[84]: df_pop # In[85]: df_pop5 = (df_pop[["State", "NumericPopulation"]] .groupby("State").sum() .sort_values("NumericPopulation", ascending=False)) # In[86]: df_pop5 = (df_pop.drop("Rank", axis=1)#[["State", "NumericPopulation"]] .groupby("State").sum() .sort_values("NumericPopulation", ascending=False)) # In[87]: df_pop5.head() # In[88]: fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4)) city_counts.plot(kind='barh', ax=ax1) ax1.set_xlabel("# cities in top 105") df_pop5.NumericPopulation.plot(kind='barh', ax=ax2) ax2.set_xlabel("Total pop. in top 105 cities") fig.tight_layout() fig.savefig("ch12-state-city-counts-sum.pdf") # ## Time series # ### Basics # In[89]: import datetime # In[90]: pd.date_range("2015-1-1", periods=31) # In[91]: pd.date_range(datetime.datetime(2015, 1, 1), periods=31) # In[92]: pd.date_range("2015-1-1 00:00", "2015-1-1 12:00", freq="H") # In[93]: ts1 = pd.Series(np.arange(31), index=pd.date_range("2015-1-1", periods=31)) # In[94]: ts1.head() # In[95]: ts1["2015-1-3"] # In[96]: ts1.index[2] # In[97]: ts1.index[2].year, ts1.index[2].month, ts1.index[2].day # In[98]: ts1.index[2].nanosecond # In[99]: ts1.index[2].to_pydatetime() # In[100]: ts2 = pd.Series(np.random.rand(2), index=[datetime.datetime(2015, 1, 1), datetime.datetime(2015, 2, 1)]) # In[101]: ts2 # In[102]: periods = pd.PeriodIndex([pd.Period('2015-01'), pd.Period('2015-02'), pd.Period('2015-03')]) # In[103]: ts3 = pd.Series(np.random.rand(3), periods) # In[104]: ts3 # In[105]: ts3.index # In[106]: ts2.to_period('M') # In[107]: pd.date_range("2015-1-1", periods=12, freq="M").to_period() # ### Temperature time series example # In[108]: get_ipython().system('head -n 5 temperature_outdoor_2014.tsv') # In[109]: df1 = pd.read_csv('temperature_outdoor_2014.tsv', delimiter="\t", names=["time", "outdoor"]) # In[110]: df2 = pd.read_csv('temperature_indoor_2014.tsv', delimiter="\t", names=["time", "indoor"]) # In[111]: df1.head() # In[112]: df2.head() # In[113]: df1.time = (pd.to_datetime(df1.time.values, unit="s") .tz_localize('UTC').tz_convert('Europe/Stockholm')) # In[114]: df1 = df1.set_index("time") # In[115]: df2.time = (pd.to_datetime(df2.time.values, unit="s") .tz_localize('UTC').tz_convert('Europe/Stockholm')) # In[116]: df2 = df2.set_index("time") # In[117]: df1.head() # In[118]: df1.index[0] # In[119]: fig, ax = plt.subplots(1, 1, figsize=(12, 4)) df1.plot(ax=ax) df2.plot(ax=ax) fig.tight_layout() fig.savefig("ch12-timeseries-temperature-2014.pdf") # In[120]: # select january data # In[121]: df1.info() # In[122]: df1_jan = df1[(df1.index > "2014-1-1") & (df1.index < "2014-2-1")] # In[123]: df1.index < "2014-2-1" # In[124]: df1_jan.info() # In[125]: df2_jan = df2["2014-1-1":"2014-1-31"] # In[126]: fig, ax = plt.subplots(1, 1, figsize=(12, 4)) df1_jan.plot(ax=ax) df2_jan.plot(ax=ax) fig.tight_layout() fig.savefig("ch12-timeseries-selected-month.pdf") # In[127]: # group by month # In[128]: df1_month = df1.reset_index() # In[129]: df1_month["month"] = df1_month.time.apply(lambda x: x.month) # In[130]: df1_month.head() # In[131]: df1_month = df1_month[["month", "outdoor"]].groupby("month").aggregate(np.mean) # In[132]: df2_month = df2.reset_index() # In[133]: df2_month["month"] = df2_month.time.apply(lambda x: x.month) # In[134]: df2_month = df2_month[["month", "indoor"]].groupby("month").aggregate(np.mean) # In[135]: df1_month.head(4) # In[136]: df2_month.head(4) # In[137]: df_month = df1_month[["outdoor"]].join(df2_month[["indoor"]]) # In[138]: df_month.head(3) # In[139]: df_month = pd.concat([df.to_period("M").groupby(level=0).mean() for df in [df1, df2]], axis=1) # In[140]: df_month.head(3) # In[141]: fig, axes = plt.subplots(1, 2, figsize=(12, 4)) df_month.plot(kind='bar', ax=axes[0]) df_month.plot(kind='box', ax=axes[1]) fig.tight_layout() fig.savefig("ch12-grouped-by-month.pdf") # In[142]: df_month # In[143]: # resampling # In[144]: df1_hour = df1.resample("H").mean() # In[145]: df1_hour.columns = ["outdoor (hourly avg.)"] # In[146]: df1_day = df1.resample("D").mean() # In[147]: df1_day.columns = ["outdoor (daily avg.)"] # In[148]: df1_week = df1.resample("7D").mean() # In[149]: df1_week.columns = ["outdoor (weekly avg.)"] # In[150]: df1_month = df1.resample("M").mean() # In[151]: df1_month.columns = ["outdoor (monthly avg.)"] # In[152]: df_diff = (df1.resample("D").mean().outdoor - df2.resample("D").mean().indoor) # In[153]: fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 6)) df1_hour.plot(ax=ax1, alpha=0.25) df1_day.plot(ax=ax1) df1_week.plot(ax=ax1) df1_month.plot(ax=ax1) df_diff.plot(ax=ax2) ax2.set_title("temperature difference between outdoor and indoor") fig.tight_layout() fig.savefig("ch12-timeseries-resampled.pdf") # In[154]: pd.concat([df1.resample("5min").mean().rename(columns={"outdoor": 'None'}), df1.resample("5min").ffill().rename(columns={"outdoor": 'ffill'}), df1.resample("5min").bfill().rename(columns={"outdoor": 'bfill'})], axis=1).head() # ## Selected day # In[155]: df1_dec25 = df1[(df1.index < "2014-9-1") & (df1.index >= "2014-8-1")].resample("D") # In[156]: df1_dec25 = df1.loc["2014-12-25"] # In[157]: df1_dec25.head(5) # In[158]: df2_dec25 = df2.loc["2014-12-25"] # In[159]: df2_dec25.head(5) # In[160]: df1_dec25.describe().T # In[161]: fig, ax = plt.subplots(1, 1, figsize=(12, 4)) df1_dec25.plot(ax=ax) fig.savefig("ch12-timeseries-selected-month-12.pdf") # In[162]: df1.index # # Seaborn statistical visualization library # In[163]: sns.set(style="darkgrid") # In[164]: #sns.set(style="whitegrid") # In[165]: df1 = pd.read_csv('temperature_outdoor_2014.tsv', delimiter="\t", names=["time", "outdoor"]) df1.time = pd.to_datetime(df1.time.values, unit="s").tz_localize('UTC').tz_convert('Europe/Stockholm') df1 = df1.set_index("time").resample("10min").mean() df2 = pd.read_csv('temperature_indoor_2014.tsv', delimiter="\t", names=["time", "indoor"]) df2.time = pd.to_datetime(df2.time.values, unit="s").tz_localize('UTC').tz_convert('Europe/Stockholm') df2 = df2.set_index("time").resample("10min").mean() df_temp = pd.concat([df1, df2], axis=1) # In[166]: fig, ax = plt.subplots(1, 1, figsize=(8, 4)) df_temp.resample("D").mean().plot(y=["outdoor", "indoor"], ax=ax) fig.tight_layout() fig.savefig("ch12-seaborn-plot.pdf") # In[167]: #sns.kdeplot(df_temp["outdoor"].dropna().values, shade=True, cumulative=True); # In[168]: sns.histplot(df_temp.to_period("M")["outdoor"]["2014-04"].dropna().values, bins=50, kde=True) sns.histplot(df_temp.to_period("M")["indoor"]["2014-04"].dropna().values, bins=50, kde=True) plt.savefig("ch12-seaborn-distplot.pdf") # In[169]: with sns.axes_style("white"): sns.jointplot(x=df_temp.resample("H").mean()["outdoor"].values, y=df_temp.resample("H").mean()["indoor"].values, kind="hex"); plt.savefig("ch12-seaborn-jointplot.pdf") # In[170]: sns.kdeplot(x=df_temp.resample("H").mean()["outdoor"].dropna().values, y=df_temp.resample("H").mean()["indoor"].dropna().values, fill=False); plt.savefig("ch12-seaborn-kdeplot.pdf") # In[171]: fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4)) sns.boxplot(df_temp.dropna(), ax=ax1, palette="pastel") sns.violinplot(df_temp.dropna(), ax=ax2, palette="pastel") fig.tight_layout() fig.savefig("ch12-seaborn-boxplot-violinplot.pdf") # In[172]: sns.violinplot(x=df_temp.dropna().index.month, y=df_temp.dropna().outdoor, color="skyblue"); plt.savefig("ch12-seaborn-violinplot.pdf") # In[173]: df_temp["month"] = df_temp.index.month df_temp["hour"] = df_temp.index.hour # In[174]: df_temp.head() # In[175]: table = pd.pivot_table(df_temp, values='outdoor', index=['month'], columns=['hour'], aggfunc=np.mean) # In[176]: table # In[177]: fig, ax = plt.subplots(1, 1, figsize=(8, 4)) sns.heatmap(table, ax=ax); fig.tight_layout() fig.savefig("ch12-seaborn-heatmap.pdf") # ## Versions # In[178]: get_ipython().run_line_magic('reload_ext', 'version_information') # In[179]: get_ipython().run_line_magic('version_information', 'numpy, matplotlib, pandas, seaborn')