#!/usr/bin/env python
# coding: utf-8

# # Working with Data in Python (Part 2)
#
# For the next example, we are going to be using the Python data analysis library `pandas`, which lets the user explore tabular datasets and perform complex search, indexing, statistical, and other operations.

# ## Example: World Population Growth
#
# Read in the UN population data.

# In[ ]:

import pandas as pd

data = pd.read_csv('Data/population.csv')

# Print the data.

# In[ ]:

data

# Print just the first few rows of data.

# In[ ]:

data.head()

# Print the column names.

# In[ ]:

data.columns

# ### Select columns

# In[ ]:

my_columns = ['Year', 'Series', 'Value']
data[my_columns].head()

# Select data based on a matching criterion.

# In[ ]:

year = 2005
data[data['Year'] == year].head(n=20)

# In[ ]:

series = "Population mid-year estimates (millions)"
data[data['Series'] == series]

# In[ ]:

# We can construct more complex matching criteria. Here we want all
# the mid-year population estimates for Canada.
query = (data["Region/Country/Area"] == "Canada") & \
        (data["Series"] == "Population mid-year estimates (millions)")
data[query]

# In[ ]:

# The same kind of criteria again, this time selecting all the
# mid-year population estimates for Germany.
query = (data["Region/Country/Area"] == "Germany") & \
        (data["Series"] == "Population mid-year estimates (millions)")
data[query]

# In[ ]:

world = pd.read_csv('Data/world_population.csv')

# In[ ]:

world.head(n=20)

# In[ ]:

# Reverse the row order so the years run in increasing order.
world = world[::-1]

# In[ ]:

high = world[world["Variant"] == "High"]
med = world[world["Variant"] == "Medium"]
low = world[world["Variant"] == "Low"]

# ### Plot the world population by year for the three scenarios

# In[ ]:

import matplotlib.pyplot as plt

# Get the data for each variant, store as arrays
years_h = high["Year(s)"].values
years_m = med["Year(s)"].values
years_l = low["Year(s)"].values

# Population is in thousands; convert to billions
pop_h = high["Value"].values / 1.0e6
pop_m = med["Value"].values / 1.0e6
pop_l = low["Value"].values / 1.0e6

# Plot population against years
plt.plot(years_l, pop_l)
plt.plot(years_m, pop_m)
plt.plot(years_h, pop_h)
plt.legend(["Low", "Medium", "High"])
plt.grid(True, alpha=0.3)
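# As one possible extension (a sketch: the column names `Region/Country/Area`, `Series`, `Year`, and `Value` are taken from the output above), we can pivot the mid-year estimates to compare the two countries queried earlier on a single plot. `pivot_table` is used so that any duplicated Year/country pairs are averaged instead of raising an error.

# In[ ]:

# Sketch: put Canada and Germany side by side.
countries = ["Canada", "Germany"]
estimates = data[data["Region/Country/Area"].isin(countries) &
                 (data["Series"] == "Population mid-year estimates (millions)")]

# Rows become years, columns become countries, cells hold the estimates.
by_year = estimates.pivot_table(index="Year",
                                columns="Region/Country/Area",
                                values="Value")
ax = by_year.plot(grid=True)
ax.set_ylabel("Population (millions)")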
# ## Learn More
#
# You can learn more about `pandas` by visiting the [homepage](https://pandas.pydata.org/).
#
# For a 10-minute tutorial, read "[10 minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html)".

# # Introduction to Data Mining
#
# As summarized on [Wikipedia](https://en.wikipedia.org/wiki/Data_mining), data mining involves six common classes of tasks:
#
# * **Anomaly detection** (outlier/change/deviation detection) – The identification of unusual data records that might be interesting, or data errors that require further investigation.
#
# * **Association rule learning** (dependency modelling) – Searches for relationships between variables. For example, a supermarket might gather data on customer purchasing habits. Using association rule learning, the supermarket can determine which products are frequently bought together and use this information for marketing purposes. This is sometimes referred to as market basket analysis.
#
# * **Clustering** – The task of discovering groups and structures in the data that are in some way or another "similar", without using known structures in the data.
#
# * **Classification** – The task of generalizing known structure to apply to new data. For example, an e-mail program might attempt to classify an e-mail as "legitimate" or as "spam".
#
# * **Regression** – Attempts to find a function that models the data with the least error, that is, to estimate the relationships among data or datasets.
#
# * **Summarization** – Providing a more compact representation of the data set, including visualization and report generation.

# # Clustering

# In[ ]:

get_ipython().run_line_magic('matplotlib', 'inline')

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Generate three well-separated blobs of points.
n_samples = 1500
blobs = datasets.make_blobs(n_samples=n_samples, centers=3, random_state=8)
X, y = blobs

# Normalize the dataset for easier parameter selection.
X = StandardScaler().fit_transform(X)

# In[ ]:

np.shape(X)

# In[ ]:

print(X[:10, :])

# In[ ]:

plt.scatter(X[:, 0], X[:, 1])

# ## Apply K-Means Clustering
#
# Read more about the nature of the algorithm [here](https://en.wikipedia.org/wiki/K-means_clustering).

# In[ ]:

kmeans = KMeans(n_clusters=3)
kmeans.fit(X)

# In[ ]:

predicted_categories = kmeans.predict(X)

# In[ ]:

# Plot each predicted cluster in its own colour.
for i in range(kmeans.n_clusters):
    plt.scatter(X[predicted_categories == i, 0],
                X[predicted_categories == i, 1],
                label='Category {}'.format(i))
plt.legend()

# In[ ]:

print(kmeans.cluster_centers_)

# ## Exercise: What if you didn't know the true number of clusters ahead of time? Could you somehow measure the quality of your model?
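# One way to approach the exercise (a sketch, not the only answer): refit K-Means for a range of cluster counts and compare each fit's silhouette score, which rewards tight, well-separated clusters. Higher is better, and for the three blobs generated above the score should peak at `k = 3`.

# In[ ]:

from sklearn.metrics import silhouette_score

# Score several candidate cluster counts.
for k in range(2, 7):
    labels = KMeans(n_clusters=k).fit_predict(X)
    print("k = {}: silhouette score = {:.3f}".format(k, silhouette_score(X, labels)))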
# # Outlier Detection

# In[ ]:

get_ipython().system('pip install -U scikit-learn')

# In[ ]:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor

# Generate training data: two clusters of 100 normal points each
X = 0.3 * np.random.randn(100, 2)
# Generate some abnormal novel observations
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
X = np.vstack([X + 2, X - 2, X_outliers])

# In[ ]:

plt.scatter(X[:, 0], X[:, 1], c='white', edgecolor='k', s=20)

# In[ ]:

# Fit the model; fit_predict labels each point as inlier (+1) or outlier (-1).
clf = LocalOutlierFactor(n_neighbors=20, contamination='auto')
y_pred = clf.fit_predict(X)
y_pred_outliers = y_pred[200:]  # labels for the 20 injected outliers

# Plot the level sets of the decision function. decision_function is only
# available when the estimator is fit with novelty=True, so fit a second
# copy of the model to score the grid points.
clf_novelty = LocalOutlierFactor(n_neighbors=20, contamination='auto',
                                 novelty=True)
clf_novelty.fit(X)
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf_novelty.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("Local Outlier Factor (LOF)")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
a = plt.scatter(X[:200, 0], X[:200, 1], c='white', edgecolor='k', s=20)
b = plt.scatter(X[200:, 0], X[200:, 1], c='red', edgecolor='k', s=20)
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend([a, b],
           ["normal observations", "abnormal observations"],
           loc="upper left")
plt.show()

# # Regression
#
# This example has been adapted from Haydar Ali Ismail's [Medium post](https://medium.com/@haydar_ai/learning-data-science-day-9-linear-regression-on-boston-housing-dataset-cd62a80775ef), titled "Learning Data Science: Day 9 - Linear Regression on Boston Housing Dataset".

# In[ ]:

import pandas as pd
# Note: load_boston was removed in scikit-learn 1.2; this cell requires an
# earlier version of the library.
from sklearn.datasets import load_boston

boston = load_boston()

# In[ ]:

boston

# In[ ]:

print(boston.keys())

# In[ ]:

print(boston.data.shape)

# In[ ]:

print(boston.feature_names)

# In[ ]:

print(boston.DESCR)

# In[ ]:

bos = pd.DataFrame(boston.data)
bos.columns = boston.feature_names

# In[ ]:

bos.head()

# In[ ]:

# The price
boston.target

# In[ ]:

bos['PRICE'] = boston.target

# In[ ]:

bos.head()

# ### Regression using a linear model

# In[ ]:

from sklearn.model_selection import train_test_split

X = bos.drop('PRICE', axis=1)
Y = bos['PRICE']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33,
                                                    random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

# In[ ]:

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# In[ ]:

import matplotlib.pyplot as plt

plt.scatter(Y_test, Y_pred)
plt.xlabel("True Prices")
plt.ylabel("Predicted Prices")
plt.title("True Prices vs Predicted Prices")

# In[ ]:

from sklearn.metrics import mean_squared_error

mse = mean_squared_error(Y_test, Y_pred)
print(mse)

# ### Regression using a Decision Tree

# In[ ]:

from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# In[ ]:

plt.scatter(Y_test, Y_pred)
plt.xlabel("True Prices")
plt.ylabel("Predicted Prices")
plt.title("True Prices vs Predicted Prices")

# In[ ]:

# Which factors does the tree consider most important?
plt.bar(range(len(boston.feature_names)), model.feature_importances_)
plt.xticks(range(len(boston.feature_names)), boston.feature_names,
           rotation='vertical');

# ### Reminder of what the factors were
#
# | Factor  | Description |
# | ------- | ----------- |
# | CRIM    | per capita crime rate by town |
# | ZN      | proportion of residential land zoned for lots over 25,000 sq.ft. |
# | INDUS   | proportion of non-retail business acres per town |
# | CHAS    | Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) |
# | NOX     | nitric oxides concentration (parts per 10 million) |
# | RM      | average number of rooms per dwelling |
# | AGE     | proportion of owner-occupied units built prior to 1940 |
# | DIS     | weighted distances to five Boston employment centres |
# | RAD     | index of accessibility to radial highways |
# | TAX     | full-value property-tax rate per \$10,000 |
# | PTRATIO | pupil-teacher ratio by town |
# | B       | 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town |
# | LSTAT   | \% lower status of the population |
# | MEDV    | Median value of owner-occupied homes in $1000's |
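# To make the comparison between the two regressors concrete, one possible final step (a sketch, reusing the train/test split from above) is to compute the test-set mean squared error for both models side by side. The `random_state=0` on the tree is only there to make the sketch reproducible.

# In[ ]:

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

# Refit both models on the training set and score them on the test set.
for name, reg in [("Linear regression", LinearRegression()),
                  ("Decision tree", DecisionTreeRegressor(random_state=0))]:
    reg.fit(X_train, Y_train)
    mse = mean_squared_error(Y_test, reg.predict(X_test))
    print("{}: test MSE = {:.2f}".format(name, mse))

# In[ ]: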