The k-Nearest Neighbors (kNN)
examples/ml/abalone.py
"""k-Nearest Neighbors (kNN) regression on the UCI abalone data set.

The script downloads the data set (once), fits a plain kNN regressor,
tunes ``n_neighbors`` and ``weights`` with a grid search and finally bags
the tuned regressor, printing the train/test RMSE along the way.
"""
import os
import shutil
from math import sqrt

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor


def get_files(timeout=30):
    """Download the abalone data and names files unless they already exist.

    Each file is stored under the last path segment of its URL, relative to
    the current working directory. A file that is already present is not
    downloaded again.

    Args:
        timeout: Seconds passed to ``requests.get`` as the request timeout,
            so an unresponsive server cannot block the script forever.

    Returns:
        A list with the two local filenames, data file first, names file
        second.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data"
    names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.names"

    filenames = []
    for url in (data_url, names_url):
        filename = url.split('/')[-1]
        filenames.append(filename)
        if os.path.exists(filename):
            continue

        # Download into a temporary name and rename afterwards, so an
        # interrupted download never leaves a truncated file under the
        # final name (which the existence check above would then accept).
        partial_name = filename + ".part"
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail loudly instead of saving an HTML error page as data.
            response.raise_for_status()
            with open(partial_name, 'wb') as fh:
                # iter_content() undoes gzip/deflate content encoding,
                # which reading response.raw directly does not.
                for chunk in response.iter_content(chunk_size=65536):
                    fh.write(chunk)
        os.replace(partial_name, filename)

    return filenames


def _rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions."""
    return sqrt(mean_squared_error(y_true, y_pred))


def main():
    """Run the kNN experiments and print the train/test RMSE values."""
    # Only the data file is read below; the names file is just downloaded.
    data_file, _names_file = get_files()

    columns = [
        "Sex",
        "Length",
        "Diameter",
        "Height",
        "Whole weight",
        "Shucked weight",
        "Viscera weight",
        "Shell weight",
        "Rings",
    ]
    # The data file is read without a header row; column names are supplied.
    df = pd.read_csv(data_file, names=columns)
    # print(df.head())

    # "Sex" is the only column dropped before fitting.
    df = df.drop("Sex", axis=1)
    # print(df.head())

    # df["Rings"].hist(bins=15)
    # plt.show()

    # correlation_matrix = df.corr()
    # print(correlation_matrix["Rings"])

    # "Rings" is the regression target, every remaining column is a feature.
    X = df.drop("Rings", axis=1).values
    y = df["Rings"].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Baseline: plain kNN with a fixed k of 3.
    knn_model = KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train)

    train_rmse = _rmse(y_train, knn_model.predict(X_train))
    print(train_rmse)  # 1.67

    test_predictions = knn_model.predict(X_test)
    test_rmse = _rmse(y_test, test_predictions)
    print(test_rmse)  # 2.36
    # That is the number of years as error between the prediction and the
    # actual value.
    # The test error being clearly higher than the train error looks like
    # overfitting.

    # cmap = sns.cubehelix_palette(as_cmap=True)
    # f, ax = plt.subplots()
    # # Length and Diameter, the two columns with strong correlation
    # points = ax.scatter(X_test[:, 0], X_test[:, 1], c=test_predictions, s=50, cmap=cmap)
    # f.colorbar(points)
    # plt.show()

    # cmap = sns.cubehelix_palette(as_cmap=True)
    # f, ax = plt.subplots()
    # # Length and Diameter, the two columns with strong correlation
    # points = ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, s=50, cmap=cmap)
    # f.colorbar(points)
    # plt.show()

    # Tuning Hyperparameters
    # What should be the value of k? k = 1 means you depend too much on a
    # potentially outlier neighbour.
    # If k is all the neighbours then for every prediction you will get the
    # same answer.
    # Look for the best value for k in the range of 1-50
    # parameters = {"n_neighbors": range(1, 50)}
    # gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
    # gscv = gridsearch.fit(X_train, y_train)
    # # print(gscv)
    # print(gridsearch.best_params_)  # {'n_neighbors': 17}
    # train_preds_grid = gridsearch.predict(X_train)
    # train_mse = mean_squared_error(y_train, train_preds_grid)
    # train_rmse = sqrt(train_mse)
    # test_preds_grid = gridsearch.predict(X_test)
    # test_mse = mean_squared_error(y_test, test_preds_grid)
    # test_rmse = sqrt(test_mse)
    # print(train_rmse)
    # print(test_rmse)

    # Weighted Average of Neighbors Based on Distance
    # Note: range(1, 50) tries k = 1..49 (the upper bound is exclusive).
    parameters = {
        "n_neighbors": range(1, 50),
        "weights": ["uniform", "distance"],
    }
    gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
    gridsearch.fit(X_train, y_train)
    # print(gridsearch.best_params_)  # {'n_neighbors': 17}
    # train_preds_grid = gridsearch.predict(X_train)
    # train_mse = mean_squared_error(y_train, train_preds_grid)
    # train_rmse = sqrt(train_mse)
    # test_preds_grid = gridsearch.predict(X_test)
    # test_mse = mean_squared_error(y_test, test_preds_grid)
    # test_rmse = sqrt(test_mse)
    # print(train_rmse)
    # print(test_rmse)

    # Bagging: fit 100 copies of the tuned kNN regressor as one ensemble.
    best_k = gridsearch.best_params_["n_neighbors"]
    best_weights = gridsearch.best_params_["weights"]
    bagged_knn = KNeighborsRegressor(
        n_neighbors=best_k, weights=best_weights
    )
    bagging_model = BaggingRegressor(bagged_knn, n_estimators=100)
    bagging_model.fit(X_train, y_train)

    train_rmse = _rmse(y_train, bagging_model.predict(X_train))
    test_rmse = _rmse(y_test, bagging_model.predict(X_test))
    print(train_rmse)
    print(test_rmse)


if __name__ == "__main__":
    main()