(revised 10/9/2023)

Topics covered in this notebook:

- The scikit-learn package and the tools it provides
- The Bunch object data format
- X and y data labels
- The train_test_split method
- The MinMaxScaler estimator to scale data
- The fit method
- The transform methods
- The KNeighborsClassifier and KNeighborsRegressor estimators to build K-NN classification and regression models
- The fit methods
- The predict methods
- The score methods
- ShuffleSplit class demo
- cross_val_score method demo
- np.zeros
- The idxmax() method
- datetime.now() and datetime.now().strftime()
To run this code you will need to install the scikit-learn and mglearn packages. You have probably already installed scikit-learn in your conda environment. If not, activate your environment and then use the following command:

conda install scikit-learn
mglearn is a package of utility functions to accompany the textbook Introduction to Machine Learning with Python, by Andreas Müller and Sarah Guido. The package has fallen out of date, so you will need to install the instructor's fork of the package by issuing the following command after activating your conda environment:
python3 -m pip install git+https://github.com/mhhaney/mglearn
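After installing, you can confirm that both packages are available with a quick import check (a minimal sketch; the printed version will vary with your environment):

import sklearn
import mglearn

# If either import fails, the corresponding package is not installed
# in the active environment
print("scikit-learn version:", sklearn.__version__)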
import pandas as pd
import numpy as np
import mglearn
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from datetime import datetime
import matplotlib.pyplot as plt
The K-nn method is very simple.

Here we have two classes, represented by blue circles and orange triangles, and two features, x and y. The stars represent new cases whose class is predicted from the classes of their nearest neighbors. Try out the code with different numbers of nearest neighbors.
mglearn.plots.plot_knn_classification(n_neighbors=3)
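To make the nearest-neighbor vote concrete, here is a minimal sketch written with plain NumPy rather than scikit-learn. The query point is hypothetical, chosen only for illustration.

import numpy as np
import mglearn

# Hand-rolled 3-NN classification of one point on the forge data
X, y = mglearn.datasets.make_forge()
query = np.array([9.0, 3.0])  # hypothetical new point

# Euclidean distance from the query to every known point
dists = np.sqrt(((X - query) ** 2).sum(axis=1))

# Indices of the 3 nearest points, then a majority vote on their classes
nearest = np.argsort(dists)[:3]
votes = y[nearest]
print("Neighbor classes:", votes, "-> predicted class:", np.bincount(votes).argmax())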
Note: This code is provided to demonstrate some of the general steps used to run one iteration of a K-nn classification model using scikit-learn. The steps are similar for all of the algorithms implemented by scikit-learn.
As we progress through the course we will add more sophisticated features to our model building and evaluation code. For example, here we use simple accuracy, the default metric produced by the score method for K-nn classification, to evaluate model performance. Later in the course we will learn about other performance measures, and we will also use scikit-learn's tools to test models over multiple iterations with varying levels of flexibility and multiple train-test splits.
# Generate a dataset to work with
X, y = mglearn.datasets.make_forge()
# Split the data into training data and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train
array([[ 8.92229526, -0.63993225],
       [ 8.7337095 ,  2.49162431],
       [ 9.32298256,  5.09840649],
       [ 7.99815287,  4.8525051 ],
       [11.0329545 , -0.16816717],
       [ 9.17748385,  5.09283177],
       [11.563957  ,  1.3389402 ],
       [ 9.15072323,  5.49832246],
       [ 8.34810316,  5.13415623],
       [11.93027136,  4.64866327],
       [ 8.1062269 ,  4.28695977],
       [ 8.67494727,  4.47573059],
       [ 9.67284681, -0.20283165],
       [ 9.50169345,  1.93824624],
       [ 8.69289001,  1.54322016],
       [ 9.96346605,  4.59676542],
       [ 9.50048972, -0.26430318],
       [ 9.25694192,  5.13284858],
       [ 8.68937095,  1.48709629]])
# Instantiate a scaler and use it to transform the X variables
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
Verify the results of the scaling
print('Results of min-max scaling with MinMaxScaler:')
print(f'Train Scaled Max: {X_train_scaled.max(axis = 0)}')
print(f'Train Scaled Min: {X_train_scaled.min(axis = 0)}')
print(f'Test Scaled Max: {X_test_scaled.max(axis = 0)}')
print(f'Test Scaled Min: {X_test_scaled.min(axis = 0)}')
Results of min-max scaling with MinMaxScaler:
Train Scaled Max: [1. 1.]
Train Scaled Min: [0. 0.]
Test Scaled Max: [0.90114405 0.95321771]
Test Scaled Min: [0.04720805 0.26566392]
How would we do the min-max scaling manually? Each column is shifted and rescaled using the training data's minimum and maximum: $x' = \frac{x - \min(x_{\text{train}})}{\max(x_{\text{train}}) - \min(x_{\text{train}})}$. Note that the test data is scaled with the training minima and maxima, just as MinMaxScaler does.
X_train_scaled_1 = (
(X_train - np.min(X_train, axis = 0))/
(np.max(X_train, axis = 0) - np.min(X_train, axis = 0))
)
X_test_scaled_1 = (
(X_test - np.min(X_train, axis = 0))/
(np.max(X_train, axis = 0) - np.min(X_train, axis = 0))
)
print('Results of manual min-max scaling:')
print(f'Train Scaled Max: {X_train_scaled_1.max(axis = 0)}')
print(f'Train Scaled Min: {X_train_scaled_1.min(axis = 0)}')
print(f'Test Scaled Max: {X_test_scaled_1.max(axis = 0)}')
print(f'Test Scaled Min: {X_test_scaled_1.min(axis = 0)}')
Results of manual min-max scaling:
Train Scaled Max: [1. 1.]
Train Scaled Min: [0. 0.]
Test Scaled Max: [0.90114405 0.95321771]
Test Scaled Min: [0.04720805 0.26566392]
# Instantiate the KNeighborsClassifier model
clf = KNeighborsClassifier(n_neighbors=3)
# Fit the model to the scaled training data
clf.fit(X_train_scaled, y_train)
KNeighborsClassifier(n_neighbors=3)
# Take a look at the predictions on the test data
print("Test set predictions:", clf.predict(X_test))
Test set predictions: [1 0 1 0 1 0 0]
# Take a look at the accuracy on the test data
print(f"Test set accuracy: {clf.score(X_test_scaled, y_test):.2f}")
Test set accuracy: 0.86
# Take a look at the accuracy on the training data
print(f"Training set accuracy: {clf.score(X_train_scaled, y_train):.2f}")
Training set accuracy: 0.95
The parameter $k$, the number of nearest neighbors used to perform the classification, controls the flexibility of the model to fit the training data. The lower the value of $k$, the more flexibly the model fits the training data. Higher values of $k$ produce smoother decision boundaries because the model is less flexible. The code below creates a graph that shows the decision boundary at different levels of $k$.
# Set value of k here:
k = 5
clf = KNeighborsClassifier(n_neighbors=k).fit(X, y)
mglearn.plots.plot_2d_separator(clf, X, fill=True, eps=0.5, alpha=.4)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
plt.title("{} neighbor(s)".format(k))
plt.xlabel("feature 0")
plt.ylabel("feature 1");
# Set levels of k here:
k1, k2 = 1, 7
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for n_neighbors, ax in zip([k1, k2], axes):
    # the fit method returns the object self, so we can instantiate
    # and fit in one line
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)
    mglearn.plots.plot_2d_separator(clf, X, fill=True, eps=0.5, ax=ax, alpha=.4)
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title("{} neighbor(s)".format(n_neighbors))
    ax.set_xlabel("feature 0")
    ax.set_ylabel("feature 1");
K-nn can also be used for regression models. In K-nn regression models the target values of the $k$ neighbors are averaged to calculate the predicted target value for the new data point. The code below creates an illustrative plot. Try it with $k$ set to different values.
mglearn.plots.plot_knn_regression(n_neighbors=3)
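To make the averaging concrete, here is a minimal NumPy sketch of a single K-nn regression prediction; the query point is hypothetical.

import numpy as np
import mglearn

# Hand-rolled 3-NN regression prediction on the wave data
X, y = mglearn.datasets.make_wave(n_samples=40)
query = 0.5  # hypothetical new point (one feature)

# Distance from the query to every training point
dists = np.abs(X[:, 0] - query)

# The prediction is simply the mean target of the 3 nearest neighbors
nearest = np.argsort(dists)[:3]
print("Predicted target:", y[nearest].mean())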
The basic steps to run a K-nn regression model using scikit-learn are similar to those for K-nn classification, but we use the KNeighborsRegressor estimator instead of KNeighborsClassifier to build the model. Note also that the default performance metric produced by the score method for regression models is $R^2$.
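As a quick check on that last point, here is a minimal sketch showing that score on a regressor matches an explicit $R^2$ computed with sklearn.metrics.r2_score:

from sklearn.metrics import r2_score

# Fit a small model and compare .score() with an explicit R-squared
X, y = mglearn.datasets.make_wave(n_samples=40)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
reg = KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train)

print(reg.score(X_test, y_test))              # default metric
print(r2_score(y_test, reg.predict(X_test)))  # same value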
The lower the value of $k$, the more flexibly the model fits the training data and the more jagged the prediction line becomes. Higher values of $k$ produce a smoother prediction line.
# Create dataset to use
X, y = mglearn.datasets.make_wave(n_samples=40)
# split the dataset into a training and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Set values of k:
k1, k2 = 1, 9
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
# create 1,000 data points, evenly spaced between -3 and 3
line = np.linspace(-3, 3, 1000).reshape(-1, 1)
for n_neighbors, ax in zip([k1, k2], axes):
    # make predictions using k1 or k2 neighbors
    reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    reg.fit(X_train, y_train)
    # ax.plot(line, reg.predict(line), linestyle='solid', marker='.')
    ax.plot(line, reg.predict(line), linestyle='None', marker='.')
    # Try linestyle='solid' (the commented line above), or keep
    # linestyle='None' and try marker=',', '.', or 'o'
    ax.plot(X_train, y_train, '^', c=mglearn.cm2(0), markersize=8)
    ax.plot(X_test, y_test, 'v', c=mglearn.cm2(1), markersize=8)
    ax.set_title(
        "{} neighbor(s)\n train score: {:.2f} test score: {:.2f}".format(
            n_neighbors, reg.score(X_train, y_train),
            reg.score(X_test, y_test)))
    ax.set_xlabel("Feature")
    ax.set_ylabel("Target")
axes[0].legend(["Model predictions", "Training data/target",
                "Test data/target"], loc="best");
Uses: Classification and Regression

Important Parameters: the number of neighbors and how distance between data points is measured (the metric parameter; see the sketch below)

Preprocessing: K-nn is distance-based, so features should be rescaled (e.g., with MinMaxScaler) to keep any one feature from dominating the distance calculation

Strengths: easy to understand and explain; often a reasonable baseline model with little tuning

Weaknesses: prediction can be slow when the training set is large; performs poorly on datasets with many features
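The metric parameter controls how distance is measured. A minimal sketch of swapping in Manhattan ("city block") distance instead of the default Euclidean (Minkowski with p=2); the resulting estimator is fit and scored exactly as before:

# Same estimator as before, but neighbors found by Manhattan distance
clf_manhattan = KNeighborsClassifier(n_neighbors=3, metric='manhattan')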
ca = fetch_california_housing()
print("California data shape:", ca.data.shape)
California data shape: (20640, 8)
type(ca)
sklearn.utils._bunch.Bunch
A scikit-learn Bunch is a container object similar to a Python dict, but its values can be accessed either by key, like a dict, or as an attribute via dot notation.
ca.keys()
dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])
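A quick demonstration that key access and attribute access return the same underlying object:

# Key access and attribute access point to the same array
print(ca['data'] is ca.data)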
print("California housing data features:", ca.feature_names)
California housing data features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
print(ca.DESCR)
.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bureau publishes sample data (a block group typically has a population
of 600 to 3,000 people).

A household is a group of people residing within a home. Since the average
number of rooms and bedrooms in this dataset are provided per household, these
columns may take surprisingly large values for block groups with few households
and many empty houses, such as vacation resorts.

It can be downloaded/loaded using the
:func:`sklearn.datasets.fetch_california_housing` function.

.. topic:: References

    - Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
      Statistics and Probability Letters, 33 (1997) 291-297
Use a nested loop structure to investigate values of $k$ from 1 to 15. Find the $k$ value that results in the best average performance on the test data, and then plot a fitting graph from your results.

Following are some suggestions for your code:

- Assign ca.data to X and ca.target to y
# Assign features and target to X and y
X = ca.data
y = ca.target
# Set k parameters to try and number of random splits
k_params = range(1, 16)
splits = 10
# Create matrices to hold results (Later will average for each level of k)
train_res = np.zeros(shape = (len(k_params), splits))
test_res = np.zeros(shape = (len(k_params), splits))
# Nested loop structure. Split in outer loop, loop through k values in inner loop
for split in range(splits):
    print(f"Split {split + 1} begun at: {datetime.now().strftime('%H:%M:%S')}")
    # Split into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    # Scale the data
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    for idx, k in enumerate(k_params):
        # print(f"\tInner loop for k = {k} at: {datetime.now().strftime('%H:%M:%S')}")
        # Create the regression model
        kreg = KNeighborsRegressor(n_neighbors = k)
        # Fit the regression model to the training data
        kreg.fit(X_train_scaled, y_train)
        # Score the model on training data and add to results matrix
        train_res[idx, split] = kreg.score(X_train_scaled, y_train)
        # Score the model on testing data and add to results matrix
        test_res[idx, split] = kreg.score(X_test_scaled, y_test)
Split 1 begun at: 16:29:34
Split 2 begun at: 16:29:57
Split 3 begun at: 16:30:12
Split 4 begun at: 16:30:29
Split 5 begun at: 16:30:44
Split 6 begun at: 16:30:59
Split 7 begun at: 16:31:11
Split 8 begun at: 16:31:29
Split 9 begun at: 16:31:47
Split 10 begun at: 16:32:06
test_res
array([[0.57124487, 0.57193477, 0.57736424, 0.5660242 , 0.54699752, 0.55836308, 0.56234234, 0.52760598, 0.54720095, 0.54070869],
       [0.6568089 , 0.66319512, 0.6685469 , 0.66310312, 0.63725812, 0.65217929, 0.64034797, 0.64335164, 0.65004834, 0.62199637],
       [0.69176718, 0.68735176, 0.70137021, 0.69313521, 0.66517096, 0.68342218, 0.67222414, 0.67462328, 0.68383452, 0.65917193],
       [0.70469892, 0.69888858, 0.71706331, 0.70566363, 0.6801516 , 0.69641067, 0.69042859, 0.68481444, 0.69538408, 0.67341649],
       [0.71115722, 0.70629394, 0.72099824, 0.71257938, 0.69281998, 0.70762275, 0.69599995, 0.6926459 , 0.70443819, 0.68261958],
       [0.71745817, 0.71135151, 0.72171082, 0.71721704, 0.69734842, 0.71018751, 0.70215909, 0.69880414, 0.70959369, 0.68813056],
       [0.71797995, 0.71329435, 0.72487138, 0.72027707, 0.69844044, 0.71396668, 0.70228502, 0.69770926, 0.71005981, 0.68964527],
       [0.71451594, 0.71266779, 0.72494049, 0.721158  , 0.69641529, 0.71426884, 0.70312983, 0.69852021, 0.71034668, 0.68945944],
       [0.71390694, 0.71337482, 0.72405984, 0.72174315, 0.69559311, 0.71378202, 0.70320287, 0.69783645, 0.7102652 , 0.68799641],
       [0.71323223, 0.71339485, 0.72343651, 0.72109607, 0.69665628, 0.7118084 , 0.70202909, 0.69700932, 0.71028156, 0.68966634],
       [0.71432852, 0.71362243, 0.72439866, 0.72084815, 0.69610252, 0.71108463, 0.70359966, 0.69753412, 0.70814185, 0.69137938],
       [0.71327918, 0.71235254, 0.72319724, 0.71995588, 0.69538375, 0.71081124, 0.70305607, 0.69740718, 0.70813811, 0.6908178 ],
       [0.71201349, 0.71252905, 0.72268328, 0.71825662, 0.69468954, 0.71013962, 0.70324393, 0.69641268, 0.70691063, 0.69133408],
       [0.71043727, 0.71029394, 0.7212115 , 0.71775701, 0.69361195, 0.70927653, 0.70319254, 0.69461032, 0.7065832 , 0.69148451],
       [0.71037325, 0.70977556, 0.72033535, 0.7179371 , 0.69288339, 0.70867711, 0.70170677, 0.6939255 , 0.70649609, 0.69061048]])
# Put results into a dataframe
results = pd.DataFrame({'k': k_params,
'train_r2': train_res.mean(axis = 1),
'test_r2': test_res.mean(axis = 1)})
# View the results
results.sort_values(by = 'k', ascending = False)
|    | k  | train_r2 | test_r2  |
|----|----|----------|----------|
| 14 | 15 | 0.742191 | 0.705272 |
| 13 | 14 | 0.745486 | 0.705846 |
| 12 | 13 | 0.748870 | 0.706821 |
| 11 | 12 | 0.752858 | 0.707440 |
| 10 | 11 | 0.757076 | 0.708104 |
| 9  | 10 | 0.762191 | 0.707861 |
| 8  | 9  | 0.767842 | 0.708176 |
| 7  | 8  | 0.774534 | 0.708542 |
| 6  | 7  | 0.782008 | 0.708853 |
| 5  | 6  | 0.790715 | 0.707396 |
| 4  | 5  | 0.801941 | 0.702718 |
| 3  | 4  | 0.818127 | 0.694692 |
| 2  | 3  | 0.842875 | 0.681207 |
| 1  | 2  | 0.886407 | 0.649684 |
| 0  | 1  | 1.000000 | 0.556979 |
# Find the k resulting in the highest Mean R-squared
results.iloc[results.test_r2.idxmax()]
k           7.000000
train_r2    0.782008
test_r2     0.708853
Name: 6, dtype: float64
A fitting graph has a performance measure on one axis and a parameter that controls model flexibility on the other axis. Performance on both training and test data are plotted across a range of the flexibility parameter. The performance on training data typically increases as the model becomes more flexible. The performance on test data typically increases at first as the model becomes more flexible, reaches a point of best performance, and then decreases as the model's increasing flexibility causes it to overfit the training data and underperform on the test data.
# Create the fitting graph
plt.plot(results['k'], results['train_r2'])
plt.plot(results['k'], results['test_r2'])
plt.xlim(max(results['k']), min(results['k']))
plt.title("Fitting Graph for K-NN Regression")
plt.xlabel("# of Neighbors (K)")
plt.ylabel("R-Squared")
plt.legend(["Train Performance", "Test Performance"], loc="best");
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Put the features into a DataFrame
ca.feature_names
ca_df = pd.DataFrame(ca.data, columns = ca.feature_names)
ca_df.head()
features = ca_df
target = ca.target
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=0)
Re-create the nested loop structure code to find the best $k$, but use scikit-learn tools.
# A column transformer does various data pre-processing steps
ct = make_column_transformer(
(MinMaxScaler(), ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms',
'Population', 'AveOccup', 'Latitude', 'Longitude'])
)
# A pipeline puts the preprocessing and model building together into a sequence
pipe = Pipeline([('preprocessing', ct),
('knn', KNeighborsRegressor())])
# Specify the parameters to be tested
param_grid = {'knn__n_neighbors': np.arange(1, 18)}
# Create an object to define a number of random train-test splits
shuffle_split = ShuffleSplit(test_size=.25,
train_size=.75,
n_splits=10)
# Define a grid search using the defined pipe, parameter grid, and random splits
grid_search = GridSearchCV(pipe, param_grid, cv=shuffle_split, return_train_score = True)
# Perform the grid search on the training data
grid_search.fit(X_train, y_train)
GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=None, test_size=0.25, train_size=0.75), estimator=Pipeline(steps=[('preprocessing', ColumnTransformer(transformers=[('minmaxscaler', MinMaxScaler(), ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'])])), ('knn', KNeighborsRegressor())]), param_grid={'knn__n_neighbors': array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17])}, return_train_score=True)
# The grid search object has various properties and methods
print("Best estimator:\n{}".format(grid_search.best_estimator_))
print("Best parameters: {}".format(grid_search.best_params_))
print("Test set score: {:.3f}".format(grid_search.score(X_test, y_test)))
Best estimator:
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('minmaxscaler',
                                                  MinMaxScaler(),
                                                  ['MedInc', 'HouseAge',
                                                   'AveRooms', 'AveBedrms',
                                                   'Population', 'AveOccup',
                                                   'Latitude', 'Longitude'])])),
                ('knn', KNeighborsRegressor(n_neighbors=9))])
Best parameters: {'knn__n_neighbors': 9}
Test set score: 0.697
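cross_val_score was imported at the top of the notebook but has not been used. As a minimal sketch, it offers a lighter-weight alternative to GridSearchCV when scores are needed for just one parameter setting over the same random splits (here, the pipeline's default n_neighbors=5):

# Cross-validated R-squared for the pipeline as-is, with no grid search
scores = cross_val_score(pipe, X_train, y_train, cv=shuffle_split)
print("Mean R-squared across splits:", scores.mean())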
# Convert the cross-validation results to a DataFrame
results = pd.DataFrame(grid_search.cv_results_)
results.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 17 entries, 0 to 16 Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mean_fit_time 17 non-null float64 1 std_fit_time 17 non-null float64 2 mean_score_time 17 non-null float64 3 std_score_time 17 non-null float64 4 param_knn__n_neighbors 17 non-null object 5 params 17 non-null object 6 split0_test_score 17 non-null float64 7 split1_test_score 17 non-null float64 8 split2_test_score 17 non-null float64 9 split3_test_score 17 non-null float64 10 split4_test_score 17 non-null float64 11 split5_test_score 17 non-null float64 12 split6_test_score 17 non-null float64 13 split7_test_score 17 non-null float64 14 split8_test_score 17 non-null float64 15 split9_test_score 17 non-null float64 16 mean_test_score 17 non-null float64 17 std_test_score 17 non-null float64 18 rank_test_score 17 non-null int32 19 split0_train_score 17 non-null float64 20 split1_train_score 17 non-null float64 21 split2_train_score 17 non-null float64 22 split3_train_score 17 non-null float64 23 split4_train_score 17 non-null float64 24 split5_train_score 17 non-null float64 25 split6_train_score 17 non-null float64 26 split7_train_score 17 non-null float64 27 split8_train_score 17 non-null float64 28 split9_train_score 17 non-null float64 29 mean_train_score 17 non-null float64 30 std_train_score 17 non-null float64 dtypes: float64(28), int32(1), object(2) memory usage: 4.2+ KB
# Take a look at key parts of the results
(
results.loc[:, ['param_knn__n_neighbors', 'mean_train_score',
'mean_test_score', 'rank_test_score']]
.sort_values(by = 'param_knn__n_neighbors', ascending = False)
)
|    | param_knn__n_neighbors | mean_train_score | mean_test_score | rank_test_score |
|----|------------------------|------------------|-----------------|-----------------|
| 16 | 17                     | 0.731679         | 0.701505        | 13              |
| 15 | 16                     | 0.734306         | 0.702431        | 12              |
| 14 | 15                     | 0.737405         | 0.703272        | 10              |
| 13 | 14                     | 0.740728         | 0.704099        | 9               |
| 12 | 13                     | 0.744546         | 0.704924        | 8               |
| 11 | 12                     | 0.748627         | 0.706060        | 6               |
| 10 | 11                     | 0.753026         | 0.706891        | 5               |
| 9  | 10                     | 0.758118         | 0.707918        | 3               |
| 8  | 9                      | 0.763607         | 0.708190        | 1               |
| 7  | 8                      | 0.770570         | 0.707985        | 2               |
| 6  | 7                      | 0.778313         | 0.707078        | 4               |
| 5  | 6                      | 0.788341         | 0.705718        | 7               |
| 4  | 5                      | 0.800410         | 0.702874        | 11              |
| 3  | 4                      | 0.816322         | 0.695433        | 14              |
| 2  | 3                      | 0.840565         | 0.681366        | 15              |
| 1  | 2                      | 0.884693         | 0.650012        | 16              |
| 0  | 1                      | 1.000000         | 0.546933        | 17              |
# Create the fitting graph
plt.plot(results['param_knn__n_neighbors'], results['mean_train_score'])
plt.plot(results['param_knn__n_neighbors'], results['mean_test_score'])
plt.xlim(max(results['param_knn__n_neighbors']), min(results['param_knn__n_neighbors']))
plt.title("Fitting Graph for K-NN Regression")
plt.xlabel("# of Neighbors (K)")
plt.ylabel("R-Squared")
plt.legend(["Train Performance", "Test Performance"], loc="best");