K-nearest neighbors (KNN) is one of the most basic classification algorithms and a reasonable first choice when you have no prior knowledge about the data. It classifies a test sample by the majority label of its nearest training samples, with nearness measured (by default) as the Euclidean distance between feature vectors.
We will use the digits dataset from sklearn to practice using KNN to predict the true value of handwritten digits.
Lecture notes for the theory and math are here: http://george1328.github.io/lecture_notes/KNN.pdf
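To make the distance idea concrete, here is a minimal sketch (an addition, not part of the original notebook) of a 1-nearest-neighbor prediction done by hand with numpy on made-up toy data:

import numpy as np

# toy training set: four 2-feature samples and their class labels
X_toy = np.array([[0.0, 0.0], [0.1, 0.2], [1.0, 1.1], [0.9, 1.0]])
y_toy = np.array([0, 0, 1, 1])

test_point = np.array([0.95, 1.05])

# Euclidean distance from the test point to every training sample
dists = np.sqrt(((X_toy - test_point) ** 2).sum(axis=1))

# with k=1 the prediction is simply the label of the closest sample
print(y_toy[np.argmin(dists)])  # -> 1

KNeighborsClassifier below does the same thing, except that it looks at the k closest samples and takes a majority vote.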
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import load_digits
digits = load_digits()
digits.keys()
print(digits.DESCR)
print(digits.data.shape)    # flattened feature vectors, one row per image
print(digits.images.shape)  # the original 8x8 pixel images
print(digits.target.shape)  # the digit label for each image
#view some images
fig = plt.figure(figsize=(6, 6)) # figure size in inches
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
# plot the digits: each image is 8x8 pixels
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(digits.images[i], cmap=plt.cm.binary, interpolation='nearest')
    # label the image with the target value
    ax.text(0, 7, str(digits.target[i]))
X = digits.data
y = digits.target
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
model.score(X_test, y_test)
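The score is the overall accuracy on the held-out test set. To see which digits the model actually confuses, a small addition like the following (using sklearn.metrics, not part of the original notebook) prints a confusion matrix:

from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
# rows are true digits, columns are predicted digits;
# off-diagonal entries show where the classifier goes wrong
print(confusion_matrix(y_test, y_pred))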
from sklearn.model_selection import KFold

def cross_validate(X, y, classifier, k_fold):
    # derive a set of (random) training and testing indices
    k_fold_indices = KFold(n_splits=k_fold, shuffle=True, random_state=0).split(X)
    k_score_total = 0
    # for each training/testing split, fit the classifier and score the result
    for train_slice, test_slice in k_fold_indices:
        model = classifier(X[train_slice], y[train_slice])
        k_score = model.score(X[test_slice], y[test_slice])
        k_score_total += k_score
    # return the average accuracy over the folds
    return k_score_total / k_fold
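For comparison, sklearn ships its own helper for this; a sketch using cross_val_score (an addition, assuming the same X, y and a fixed k=3) would be:

from sklearn.model_selection import cross_val_score

scores = cross_val_score(KNeighborsClassifier(n_neighbors=3), X, y, cv=10)
print(scores.mean())  # mean accuracy over the 10 folds

Its default fold strategy differs from the shuffled KFold used above, so the numbers will not match exactly.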
knn_values = np.arange(2, 20)
knn_results = []
for k in knn_values:
    knn_results.append(cross_validate(X, y, KNeighborsClassifier(n_neighbors=k).fit, 10))
plt.plot(knn_values, knn_results)
plt.xlabel('number of neighbors (k)')
plt.ylabel('mean cross-validated accuracy')
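To read off the best k programmatically rather than from the plot, one more line (assuming knn_values and knn_results from above) does it:

best_k = knn_values[np.argmax(knn_results)]
print(best_k, max(knn_results))  # k with the highest mean cross-validated accuracy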