## Overview

K近邻值算法 KNN (K — Nearest Neighbors) 是一种机器学习中的分类算法；K-NN是一种 非参数 的 惰性学习算法 。非参数意味着没有对基础数据分布的假设，即模型结构是从数据集确定的。

## 如何工作

KNN 算法是通过计算新对象与训练数据集中所有对象之间的距离，对新实例进行分类或回归预测。然后选择训练数据集中距离最小的 K 个示例，并通过平均结果进行预测。

## 如何计算距离

### 汉明距离

| 水果 | 二进制向量 |
| --- | --- |
| apple | 100 |
| banana | 010 |
| pineapple | 001 |

100 表示苹果，100就是苹果的二进制向量

010 表示香蕉，010就是香蕉的二进制向量

red = [1, 0, 0]
green = [0, 1, 0]
blue = [0, 0, 1]

$$Hamming\ Distance:\ d(a,b)=\sum_{i=1}^{n}\left[a_{i}\neq b_{i}\right]$$

def hammingDistance(a, b):
    """Return the Hamming distance: the number of positions where a and b differ.

    Compares elements with ``!=`` instead of ``abs(e1 - e2)``, so it is the
    true Hamming distance for any equal-length sequences (bit vectors,
    strings, ...), not just 0/1 vectors. For 0/1 vectors the result is
    identical to the old absolute-difference version.

    :param a: first sequence
    :param b: second sequence, same length as ``a``
    :raises ValueError: if the sequences have different lengths
    """
    if len(a) != len(b):
        raise ValueError("Undefined for sequences of unequal length.")
    return sum(e1 != e2 for e1, e2 in zip(a, b))

# Two one-hot rows that differ in their last two positions -> distance 2.
row1 = [0, 0, 0, 0, 0, 1]
row2 = [0, 0, 0, 0, 1, 0]
dist = hammingDistance(row1, row2)
print(dist)

# NOTE: scipy's hamming() returns the *fraction* of differing positions
# (count / length), not the raw count.
from scipy.spatial.distance import hamming

# sample bit vectors
row1 = [0, 0, 0, 0, 0, 1]
row2 = [0, 0, 0, 0, 1, 0]

# two of six positions differ -> 2/6
dist = hamming(row1, row2)
print(dist)

### 欧几里得距离

$$EuclideanDistance=\sqrt[]{\sum(a-b)^2}$$

$$EuclideanDistance=\sqrt{\sum_{i=1}^{N}\left(v1[i]-v2[i]\right)^{2}}$$

# Euclidean distance between two numeric vectors, by hand and via SciPy.
from math import sqrt
from scipy.spatial.distance import euclidean

def euclidean_distance(a, b):
    """Square root of the summed squared component differences of a and b."""
    total = 0.0
    for e1, e2 in zip(a, b):
        total += (e1 - e2) ** 2
    return sqrt(total)

# sample vectors
row1 = [10, 20, 15, 10, 5]
row2 = [12, 24, 18, 8, 7]

# the hand-rolled result should match scipy's
dist = euclidean_distance(row1, row2)
print(dist)
print(euclidean(row1, row2))

### 曼哈顿距离

$$Manhattandistance\ d(x,y)=\left|x_{1}-x_{2}\right|+\left|y_{1}-y_{2}\right|$$

$$King=|6-8|+|6-1| = 7$$

python中的公式可以表示为 ： sum(abs(val1-val2) for val1, val2 in zip(a,b))

# Manhattan (city block) distance, by hand and via SciPy.
from scipy.spatial.distance import cityblock

def manhattan_distance(a, b):
    """Sum of the absolute component differences |e1 - e2| of a and b."""
    total = 0
    for e1, e2 in zip(a, b):
        total += abs(e1 - e2)
    return total

# sample vectors
row1 = [10, 20, 15, 10, 5]
row2 = [12, 24, 18, 8, 7]

dist = manhattan_distance(row1, row2)
print(dist)
print(cityblock(row1, row2))   # scipy cross-check

### 闵可夫斯基距离

$$MinkowskiDistance=\left(\sum_{i=1}^{N}\left|v1[i]-v2[i]\right|^{p}\right)^{1/p}$$

p 是距离的阶参数（order）。当 $$p=1$$ 时，计算的是曼哈顿距离；当 $$p=2$$ 时，计算的是欧几里得距离。

# calculating minkowski distance between vectors
# Import SciPy's version under an alias: the original code gave the local
# function the same name, so the "use scipy" calls below actually ran the
# hand-rolled function instead of scipy's.
from scipy.spatial import minkowski_distance as scipy_minkowski_distance

def minkowski_distance(a, b, p):
    """Minkowski distance of order ``p`` between vectors a and b.

    p=1 reduces to the Manhattan distance, p=2 to the Euclidean distance.

    :param a: first vector
    :param b: second vector
    :param p: order of the distance (p >= 1)
    """
    return sum(abs(e1 - e2) ** p for e1, e2 in zip(a, b)) ** (1 / p)

# define data
row1 = [10, 20, 15, 10, 5]
row2 = [12, 24, 18, 8, 7]
# hand-rolled Minkowski distance
dist = minkowski_distance(row1, row2, 1)
# p=1 is the Manhattan distance
print(dist)
# p=2 is the Euclidean distance (the original comment wrongly said 1 here)
dist = minkowski_distance(row1, row2, 2)
print(dist)
# cross-check with the scipy.spatial implementation
print(scipy_minkowski_distance(row1, row2, 1))
print(scipy_minkowski_distance(row1, row2, 2))

## KNN算法实现

KNN在实现起来主要有三个步骤：

### 计算距离

def euclidean_distance(row1, row2):
    """Euclidean distance between two dataset rows.

    The last element of each row is assumed to be the class label,
    so it is deliberately excluded from the sum.
    """
    total = 0.0
    for a, b in zip(row1[:-1], row2):
        total += (a - b) ** 2
    return sqrt(total)

| X1 | X2 | Y |
| --- | --- | --- |
| 2.7810836 | 2.550537003 | 0 |
| 1.465489372 | 2.362125076 | 0 |
| 3.396561688 | 4.400293529 | 0 |
| 1.38807019 | 1.850220317 | 0 |
| 3.06407232 | 3.005305973 | 0 |
| 7.627531214 | 2.759262235 | 1 |
| 5.332441248 | 2.088626775 | 1 |
| 6.922596716 | 1.77106367 | 1 |
| 8.675418651 | -0.242068655 | 1 |
| 7.673756466 | 3.508563011 | 1 |

from math import sqrt

def euclidean_distance(row1, row2):
    """Distance between two rows; the trailing class-label column is skipped."""
    total = 0.0
    for a, b in zip(row1[:-1], row2):
        total += (a - b) ** 2   # squared per-feature difference
    return sqrt(total)          # back to the original scale

# toy dataset: two features per row, last column is the class label (0 or 1)
dataset = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 1],
    [5.332441248, 2.088626775, 1],
    [6.922596716, 1.77106367, 1],
    [8.675418651, -0.242068655, 1],
    [7.673756466, 3.508563011, 1],
]

# distance from the first row to every row (the first one is 0.0)
row0 = dataset[0]
for row in dataset:
    distance = euclidean_distance(row0, row)
    print(distance)

# 0.0
# 1.3290173915275787
# 1.9494646655653247
# 1.5591439385540549
# 0.5356280721938492
# 4.850940186986411
# 2.592833759950511
# 4.214227042632867
# 6.522409988228337
# 4.985585382449795

### 获取最近邻居

# Locate the most similar rows in the training data.
def get_neighbors(train, test_row, num_neighbors):
    """Return the ``num_neighbors`` rows of ``train`` closest to ``test_row``.

    :param train: list of rows (e.g. the training set)
    :param test_row: the new instance being classified
    :param num_neighbors: int, how many neighbours to return
    :return: list of the nearest rows, closest first
    """
    scored = [(row, euclidean_distance(test_row, row)) for row in train]
    scored.sort(key=lambda pair: pair[1])   # ascending by distance
    neighbors = []
    for rank in range(num_neighbors):
        neighbors.append(scored[rank][0])
    return neighbors

from math import sqrt

def euclidean_distance(row1, row2):
    """Distance between two rows; the trailing class-label column is skipped."""
    total = 0.0
    for a, b in zip(row1[:-1], row2):
        total += (a - b) ** 2   # squared per-feature difference
    return sqrt(total)

def get_neighbors(train, test_row, num_neighbors):
    """Return the ``num_neighbors`` rows of ``train`` closest to ``test_row``.

    :param train: list of rows (training data)
    :param test_row: the new instance being classified
    :param num_neighbors: int, how many neighbours to keep
    :return: list of the nearest rows, closest first
    """
    scored = [(row, euclidean_distance(test_row, row)) for row in train]
    scored.sort(key=lambda pair: pair[1])   # ascending by distance
    neighbors = []
    for rank in range(num_neighbors):
        neighbors.append(scored[rank][0])
    return neighbors

# toy dataset: two features plus a class label
dataset = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 1],
    [5.332441248, 2.088626775, 1],
    [6.922596716, 1.77106367, 1],
    [8.675418651, -0.242068655, 1],
    [7.673756466, 3.508563011, 1],
]

# the 3 nearest neighbours of the first row (itself included, distance 0)
neighbors = get_neighbors(dataset, dataset[0], 3)
for neighbor in neighbors:
    print(neighbor)
# [2.7810836, 2.550537003, 0]
# [3.06407232, 3.005305973, 0]
# [1.465489372, 2.362125076, 0]

### 预测结果

# Predict the class of a row by majority vote among its nearest neighbours.
def predict_classification(train, test_row, num_neighbors):
    """Classify ``test_row`` from its ``num_neighbors`` nearest training rows.

    :param train: list of rows (training data); last element is the label
    :param test_row: the new instance to classify
    :param num_neighbors: int, how many neighbours vote
    :return: the most common class label among the neighbours
    """
    nearest = get_neighbors(train, test_row, num_neighbors)
    votes = [row[-1] for row in nearest]            # neighbours' true labels
    # majority vote; count each distinct label and keep the most frequent
    return max(set(votes), key=votes.count)

from math import sqrt

def euclidean_distance(row1, row2):
    """Distance between two rows; the trailing class-label column is skipped."""
    total = 0.0
    for a, b in zip(row1[:-1], row2):
        total += (a - b) ** 2   # squared per-feature difference
    return sqrt(total)

def get_neighbors(train, test_row, num_neighbors):
    """Return the ``num_neighbors`` rows of ``train`` closest to ``test_row``.

    :param train: list of rows (training data)
    :param test_row: the new instance being classified
    :param num_neighbors: int, how many neighbours to keep
    :return: list of the nearest rows, closest first
    """
    scored = [(row, euclidean_distance(test_row, row)) for row in train]
    scored.sort(key=lambda pair: pair[1])   # ascending by distance
    neighbors = []
    for rank in range(num_neighbors):
        neighbors.append(scored[rank][0])
    return neighbors

def predict_classification(train, test_row, num_neighbors):
    """Classify ``test_row`` by majority vote of its nearest neighbours.

    :param train: list of rows (training data); last element is the label
    :param test_row: the new instance to classify
    :param num_neighbors: int, how many neighbours vote
    :return: the most common class label among the neighbours
    """
    nearest = get_neighbors(train, test_row, num_neighbors)
    votes = [row[-1] for row in nearest]    # neighbours' true labels
    return max(set(votes), key=votes.count)

# toy dataset: two features plus a class label
dataset = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 1],
    [5.332441248, 2.088626775, 1],
    [6.922596716, 1.77106367, 1],
    [8.675418651, -0.242068655, 1],
    [7.673756466, 3.508563011, 1],
]

# re-predict every row from the full dataset; all should match
for row in dataset:
    prediction = predict_classification(dataset, row, 5)
    print('Expected %d, Got %d.' % (row[-1], prediction))
# Expected 0, Got 0.
# Expected 0, Got 0.
# Expected 0, Got 0.
# Expected 0, Got 0.
# Expected 0, Got 0.
# Expected 1, Got 1.
# Expected 1, Got 1.
# Expected 1, Got 1.
# Expected 1, Got 1.
# Expected 1, Got 1.

## 鸢尾花种实例

### Start

from random import seed
from random import randrange
from csv import reader
from math import sqrt

# Load a CSV file into a list of rows.
def load_csv(filename):
    """Read ``filename`` as CSV and return a list of string-valued rows.

    Blank lines are skipped. (The original text lost the function header
    and the csv-reader loop; reconstructed here.)

    :param filename: path to the CSV file
    :return: list of rows, each a list of strings
    """
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset
# Convert one column from string to float so it can be used in arithmetic.
def str_column_to_float(dataset, column):
    """Strip whitespace and cast ``dataset[*][column]`` to float (in place)."""
    for row in dataset:
        text = row[column].strip()
        row[column] = float(text)
# Convert a class-label column to integer codes.
def str_column_to_int(dataset, column):
    """Map each distinct label in ``column`` to an int code (in place).

    Labels are sorted before numbering so the label -> code mapping is
    deterministic across runs; iterating a raw ``set`` of strings depends
    on string hashing, which is randomized per process.

    :param dataset: list of rows; mutated so the column holds int codes
    :param column: index of the class-label column
    :return: dict mapping original label -> assigned int code
    """
    class_values = [row[column] for row in dataset]
    unique = sorted(set(class_values))
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup
# Split a dataset into k folds for cross validation.
def cross_validation_split(dataset, n_folds):
    """Randomly partition ``dataset`` into ``n_folds`` equal folds.

    Rows are drawn without replacement; any remainder rows
    (``len(dataset) % n_folds``) are left out.

    :param dataset: list of rows
    :param n_folds: number of folds to produce
    :return: list of folds, each a list of rows
    """
    remaining = list(dataset)           # work on a copy; pop() consumes it
    fold_size = int(len(dataset) / n_folds)
    dataset_split = []
    for _ in range(n_folds):
        fold = []
        while len(fold) < fold_size:
            pick = randrange(len(remaining))
            fold.append(remaining.pop(pick))
        dataset_split.append(fold)
    return dataset_split
# Classification accuracy as a percentage.
def accuracy_metric(actual, predicted):
    """Percentage of positions where ``actual[i] == predicted[i]``."""
    correct = 0
    for i, truth in enumerate(actual):
        if truth == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0
# Score an algorithm with k-fold cross validation.
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Evaluate ``algorithm`` by n-fold cross validation.

    :param dataset: list of rows; the last element of each row is the label
    :param algorithm: callable(train, test, *args) -> list of predictions
    :param n_folds: int, number of folds
    :param args: extra positional arguments forwarded to ``algorithm``
    :return: list of accuracy percentages, one per fold
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for fold in folds:
        # training set = every fold except the held-out one, flattened
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        # test set = copies of the held-out rows with the label blanked out
        test_set = []
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None
            test_set.append(row_copy)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(accuracy_metric(actual, predicted))
    return scores
# Euclidean distance between two rows (the label column is excluded).
def euclidean_distance(row1, row2):
    """Square root of the summed squared feature differences."""
    acc = 0.0
    for j in range(len(row1) - 1):      # stop before the class label
        acc += (row1[j] - row2[j]) ** 2
    return sqrt(acc)
# Locate the most similar rows in the training data.
def get_neighbors(train, test_row, num_neighbors):
    """Return the ``num_neighbors`` training rows closest to ``test_row``.

    :param train: list of rows (training data)
    :param test_row: the new instance being classified
    :param num_neighbors: int, how many neighbours to keep
    :return: list of the nearest rows, closest first
    """
    scored = [(candidate, euclidean_distance(test_row, candidate)) for candidate in train]
    scored.sort(key=lambda item: item[1])   # ascending by distance
    nearest = []
    for rank in range(num_neighbors):
        nearest.append(scored[rank][0])
    return nearest
# Predict the class of a row by majority vote among its neighbours.
def predict_classification(train, test_row, num_neighbors):
    """Classify ``test_row`` from its ``num_neighbors`` nearest training rows.

    :param train: list of rows (training data); last element is the label
    :param test_row: the new instance to classify
    :param num_neighbors: int, how many neighbours vote
    :return: the most common class label among the neighbours
    """
    nearest = get_neighbors(train, test_row, num_neighbors)
    votes = [row[-1] for row in nearest]
    return max(set(votes), key=votes.count)
# kNN Algorithm: classify every row of the test set.
def k_nearest_neighbors(train, test, num_neighbors):
    """Return one predicted class label per row of ``test``."""
    return [predict_classification(train, row, num_neighbors) for row in test]
# Run KNN on the iris dataset with 5-fold cross validation.
seed(1)
filename = 'iris.csv'
# load the data — this call was missing: the file was named but never read,
# so `dataset` was undefined at this point
dataset = load_csv(filename)
# convert the feature columns from string to float
for i in range(len(dataset[0])-1):
    str_column_to_float(dataset, i)
# convert the class column to int codes
str_column_to_int(dataset, len(dataset[0])-1)
# evaluate the algorithm
n_folds = 5       # 5 folds
num_neighbors = 5 # vote among 5 neighbours
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
# Scores: [96.66666666666667, 96.66666666666667, 100.0, 90.0, 100.0]
# Mean Accuracy: 96.667%

# Show which int code each class label was assigned.
for code, cls in enumerate(unique):
    lookup[cls] = code
    print('[%s] => %d' % (cls, code))

# Scoring the whole dataset is no longer needed here
# scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors)
# print('Scores: %s' % scores)
# print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

# classify a brand-new observation
row = [5.7, 2.9, 4.2, 1.3]
label = predict_classification(dataset, row, num_neighbors)
print('Data=%s, Predicted: %s' % (row, label))
# Data=[5.7, 2.9, 4.2, 1.3], Predicted: 1

## Reference

- distance measures
- k nearest neighbors implement