python实现k-means算法
时间:2022-10-18 11:53:11|栏目:Python代码|点击: 次
聚类属于无监督学习,K-means
算法是很典型的基于距离的聚类算法,采用距离作为相似性的评价指标,即认为两个对象的距离越近,其相似度就越大。该算法认为簇是由距离靠近的对象组成的,因此把得到紧凑且独立的簇作为最终目标。
下面来看看python实现k-means算法的详细代码吧:
# -*- coding:utf-8 -*- import random import numpy as np from matplotlib import pyplot class K_Means(object): # k是分组数;tolerance‘中心点误差';max_iter是迭代次数 def __init__(self, k=2, tolerance=0.0001, max_iter=300): self.k_ = k self.tolerance_ = tolerance self.max_iter_ = max_iter def fit(self, data): self.centers_ = {} for i in range(self.k_): self.centers_[i] = data[random.randint(0,len(data))] # print('center', self.centers_) for i in range(self.max_iter_): self.clf_ = {} #用于装归属到每个类中的点[k,len(data)] for i in range(self.k_): self.clf_[i] = [] # print("质点:",self.centers_) for feature in data: distances = [] #装中心点到每个点的距离[k] for center in self.centers_: # 欧拉距离 distances.append(np.linalg.norm(feature - self.centers_[center])) classification = distances.index(min(distances)) self.clf_[classification].append(feature) # print("分组情况:",self.clf_) prev_centers = dict(self.centers_) for c in self.clf_: self.centers_[c] = np.average(self.clf_[c], axis=0) # '中心点'是否在误差范围 optimized = True for center in self.centers_: org_centers = prev_centers[center] cur_centers = self.centers_[center] if np.sum((cur_centers - org_centers) / org_centers * 100.0) > self.tolerance_: optimized = False if optimized: break def predict(self, p_data): distances = [np.linalg.norm(p_data - self.centers_[center]) for center in self.centers_] index = distances.index(min(distances)) return index if __name__ == '__main__': x = np.array([[1, 2], [1.5, 1.8], [5, 8], [8, 8], [1, 0.6], [9, 11]]) k_means = K_Means(k=2) k_means.fit(x) for center in k_means.centers_: pyplot.scatter(k_means.centers_[center][0], k_means.centers_[center][1], marker='*', s=150) for cat in k_means.clf_: for point in k_means.clf_[cat]: pyplot.scatter(point[0], point[1], c=('r' if cat == 0 else 'b')) predict = [[2, 1], [6, 9]] for feature in predict: cat = k_means.predict(feature) pyplot.scatter(feature[0], feature[1], c=('r' if cat == 0 else 'b'), marker='x') pyplot.show()