基于Python的实战聚类代码(Kmeans和SOM)
在肘部法则中,我们关注的是随着簇数增加,Inertia(簇内平方和)的变化情况。通常情况下,Inertia值越低表示数据点更接近其簇中心,但并非绝对。在肘部法则中,我们寻找的是Inertia急剧下降并趋于平缓的点,这个点可能是最佳的聚类数量。因此,对于肘部法则,我们要找到Inertia值下降幅度显著变小,即形成一个“肘部”样的点,这时候增加簇数不再显著地降低Inertia。
(1)Kmeans
基于的是scikit-learn库中的K-means包,官网链接:
sklearn.cluster.KMeans — scikit-learn 1.3.2 documentation
直接上实战代码:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import xarray as xr
import pandas as pd
from scipy import interpolate
from sklearn.cluster import KMeans
#===========================读取数据
num_clusters = 3 #设定聚类的类别数
Data = np.load('Precip.npz') #这是自己生成的数据 可以自行替换
precip = Data['precip'][:,:,3] #聚类数据是2维 [samples, variables]
preciptime = Data['preciptime']
# ====实战的数据降水里面有部分数据是缺测 32700.00 插值一下 这部分可注释
precip[precip >= 1000] = np.nan
precipInterp = []
precipInterpTime = []
for id1 in range(precip.shape[0]):
precipEach = pd.DataFrame(precip[id1])
precipEach= precipEach.interpolate(method='linear', limit_direction='both')
precipEach_ = precipEach.values.squeeze() # (712)
if np.sum(np.isnan(precipEach_)) == 0:
precipInterp.append(precipEach_)
precipInterpTime.append(preciptime)
precipInterp = np.array(precipInterp)
precipInterpTime = np.array(precipInterpTime)
precipInterpAnomaly = precipInterp - np.mean(precipInterp, axis = 0)
precipInterpClimatology = np.mean(precipInterp, axis=0)
kmeans = KMeans(n_clusters=num_clusters, init='k-means++', n_init='auto', max_iter=300, tol=0.0001, verbose=0, random_state=6, copy_x=True, algorithm='lloyd')
kmeans.fit(precipInterpAnomaly)
# 获取聚类的中心
cluster_centers = kmeans.cluster_centers_
# 获取你要的数据最终聚类的类别
cluster_labels = kmeans.predict(precipInterpAnomaly)
#===下面这段代码是分析聚类的各类之间相关性,这部分可注释
#==计算两两相关
import seaborn as sns
correlation_matrix = np.corrcoef(cluster_centers)
# 绘制相关性矩阵
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix of Clusters')
plt.savefig('Correlation_%sCluster_Kmeans_Anomaly.png'%num_clusters)
对于Kmeans聚类,其中一个关键是聚类数K如何确定:(1)根据经验确定一个合适的K。(2)根据肘部法则:在肘部法则中,我们关注的是随着簇数增加,Inertia(簇内平方和)的变化情况。通常情况下,Inertia值越低表示数据点更接近其簇中心,但并非绝对。在肘部法则中,我们寻找的是Inertia急剧下降并趋于平缓的点,这个点可能是最佳的聚类数量。因此,对于肘部法则,我们要找到Inertia值下降幅度显著变小,即形成一个“肘部”样的点,这时候增加簇数不再显著地降低Inertia。
代码:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import xarray as xr
import pandas as pd
from scipy import interpolate
def GetKByInertia(ks):
'''
在肘部法则中,我们关注的是随着簇数增加,Inertia(簇内平方和)的变化情况。通常情况下,Inertia值越低表示数据点更接近其簇中心,但并非绝对。
在肘部法则中,我们寻找的是Inertia急剧下降并趋于平缓的点,这个点可能是最佳的聚类数量。
因此,对于肘部法则,我们要找到Inertia值下降幅度显著变小,即形成一个“肘部”样的点,这时候增加簇数不再显著地降低Inertia。
ks: 从1-ks类中挑选一个最优的
'''
#===========================读取测站数据
Data = np.load('Precip.npz') #站点降水单位 mm
precip = Data['precip'][:,:,3]
preciptime = Data['preciptime']
# ====降水里面有部分数据是缺测 32700.00 插值一下
precip[precip >= 1000] = np.nan
precipInterp = []
precipInterpTime = []
for id1 in range(precip.shape[0]):
precipEach = pd.DataFrame(precip[id1])
precipEach= precipEach.interpolate(method='linear', limit_direction='both')
precipEach_ = precipEach.values.squeeze() # (712)
if np.sum(np.isnan(precipEach_)) == 0:
precipInterp.append(precipEach_)
precipInterpTime.append(preciptime)
precipInterp = np.array(precipInterp)
precipInterpTime = np.array(precipInterpTime)
precipInterpAnomaly = precipInterp - np.mean(precipInterp, axis = 0)
# 计算不同聚类数量下的簇内平方和(inertia)
inertia = []
for i in range(1, ks): # 尝试不同数量的簇,这里设置为尝试 1 到 10 个簇
t0 = time.time()
kmeans = KMeans(n_clusters=i, n_init='auto')
kmeans.fit(precipInterpAnomaly)
inertia.append(kmeans.inertia_)
t1 = time.time()
print('%s is done! | 耗时%.3f' % (i, (t1-t0)/60))
# 使用肘部法则找到最佳的聚类数量
plt.plot(range(1, ks, 1), inertia)
plt.xticks(range(1, ks, 1))
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.savefig('Kmeans_Inertia.png')
(2)SOM
自组织特征映射网络SOFM又称自组织映射网络SOM,是一种自组织竞争神经网络,一个神经网络接受外界输入模式时,将会分为不同的对应区域,各区域对输入模式具有不同的响应特征,而且这个过程是自动完成的。
本代码是基于minisom完成的,其官网为:https://github.com/JustGlowing/minisom/tree/master/examples
跟Kmeans的数据一样,直接上代码:
if __name__ == "__main__":
import numpy as np
import xarray as xr
import pandas as pd
from scipy import interpolate
from datetime import datetime, timedelta
from minisom import MiniSom
import matplotlib.pyplot as plt
#===========================读取测站数据
Data = np.load('Precip.npz') #站点降水单位 mm
precip = Data['precip'][:,:,3]
preciptime = Data['preciptime']
# ====降水里面有部分数据是缺测 32700.00 插值一下
precip[precip >= 1000] = np.nan
precipInterp = []
precipInterpTime = []
for id1 in range(precip.shape[0]):
precipEach = pd.DataFrame(precip[id1])
precipEach= precipEach.interpolate(method='linear', limit_direction='both')
precipEach_ = precipEach.values.squeeze() # (712)
if np.sum(np.isnan(precipEach_)) == 0:
precipInterp.append(precipEach_)
precipInterpTime.append(preciptime)
precipInterp = np.array(precipInterp)
precipInterpTime = np.array(precipInterpTime)
precipInterpAnomaly = precipInterp - np.mean(precipInterp, axis = 0)
som_shape = (2, 2) #SOM网络形状,其实就是分多少类
ClusterNums = int(som_shape[0]*som_shape[1])
som = MiniSom(som_shape[0], som_shape[1], precipInterpAnomaly.shape[1], sigma=.5, learning_rate=.5,
neighborhood_function='gaussian', random_seed=10)
# 随机初始化权重
som.random_weights_init(precipInterpAnomaly)
# 开始训练 SOM
num_iterations = 10000 # 迭代次数
som.train_random(precipInterpAnomaly, num_iterations)
# # 获得每个站点在 SOM 中的位置
# cluster_mappings = som.win_map(precipInterpAnomaly)
# each neuron represents a cluster
winner_coordinates = np.array([som.winner(x) for x in precipInterpAnomaly]).T
# with np.ravel_multi_index we convert the bidimensional
# coordinates to a monodimensional index
cluster_index = np.ravel_multi_index(winner_coordinates, som_shape)
#=====将每一类聚类 然后样本进行平均
cluster_means = []
for cluster_num in range(ClusterNums):
cluster_data = precipInterpAnomaly[cluster_index == cluster_num]
cluster_mean = np.mean(cluster_data, axis=0)
cluster_means.append(cluster_mean)
cluster_means = np.array(cluster_means)
LatStation = Data['precip'][0, :, 1]
LonStation = Data['precip'][0, :, 2]
for id1 in range(cluster_means.shape[0]):
SavePathEach = 'ClusterByMeanSamples%sof%s_ByMiniSom.png' % (id1+1, ClusterNums)
PlotStationData(cluster_means[id1], LatStation, LonStation, SavePathEach)
更多推荐
所有评论(0)