基于Python的实战聚类代码（Kmeans和SOM）

在肘部法则中，我们关注的是随着簇数增加，Inertia（簇内平方和）的变化情况。通常情况下，Inertia值越低表示数据点更接近其簇中心，但并非绝对。在肘部法则中，我们寻找的是Inertia急剧下降并趋于平缓的点，这个点可能是最佳的聚类数量。因此，对于肘部法则，我们要找到Inertia值下降幅度显著变小，即形成一个“肘部”样的点，这时候增加簇数不再显著地降低Inertia。

可乐要加冰_ice

1669人浏览 · 2023-11-27 12:17:11

可乐要加冰_ice · 2023-11-27 12:17:11 发布

（1）Kmeans

基于的是scikit-learn库中的K-means包，官网链接:

sklearn.cluster.KMeans — scikit-learn 1.3.2 documentation

直接上实战代码：

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import  xarray as xr
import pandas as pd
from scipy import interpolate
from sklearn.cluster import KMeans
#===========================读取数据
num_clusters = 3  #设定聚类的类别数
Data = np.load('Precip.npz') #这是自己生成的数据 可以自行替换
precip = Data['precip'][:,:,3] #聚类数据是2维 [samples, variables]
preciptime = Data['preciptime']
# ====实战的数据降水里面有部分数据是缺测 32700.00 插值一下 这部分可注释
precip[precip  >= 1000] = np.nan
precipInterp = []
precipInterpTime = []
for id1 in range(precip.shape[0]):
    precipEach = pd.DataFrame(precip[id1])
    precipEach= precipEach.interpolate(method='linear', limit_direction='both')
    precipEach_ = precipEach.values.squeeze()  # (712)
    if np.sum(np.isnan(precipEach_)) == 0:
        precipInterp.append(precipEach_)
        precipInterpTime.append(preciptime)
precipInterp = np.array(precipInterp)
precipInterpTime = np.array(precipInterpTime)
precipInterpAnomaly = precipInterp - np.mean(precipInterp, axis = 0)
precipInterpClimatology = np.mean(precipInterp, axis=0)
kmeans = KMeans(n_clusters=num_clusters, init='k-means++', n_init='auto', max_iter=300, tol=0.0001, verbose=0, random_state=6, copy_x=True, algorithm='lloyd')
kmeans.fit(precipInterpAnomaly)
# 获取聚类的中心
cluster_centers = kmeans.cluster_centers_
# 获取你要的数据最终聚类的类别
cluster_labels = kmeans.predict(precipInterpAnomaly)
#===下面这段代码是分析聚类的各类之间相关性，这部分可注释
#==计算两两相关
import seaborn as sns
correlation_matrix = np.corrcoef(cluster_centers)
# 绘制相关性矩阵
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix of Clusters')
plt.savefig('Correlation_%sCluster_Kmeans_Anomaly.png'%num_clusters)

对于Kmeans聚类，其中一个关键是聚类数K如何确定：（1）根据经验确定一个合适的K。（2）根据肘部法则：在肘部法则中，我们关注的是随着簇数增加，Inertia（簇内平方和）的变化情况。通常情况下，Inertia值越低表示数据点更接近其簇中心，但并非绝对。在肘部法则中，我们寻找的是Inertia急剧下降并趋于平缓的点，这个点可能是最佳的聚类数量。因此，对于肘部法则，我们要找到Inertia值下降幅度显著变小，即形成一个“肘部”样的点，这时候增加簇数不再显著地降低Inertia。

代码：

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import  xarray as xr
import pandas as pd
from scipy import interpolate

def GetKByInertia(ks):
    '''
    在肘部法则中，我们关注的是随着簇数增加，Inertia（簇内平方和）的变化情况。通常情况下，Inertia值越低表示数据点更接近其簇中心，但并非绝对。
    在肘部法则中，我们寻找的是Inertia急剧下降并趋于平缓的点，这个点可能是最佳的聚类数量。
    因此，对于肘部法则，我们要找到Inertia值下降幅度显著变小，即形成一个“肘部”样的点，这时候增加簇数不再显著地降低Inertia。
    ks: 从1-ks类中挑选一个最优的
    '''
    #===========================读取测站数据
    Data = np.load('Precip.npz') #站点降水单位 mm
    precip = Data['precip'][:,:,3]
    preciptime = Data['preciptime']
    # ====降水里面有部分数据是缺测 32700.00 插值一下
    precip[precip  >= 1000] = np.nan
    precipInterp = []
    precipInterpTime = []
    for id1 in range(precip.shape[0]):
        precipEach = pd.DataFrame(precip[id1])
        precipEach= precipEach.interpolate(method='linear', limit_direction='both')
        precipEach_ = precipEach.values.squeeze()  # (712)
        if np.sum(np.isnan(precipEach_)) == 0:
            precipInterp.append(precipEach_)
            precipInterpTime.append(preciptime)
    precipInterp = np.array(precipInterp)
    precipInterpTime = np.array(precipInterpTime)

    precipInterpAnomaly = precipInterp - np.mean(precipInterp, axis = 0)
    # 计算不同聚类数量下的簇内平方和（inertia）
    inertia = []
    for i in range(1, ks):  # 尝试不同数量的簇，这里设置为尝试 1 到 10 个簇
        t0 = time.time()
        kmeans = KMeans(n_clusters=i, n_init='auto')
        kmeans.fit(precipInterpAnomaly)
        inertia.append(kmeans.inertia_)
        t1 = time.time()
        print('%s is done! | 耗时%.3f' % (i, (t1-t0)/60))

    # 使用肘部法则找到最佳的聚类数量
    plt.plot(range(1, ks, 1), inertia)
    plt.xticks(range(1, ks, 1))

    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')
    plt.title('Elbow Method for Optimal k')
    plt.savefig('Kmeans_Inertia.png')

（2）SOM

自组织特征映射网络SOFM又称自组织映射网络SOM，是一种自组织竞争神经网络，一个神经网络接受外界输入模式时，将会分为不同的对应区域，各区域对输入模式具有不同的响应特征，而且这个过程是自动完成的。

本代码是基于minisom完成的，其官网为：https://github.com/JustGlowing/minisom/tree/master/examples

跟Kmeans的数据一样，直接上代码：

if __name__ == "__main__":
    import numpy as np
    import xarray as xr
    import pandas as pd
    from scipy import interpolate
    from datetime import datetime, timedelta
    from minisom import MiniSom
    import matplotlib.pyplot as plt

    #===========================读取测站数据
    Data = np.load('Precip.npz') #站点降水单位 mm
    precip = Data['precip'][:,:,3]
    preciptime = Data['preciptime']
    # ====降水里面有部分数据是缺测 32700.00 插值一下
    precip[precip  >= 1000] = np.nan
    precipInterp = []
    precipInterpTime = []
    for id1 in range(precip.shape[0]):
        precipEach = pd.DataFrame(precip[id1])
        precipEach= precipEach.interpolate(method='linear', limit_direction='both')
        precipEach_ = precipEach.values.squeeze()  # (712)
        if np.sum(np.isnan(precipEach_)) == 0:
            precipInterp.append(precipEach_)
            precipInterpTime.append(preciptime)
    precipInterp = np.array(precipInterp)
    precipInterpTime = np.array(precipInterpTime)
    precipInterpAnomaly = precipInterp - np.mean(precipInterp, axis = 0)

    som_shape = (2, 2) #SOM网络形状，其实就是分多少类
    ClusterNums = int(som_shape[0]*som_shape[1])
    som = MiniSom(som_shape[0], som_shape[1], precipInterpAnomaly.shape[1], sigma=.5, learning_rate=.5,
                  neighborhood_function='gaussian', random_seed=10)
    # 随机初始化权重
    som.random_weights_init(precipInterpAnomaly)
    # 开始训练 SOM
    num_iterations = 10000  # 迭代次数
    som.train_random(precipInterpAnomaly, num_iterations)
    # # 获得每个站点在 SOM 中的位置
    # cluster_mappings = som.win_map(precipInterpAnomaly)
    # each neuron represents a cluster
    winner_coordinates = np.array([som.winner(x) for x in precipInterpAnomaly]).T
    # with np.ravel_multi_index we convert the bidimensional
    # coordinates to a monodimensional index
    cluster_index = np.ravel_multi_index(winner_coordinates, som_shape)

    #=====将每一类聚类 然后样本进行平均
    cluster_means = []
    for cluster_num in range(ClusterNums):
        cluster_data = precipInterpAnomaly[cluster_index == cluster_num]
        cluster_mean = np.mean(cluster_data, axis=0)
        cluster_means.append(cluster_mean)
    cluster_means = np.array(cluster_means)
    LatStation = Data['precip'][0, :, 1]
    LonStation = Data['precip'][0, :, 2]
    for id1 in range(cluster_means.shape[0]):
        SavePathEach = 'ClusterByMeanSamples%sof%s_ByMiniSom.png' % (id1+1, ClusterNums)
        PlotStationData(cluster_means[id1], LatStation, LonStation, SavePathEach)