K-表示用Elbow方法，BIC，方差解释和轮廓选择K的非相干行为

我正在尝试使用K均值对具有90个特征的向量进行聚类。由于此算法询问我簇的数量，因此我想用一些不错的数学方法来验证我的选择。我希望有8到10个集群。功能按Z分数缩放。

肘法和方差解释

from scipy.spatial.distance import cdist, pdist
from sklearn.cluster import KMeans

K = range(1,50)
KM = [KMeans(n_clusters=k).fit(dt_trans) for k in K]
centroids = [k.cluster_centers_ for k in KM]

D_k = [cdist(dt_trans, cent, 'euclidean') for cent in centroids]
cIdx = [np.argmin(D,axis=1) for D in D_k]
dist = [np.min(D,axis=1) for D in D_k]
avgWithinSS = [sum(d)/dt_trans.shape[0] for d in dist]

# Total with-in sum of square
wcss = [sum(d**2) for d in dist]
tss = sum(pdist(dt_trans)**2)/dt_trans.shape[0]
bss = tss-wcss

kIdx = 10-1

# elbow curve
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(K, avgWithinSS, 'b*-')
ax.plot(K[kIdx], avgWithinSS[kIdx], marker='o', markersize=12, 
markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Average within-cluster sum of squares')
plt.title('Elbow for KMeans clustering')

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(K, bss/tss*100, 'b*-')
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Percentage of variance explained')
plt.title('Elbow for KMeans clustering')

从这两张图片看来，簇的数量从未停止：D。奇怪！肘在哪里？如何选择K？

贝叶斯信息准则

此方法直接来自X均值，并使用BIC选择簇数。另一个参考

    from sklearn.metrics import euclidean_distances
from sklearn.cluster import KMeans

def bic(clusters, centroids):
    num_points = sum(len(cluster) for cluster in clusters)
    num_dims = clusters[0][0].shape[0]
    log_likelihood = _loglikelihood(num_points, num_dims, clusters, centroids)
    num_params = _free_params(len(clusters), num_dims)
    return log_likelihood - num_params / 2.0 * np.log(num_points)


def _free_params(num_clusters, num_dims):
    return num_clusters * (num_dims + 1)


def _loglikelihood(num_points, num_dims, clusters, centroids):
    ll = 0
    for cluster in clusters:
        fRn = len(cluster)
        t1 = fRn * np.log(fRn)
        t2 = fRn * np.log(num_points)
        variance = _cluster_variance(num_points, clusters, centroids) or np.nextafter(0, 1)
        t3 = ((fRn * num_dims) / 2.0) * np.log((2.0 * np.pi) * variance)
        t4 = (fRn - 1.0) / 2.0
        ll += t1 - t2 - t3 - t4
    return ll

def _cluster_variance(num_points, clusters, centroids):
    s = 0
    denom = float(num_points - len(centroids))
    for cluster, centroid in zip(clusters, centroids):
        distances = euclidean_distances(cluster, centroid)
        s += (distances*distances).sum()
    return s / denom

from scipy.spatial import distance
def compute_bic(kmeans,X):
    """
    Computes the BIC metric for a given clusters

    Parameters:
    -----------------------------------------
    kmeans:  List of clustering object from scikit learn

    X     :  multidimension np array of data points

    Returns:
    -----------------------------------------
    BIC value
    """
    # assign centers and labels
    centers = [kmeans.cluster_centers_]
    labels  = kmeans.labels_
    #number of clusters
    m = kmeans.n_clusters
    # size of the clusters
    n = np.bincount(labels)
    #size of data set
    N, d = X.shape

    #compute variance for all clusters beforehand
    cl_var = (1.0 / (N - m) / d) * sum([sum(distance.cdist(X[np.where(labels == i)], [centers[0][i]], 'euclidean')**2) for i in range(m)])

    const_term = 0.5 * m * np.log(N) * (d+1)

    BIC = np.sum([n[i] * np.log(n[i]) -
               n[i] * np.log(N) -
             ((n[i] * d) / 2) * np.log(2*np.pi*cl_var) -
             ((n[i] - 1) * d/ 2) for i in range(m)]) - const_term

    return(BIC)



sns.set_style("ticks")
sns.set_palette(sns.color_palette("Blues_r"))
bics = []
for n_clusters in range(2,50):
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(dt_trans)

    labels = kmeans.labels_
    centroids = kmeans.cluster_centers_

    clusters = {}
    for i,d in enumerate(kmeans.labels_):
        if d not in clusters:
            clusters[d] = []
        clusters[d].append(dt_trans[i])

    bics.append(compute_bic(kmeans,dt_trans))#-bic(clusters.values(), centroids))

plt.plot(bics)
plt.ylabel("BIC score")
plt.xlabel("k")
plt.title("BIC scoring for K-means cell's behaviour")
sns.despine()
#plt.savefig('figures/K-means-BIC.pdf', format='pdf', dpi=330,bbox_inches='tight')

在此处输入图片说明

同样的问题在这里...什么是K？

轮廓

    from sklearn.metrics import silhouette_score

s = []
for n_clusters in range(2,30):
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(dt_trans)

    labels = kmeans.labels_
    centroids = kmeans.cluster_centers_

    s.append(silhouette_score(dt_trans, labels, metric='euclidean'))

plt.plot(s)
plt.ylabel("Silouette")
plt.xlabel("k")
plt.title("Silouette for K-means cell's behaviour")
sns.despine()

在此处输入图片说明

阿莱路亚！这似乎很有意义，这就是我的期望。但是为什么这与其他不同呢？

clustering k-means

— 马尔科德纳
source

要回答关于方差情况下的膝盖问题，它看起来大约是6或7，您可以将其想象为曲线的两个线性近似线段之间的断点。图的形状并不罕见，方差百分比通常会渐近地接近100％。我把K的你BIC图作为低一点，围绕5

— image_doctor

但是我在所有方法中都应该（或多或少）得到相同的结果，对吗？

— marcodena'7

我想我还不足以说。我非常怀疑这三种方法在数学上是否等同于所有数据，否则它们不会作为不同的技术存在，因此比较结果取决于数据。两种方法给出的簇数接近，第三种更高，但不是很大。您是否具有有关真实簇数的先验信息？

— image_doctor

我不确定100％，但是我希望有8到10个群集

— marcodena

您已经在“维数诅咒”的黑洞中。降维之前没有任何工作。

— Kasra Manshaei 2015年

Answers:

只需发布以上评论和更多想法的摘要，以便将该问题从“未回答的问题”中删除。

Image_doctor的评论是正确的，这些图是k均值的典型代表。（不过，我对“剪影”度量并不熟悉。）集群内方差预计会随着k的增加而连续下降。弯头是弯曲最大的地方。（如果您想要数学的话，也许会认为是“二阶导数”。）

通常，最好使用最终任务选择k。不要使用群集的统计量来做出决定，而要使用系统的端到端性能来指导您的选择。仅将统计信息用作起点。

— 约阿希姆·瓦格纳（Joachim Wagner）
source

通过计算连续段之间的角度，可以更轻松地找到弯头。

更换：

kIdx = 10-1

与：

seg_threshold = 0.95 #Set this to your desired target

#The angle between three points
def segments_gain(p1, v, p2):
    vp1 = np.linalg.norm(p1 - v)
    vp2 = np.linalg.norm(p2 - v)
    p1p2 = np.linalg.norm(p1 - p2)
    return np.arccos((vp1**2 + vp2**2 - p1p2**2) / (2 * vp1 * vp2)) / np.pi

#Normalize the data
criterion = np.array(avgWithinSS)
criterion = (criterion - criterion.min()) / (criterion.max() - criterion.min())

#Compute the angles
seg_gains = np.array([0, ] + [segments_gain(*
        [np.array([K[j], criterion[j]]) for j in range(i-1, i+2)]
    ) for i in range(len(K) - 2)] + [np.nan, ])

#Get the first index satisfying the threshold
kIdx = np.argmax(seg_gains > seg_threshold)

您会看到类似以下内容：

如果可视化seg_gains，您将看到类似以下内容：

我希望您现在可以找到棘手的肘:)

— 萨洛尔
source

我创建了一个Python 库，尝试实现Kneedle算法来检测此类函数中的最大曲率点。可以使用进行安装pip install kneed。

四种不同形状的函数的代码和输出：

from kneed.data_generator import DataGenerator
from kneed.knee_locator import KneeLocator

import numpy as np

import matplotlib.pyplot as plt

# sample x and y
x = np.arange(0,10)
y_convex_inc = np.array([1,2,3,4,5,10,15,20,40,100])
y_convex_dec = y_convex_inc[::-1]
y_concave_dec = 100 - y_convex_inc
y_concave_inc = 100 - y_convex_dec

# find the knee points
kn = KneeLocator(x, y_convex_inc, curve='convex', direction='increasing')
knee_yconvinc = kn.knee

kn = KneeLocator(x, y_convex_dec, curve='convex', direction='decreasing')
knee_yconvdec = kn.knee

kn = KneeLocator(x, y_concave_inc, curve='concave', direction='increasing')
knee_yconcinc = kn.knee

kn = KneeLocator(x, y_concave_dec, curve='concave', direction='decreasing')
knee_yconcdec = kn.knee

# plot
f, axes = plt.subplots(2, 2, figsize=(10,10));
yconvinc = axes[0][0]
yconvdec = axes[0][1]
yconcinc = axes[1][0]
yconcdec = axes[1][1]

yconvinc.plot(x, y_convex_inc)
yconvinc.vlines(x=knee_yconvinc, ymin=0, ymax=100, linestyle='--')
yconvinc.set_title("curve='convex', direction='increasing'")

yconvdec.plot(x, y_convex_dec)
yconvdec.vlines(x=knee_yconvdec, ymin=0, ymax=100, linestyle='--')
yconvdec.set_title("curve='convex', direction='decreasing'")

yconcinc.plot(x, y_concave_inc)
yconcinc.vlines(x=knee_yconcinc, ymin=0, ymax=100, linestyle='--')
yconcinc.set_title("curve='concave', direction='increasing'")

yconcdec.plot(x, y_concave_dec)
yconcdec.vlines(x=knee_yconcdec, ymin=0, ymax=100, linestyle='--')
yconcdec.set_title("curve='concave', direction='decreasing'");

— 凯文
source