Algorithm
Previously, the membership function c(i,j):
c(i,j) = 1, if $x_i$ belongs to class j;
c(i,j) = 0, if $x_i$ does not to class j.
Here, we allow 0 <= c(i,j) <= 1, for a probabilistic description.
The error criterion generalizes the k-means error criterion:
$J = sumlimits_{i=1}^n sumlimits_{j=1}^k c(i,j)^m||x_i-u_j||^2$
$E = sumlimits_{i=1}^n |x_i-u_{c(i)}|^2$ (k-means)
m is the hyperparameter that controls how fuzzy the cluster will be.
The higher it is, the fuzzier the cluster will be in the end.
Given $u_j$, consider the membership values of $x_i$. It should be inversely related to $||x_i- u_j||$
for all i,j, $widehat{c}(i,j) = frac 1 {||x_i- u_j||^{frac 2 {m-1}}} $
for all i,j, normalize $widehat{c}(i,j)$ over $z_i$:
$c(i,j) = frac {widehat{c}(i,j)} {z_i} $ where $z_i = sum limits_{j=1}^k widehat{c}(i,j)$
Given the c(i,j), the new c-means are computed by:
for all j:
$u_j = frac{sumlimits_{i=1}^n c(i,j)^mx_i} {sumlimits_{i=1}^n c(i,j)^m}$
Implementation
def randomInt(low, high, size):
'''
return a list with all values are integers, no repeating, and with len equals to size.
'''
import numpy as np
ret = []
while(len(ret) < size):
v = np.random.randint(low, high)
if v not in ret:
ret.append(v)
return ret
def convertSoftToHardClustering(C, n, k):
'''
C: is a dictionary. {(0, 2): 0.9, ...} means x_0 belongs to class 2 with possibility 0.9
n is number of points in data
k is the number of Clusters
'''
import numpy as np
Labels = []
for i in range(n):
possible_class = {}
for j in range(k):
possible_class[j]= C.get((i,j), 0)
import operator
final_c = max(possible_class.items(), key=operator.itemgetter(1))[0]
final_c += 1
Labels.append(final_c)
return Labels
def normalizeP(P):
'''
P is a list
'''
Z = sum(P)
posibilities = [item/Z for item in P]
return posibilities
def initalstepforLloyd(Xt, k):
U = []
n,m = Xt.shape # n is number of points
idx = randomInt(0, n, k)
for i in idx:
U.append(Xt[i,:])
return U
def initalstepforKmeansPP(Xt, k, W = None):
'''
Xt: a row is a point
k: the number of clusters
here, k means that we need to choose len(U) = k
'''
n,m = Xt.shape # n is number of points
#Select u_0
idx = randomInt(0, n, 1)
U = [Xt[j,:] for j in idx]
# Select u_1,...u_{k-1}
for j in range(1,k):
# calculate the P
P = []
for i in range(n):
# calculate distance of each x_i to U
ds = []
for u in U:
d = sum((Xt[i,:] - u)**2)
ds.append(d)
posi = min(ds)
if(W != None):
posi = W[i]*posi
P.append(posi )
P = normalizeP(P)
#print(P)
# select u_j
idx_u_j = np.random.choice(n, 1, p=P)[0]
U.append(Xt[idx_u_j,:])
return U
def initalstepforKmeansParallelFirst(Xt, p, li):
'''
Xt: a row is a point
p: number of passes
li: a value for the oversampling factor.
'''
n,m = Xt.shape # n is number of points
#Select u_0
idx = randomInt(0, n, 1)
U = [Xt[j,:] for j in idx]
# p passes
for j in range(p):
# calculate the P
P = []
for i in range(n):
# calculate distance of each x_i to U
ds = []
for u in U:
d = sum((Xt[i,:] - u)**2)
ds.append(d)
P.append(min(ds))
P = normalizeP(P)
# select li points
indexes = np.random.choice(n, li, p=P)
for idx in indexes:
U.append(Xt[idx,:])
return U
def initalstepforKmeansParallelSecond(Xt, U1, k):
'''
U1: is obtained from the first step with len(U) >= k
We use weighted KmeansPP to select k from U1
COUNTS: the number of points that are cloasest to u_j.
We can obtain this by Kmeans given U with one iteration.
'''
import numpy as np
L,U,E = Kmeans(Xt, len(U1), 1, givenU=U1)
COUNTS = []
for j in range(1,len(U1)+1):
COUNTS.append(L.count(j))
W = normalizeP(COUNTS)
U1 = np.array(U1)
U = initalstepforKmeansPP(U1, k, W)
return U
def initalstepforKmeansParallel(Xt, k):
p = 5
li = int(k/2)
U1 = initalstepforKmeansParallelFirst(Xt, p, li)
U = initalstepforKmeansParallelSecond(Xt, U1, k)
# print('final U:', U)
return U
def Kmeans(Xt, k, r, initialFun=None, givenU=None):
DEBUG = False
import numpy as np
n,m = Xt.shape # n is number of points
C = {} #{(0, 0): 1, ...} means x_0 belongs to class 0
for iteration in range(r):
if DEBUG: print('----iteration:', iteration)
# 1. U
U = [] #[(5,1),...] means u_0 is (5,1)
if (iteration == 0):
# Select initial U
if(givenU == None):
U = initialFun(Xt, k)
else:
U = givenU
else: # calulate U
for j in range(k):
sum_j = 0
m_j = 0
for i in range(n):
if C.get((i,j), 0) == 1:
sum_j += Xt[i,:]
m_j += 1
U.append(tuple(sum_j/m_j))
if DEBUG: print('U:', U)
# 2. C
# 2.1 calculate distances |x_i - u_j|^2
#[[26.0, 36.0], ...] the first one means the distance of x_0 to all u_j in U
D = []
for i in range(n):
ds = []
for u in U:
d = sum((Xt[i,:] - u)**2)
ds.append(d)
D.append(ds)
if DEBUG: print('D:', D)
# 2.2 assign class
C = {}
for i in range(len(D)):
j = np.argmin(D[i])
C[i,j] = 1
if DEBUG: print('C:', C)
# convert C into labels
L = convertSoftToHardClustering(C, n, k)
E = sum([min(d) for d in D])
return (L,U,E)
def Cmeans(Xt, k, r, mm, initialFun=None, givenU=None):
DEBUG = False
import numpy as np
n,m = Xt.shape # n is number of points
C = {} #{(0, 0): 1, ...} means x_0 belongs to class 0
for iteration in range(r):
if DEBUG: print('----iteration:', iteration)
# 1. U
U = [] #[(5,1),...] means u_0 is (5,1)
if (iteration == 0):
# Select initial U
if(givenU == None):
U = initialFun(Xt, k)
else:
U = givenU
else: # calulate U
for j in range(k):# u_j
sum_j = 0
m_j = 0
for i in range(n):
sum_j += (C.get((i,j), 0)**mm)*Xt[i,:]
m_j += C.get((i,j), 0)**mm
U.append(tuple(sum_j/m_j))
# if DEBUG:
# print('U:', U)
# 2. C
# 2.1 calculate distances |x_i - u_j|^2
#[[26.0, 36.0], ...] the first one means the distance of x_0 to all u_j in U
D = {} # here, D is C^.
for i in range(n):
for j in range(k):
import math
d = sum((Xt[i,:] - U[j])**2)
if(d == 0):
D[i,j] = -1 # mark, in this case, C[i,j] = 1
else:
D[i,j] = (1/d)**(1/(mm-1))
if DEBUG: print('D:', D)
#for D[i,j]==-1
for i in range(n):
for j in range(k):
if(D[i,j]==-1):
for idx in range(k):
if(idx != j):
D[i,idx] = 0
else:
D[i,idx] = 1
if DEBUG: print('D^:', D)
# 2.2 assign class/normalize C^
C = {}
for i in range(n):
z_i = 0
for j in range(k):
z_i += D[i,j]
for j in range(k):
C[i,j] = D[i,j]/z_i
if DEBUG: print('C:', C)
# convert C into labels
# print('C', C)
L = convertSoftToHardClustering(C, n, k)
DD = []
for i in range(n):
ds = []
for j in range(k):
import math
# d = math.sqrt(sum((Xt[i,:] - U[j])**2))
d = sum((Xt[i,:] - U[j])**2)
ds.append(d)
dd = min(ds)
DD.append(dd)
E = sum(DD)
return (L,U, E)
Evaluation
def plotData(Xt):
'''
Xt: a row is a point
'''
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.scatter(Xt[:,0], Xt[:,1])
ax.axis('equal')
def plotClusters(Xt, L, U=None):
'''
Xt: a row is a point
L: labels coresponding to Xt
'''
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.axis('equal')
n,m = Xt.shape # n is number of points
Group = []
for j in range(k):
group_j = []
for i in range(n):
if L[i] == (j+1):
group_j.append(Xt[i,:])
Group.append(group_j)
for group in Group:
g = np.array(group)
nn= g.shape[0]
if(nn > 0):
ax.scatter(g[:,0], g[:,1], alpha=0.7)
if (U!=None):
U = np.array(U)
ax.scatter(U[:,0], U[:,1], marker='X', s=70, c='r')
# for test
# Load Data
import numpy as np
filename = "iris.data.txt"
Xt = np.genfromtxt(filename, delimiter=',', autostrip=True)
k = 3
r = 20 # for converge, in the future, I will remove this. Simply, repeat until converge
Iterations = 10 # choose the best one. For future evaluation.
#Kmeans
Es = []
Us = []
Ls = []
for i in range(Iterations):
L,U,E = Kmeans(Xt, k, r, initialFun=initalstepforLloyd)
Ls.append(L)
Us.append(U)
Es.append(E)
best = np.argmin(Es)
L = Ls[best]
U = Us[best]
E = Es[best]
for j in range(1,k+1):
print(j,':', L.count(j))
print(E)
plotClusters(Xt, L, U)
1 : 62
2 : 50
3 : 38
78.9408414261
#Kmeans++
Es = []
Us = []
Ls = []
for i in range(Iterations):
L,U,E = Kmeans(Xt, k, r, initialFun=initalstepforKmeansPP)
Ls.append(L)
Us.append(U)
Es.append(E)
best = np.argmin(Es)
L = Ls[best]
U = Us[best]
E = Es[best]
for j in range(1,k+1):
print(j,':', L.count(j))
print(E)
print('U', U)
plotClusters(Xt, L, U)
1 : 62
2 : 50
3 : 38
78.9408414261
U [(5.9016129032258071, 2.7483870967741941, 4.3935483870967751, 1.4338709677419357), (5.0059999999999993, 3.4180000000000006, 1.464, 0.24399999999999991), (6.8500000000000005, 3.0736842105263151, 5.7421052631578933, 2.0710526315789473)]
#Kmeans Parallel
Es = []
Us = []
Ls = []
for i in range(Iterations):
L,U,E = Kmeans(Xt, k, r, initialFun=initalstepforKmeansParallel)
Ls.append(L)
Us.append(U)
Es.append(E)
best = np.argmin(Es)
L = Ls[best]
U = Us[best]
E = Es[best]
for j in range(1,k+1):
print(j,':', L.count(j))
print(E)
print('U', U)
plotClusters(Xt, L, U)
1 : 62
2 : 50
3 : 38
78.9408414261
U [(5.9016129032258071, 2.7483870967741941, 4.3935483870967751, 1.4338709677419357), (5.0059999999999993, 3.4180000000000006, 1.464, 0.24399999999999991), (6.8500000000000005, 3.0736842105263151, 5.7421052631578933, 2.0710526315789473)]
#Cmeans
Es = []
Us = []
Ls = []
Iterations = 100
mm = 3
for i in range(Iterations):
L,U,E = Cmeans(Xt, k, r, mm, initialFun=initalstepforLloyd)
Ls.append(L)
Us.append(U)
Es.append(E)
best = np.argmin(Es)
L = Ls[best]
U = Us[best]
E = Es[best]
for j in range(1,k+1):
print(j,':', L.count(j))
print(E)
print('U', U)
plotClusters(Xt, L, U)
1 : 59
2 : 41
3 : 50
80.97541316
U [(5.9108592005116769, 2.7917436493396055, 4.3795234999340673, 1.3969150114682978), (6.695439882813317, 3.0376285801395135, 5.5518773819112255, 2.0356630177264541), (5.0010566452105811, 3.3893281471514598, 1.4942916006649354, 0.25196092517731689)]
#Cmeans
Es = []
Us = []
Ls = []
Iterations = 100
mm = 0.3
for i in range(Iterations):
L,U,E = Cmeans(Xt, k, r, mm, initialFun=initalstepforLloyd)
Ls.append(L)
Us.append(U)
Es.append(E)
best = np.argmin(Es)
L = Ls[best]
U = Us[best]
E = Es[best]
for j in range(1,k+1):
print(j,':', L.count(j))
print(E)
print('U', U)
plotClusters(Xt, L, U)
1 : 0
2 : 92
3 : 58
680.165925313
U [(5.8435799406544042, 3.0539119304103188, 3.7593304117474982, 1.1989357046657578), (5.8428172563321166, 3.054184542678152, 3.7572771469913873, 1.1981035040438766), (5.8436021692224127, 3.053903968367373, 3.759390280759455, 1.1989599678164726)]
For reproduction, please specify:GHWAN’s website » Fuzzy c-means Clustering
近期评论