-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclustering_functions.py
More file actions
48 lines (40 loc) · 1.62 KB
/
clustering_functions.py
File metadata and controls
48 lines (40 loc) · 1.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import numpy as np
def distance(X, mu):
    """Return the squared Euclidean distance between X and mu.

    The two arguments are broadcast against each other and the squared
    differences are summed over the last (feature) axis.  No square root
    is taken, so the result is the *squared* distance; that is enough for
    nearest-centre comparisons because squaring preserves ordering.

    Parameters
    ----------
    X : array_like
        A single point of shape (n,) or several points of shape (m, n).
    mu : array_like
        A single centre of shape (n,) or several centres of shape (k, n).

    Returns
    -------
    numpy.ndarray or numpy scalar
        One squared distance per broadcast row, e.g. shape (k,) for a
        point of shape (n,) against centres of shape (k, n), or a scalar
        when both arguments are 1-D.
    """
    diff = np.subtract(X, mu)
    # Reduce over the last axis rather than a hard-coded axis 1: the result
    # is identical for 2-D inputs, and a 1-D point against a 1-D centre no
    # longer fails with an axis-out-of-bounds error.
    return np.sum(np.square(diff), axis=-1)
def findClosestCentres(X, mu):
    """Group the row indices of X by their nearest centre in mu.

    Parameters
    ----------
    X : numpy.ndarray
        Points, one per row (shape[0] is the number of points).
    mu : numpy.ndarray
        Centres, one per row (shape[0] is the number of centres).

    Returns
    -------
    list of list of int
        A list with one entry per centre; entry j holds, in ascending
        order, the indices i of the rows X[i] whose nearest centre is
        mu[j].  Centres that attract no points get an empty list.

    Raises
    ------
    ValueError
        Raised by numpy if there are points but no centres, or if every
        distance for a point is NaN (e.g. the point itself contains NaN).
    """
    m = X.shape[0]  # number of points
    k = mu.shape[0]  # number of centres

    # One independent bucket per centre ([[]] * k would alias a single list).
    C = [[] for _ in range(k)]

    for i in range(m):
        # distance() yields one value per centre for this point.
        # nanargmin returns the index of the smallest value (first
        # occurrence on ties) while skipping NaN entries, so a centre whose
        # coordinates are NaN is never selected instead of breaking the
        # lookup.
        nearest = np.nanargmin(distance(X[i], mu))
        C[nearest].append(i)
    return C
def updateCentres(X, C, prev_mu=None):
    """Recompute each centre as the mean of the points assigned to it.

    Parameters
    ----------
    X : numpy.ndarray
        Points, one per row; shape[1] is the number of features.
    C : list of list of int
        One entry per centre; entry i lists the row indices of X that
        belong to centre i.
    prev_mu : array_like, optional
        Previous centres, indexable by centre number.  When given, a
        centre with no assigned points keeps prev_mu[i].  When omitted,
        such a centre is returned as a row of NaN, because the mean of
        zero points is undefined.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(C), X.shape[1]) holding the new centres.
    """
    k = len(C)  # number of centres
    n = X.shape[1]  # number of features

    # Start from NaN so that rows which are never filled in are clearly
    # marked as "no mean available".
    mu = np.full((k, n), np.nan)

    for i, members in enumerate(C):
        # len() rather than truthiness so an index array works as well as
        # a plain list.
        if len(members) > 0:
            # Column-wise sum of the member rows divided by their count.
            mu[i] = np.divide(np.sum(X[members], axis=0), len(members))
        elif prev_mu is not None:
            # Empty cluster: keep the old centre instead of dividing 0 by 0.
            mu[i] = prev_mu[i]
    return mu