-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsequential_kmeans.py
More file actions
113 lines (100 loc) · 3.44 KB
/
sequential_kmeans.py
File metadata and controls
113 lines (100 loc) · 3.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import csv, time, random, math
import matplotlib.pyplot as plt
import pandas as pd
# Matplotlib color names for the scatter plot; kmeans() picks one per cluster by index.
colors = ['red', 'green', 'blue','yellow','black']
def eucl_distance(point_one, point_two):
    """Return the Euclidean distance between two points.

    Coordinates may be numbers or numeric strings (for example raw CSV
    fields); every coordinate is passed through ``float`` before use.

    Args:
        point_one: Sequence of coordinates.
        point_two: Sequence of coordinates with the same length.

    Returns:
        The distance as a float (``0.0`` for two empty points).

    Raises:
        ValueError: If the two points have different lengths.
    """
    if len(point_one) != len(point_two):
        # ValueError is a subclass of Exception, so broad handlers still catch it.
        raise ValueError("Error: non comparable points")
    squared_sum = 0.0
    for first, second in zip(point_one, point_two):
        delta = float(first) - float(second)
        squared_sum += delta * delta
    return math.sqrt(squared_sum)
def compare_center(initial_center, derived_center, dimensions, num_clusters, cutoff):
    """Count the clusters whose center moved less than ``cutoff``.

    Args:
        initial_center: Centers before the update, one point per cluster.
        derived_center: Centers after the update, in the same order.
        dimensions: Unused; kept so existing callers keep working.
        num_clusters: Number of leading center pairs to compare.
        cutoff: Distance below which a center counts as unchanged.

    Returns:
        The number of compared clusters whose old and new centers are
        strictly closer than ``cutoff``. Callers treat a result equal to
        ``num_clusters`` as convergence.

    Raises:
        ValueError: If the two center lists have different lengths.
    """
    if len(initial_center) != len(derived_center):
        # ValueError is a subclass of Exception, so broad handlers still catch it.
        raise ValueError("Error: non comparable points")
    return sum(
        1
        for index in range(num_clusters)
        if eucl_distance(initial_center[index], derived_center[index]) < cutoff
    )
def _assign_to_clusters(points, initial, num_clusters):
    """Group every point under the index of its nearest center in ``initial``."""
    clusters = [[] for _ in range(num_clusters)]
    for point in points:
        # Start from infinity so any finite distance wins; ties keep the
        # earliest center because the comparison is strict.
        best_index, best_dist = 0, float("inf")
        for index, candidate in enumerate(initial):
            dist = eucl_distance(point, candidate)
            if dist < best_dist:
                best_index, best_dist = index, dist
        clusters[best_index].append(point)
    return clusters


def _recompute_centers(clusters, initial, dimensions):
    """Return the coordinate-wise mean of each cluster as the new centers."""
    centers = []
    for index, members in enumerate(clusters):
        if not members:
            # An empty cluster has no mean; keep its previous center rather
            # than dragging it to the origin.
            centers.append([float(value) for value in initial[index]])
            continue
        totals = [0.0] * dimensions
        for member in members:
            for axis in range(dimensions):
                totals[axis] += float(member[axis])
        centers.append([total / len(members) for total in totals])
    return centers


def _plot_clusters(clusters):
    """Scatter-plot columns 4 and 8 of each cluster and save the figure."""
    for index, members in enumerate(clusters):
        # Convert to float so string fields are plotted on numeric axes
        # instead of being treated as categories.
        xs = [float(member[4]) for member in members]
        ys = [float(member[8]) for member in members]
        # Cycle through the palette so more clusters than colors still plot.
        plt.scatter(xs, ys, c=colors[index % len(colors)])
    plt.xlabel("Video Game Sales in North America")
    plt.ylabel("Video Game Sales around Globe")
    plt.title("Clustering Output")
    plt.savefig("clustering_data.png")


def kmeans(points, num_clusters, cutoff, initial, dimensions):
    """Run one k-means assignment/update step.

    Each point is assigned to its nearest center in ``initial``, new centers
    are computed as the mean of each cluster, and the new centers are
    compared with ``initial``.

    Args:
        points: Sequence of points; coordinates may be numbers or numeric
            strings.
        num_clusters: Number of clusters.
        cutoff: Movement threshold passed to ``compare_center``.
        initial: Current centers, one per cluster.
        dimensions: Number of leading coordinates averaged per point.

    Returns:
        ``(1, center)`` when every center moved less than ``cutoff``; in that
        case the clusters are also plotted and saved to
        ``clustering_data.png``. Otherwise ``(0, center)``. ``center`` is the
        list of newly computed centers.
    """
    clusters = _assign_to_clusters(points, initial, num_clusters)
    center = _recompute_centers(clusters, initial, dimensions)
    settled = compare_center(initial, center, dimensions, num_clusters, cutoff)
    if settled == num_clusters:
        _plot_clusters(clusters)
        return 1, center
    return 0, center
def main():
    """Cluster the rows of ``modified.csv`` and print the final centers.

    Reads the cluster count from standard input, loads the CSV (the first
    row is discarded, presumably a header), seeds the centers with the first
    ``num_clusters`` rows, and repeats ``kmeans`` until it reports
    convergence. Prints the final centers and the elapsed time.

    Raises:
        ValueError: If the cluster count is not an integer, the CSV has no
            data rows, or the count is outside ``1..number of rows``.
    """
    print("Enter the number of clusters you want to make:")
    num_clusters = int(input())

    # newline='' is what the csv module expects for files it reads.
    with open('modified.csv', 'r', newline='') as f:
        rows = [row for row in csv.reader(f) if row]
    data = rows[1:]  # drop the first row
    if not data:
        raise ValueError("modified.csv contains no data rows")
    if not 1 <= num_clusters <= len(data):
        raise ValueError(
            "number of clusters must be between 1 and %d" % len(data)
        )

    cutoff = 0.2
    dimensions = len(data[0])
    # Seed the centers with the first num_clusters data rows.
    initial = data[:num_clusters]

    # perf_counter is monotonic, so the elapsed time cannot go backwards.
    start_time = time.perf_counter()
    while True:
        val, center = kmeans(data, num_clusters, cutoff, initial, dimensions)
        if val == 1:
            break
        initial = center

    print("Final Centers are:")
    for final_center in center:
        print(final_center, "\n")
    print("Execution time %s seconds" % (time.perf_counter() - start_time))


if __name__ == "__main__":
    main()