KMeans 算法

KMeans最主要是思想是

  1. 播撒K个标识点,K是你要分成几类。
  2. 网络中所有点对这K个点计算距离,并归入最近的一个标识点中。这样就得到K个类
  3. 重新计算这K个类的中心距离。
  4. 是否要更新K个类的中心距离,是否迭代次数为0,如果要更新标识点的位置,并且迭代次数不为0,更新标识点,并跳转回2.

下面是为FPT

写的KMeans算法。这个FPT分裂时只要分成两类,所以就是K=2.单纯的KMeans没有像我写的这样复杂。我这里的距离是计算的两个位数组中的差异位数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
// only divide two parts
void KMeans2(FPN * current,FPN * &left, FPN * &right, const Record<N>& record)
{
// prepare
int k = 2;
int count = current->count+1;
KMeansNode<N,order,KHash> *nodes = new KMeansNode<N,order,KHash>[count];
int* classLabel = new int[count];
int* totalDistance = new int[count];
memset(totalDistance,0,count*sizeof(int));
memset(classLabel,-1,count*sizeof(int));
for(int i = 0; i < current->count; i++)
{
nodes[i].Set(current->bfilters[i],current->paths[i],current->branches[i],current->support[i]);
}
nodes[count - 1].Set(record,NULL);
// train
BloomFilter<N>* centers = new BloomFilter<N>[k];
int* cdistance = new int[k]; // the distance between the center and each node
memset(cdistance,0,k*sizeof(int));
for(int i =0 ; i < k; i++)
{
centers[i].Set(rand()%(int)pow(2.0,(double)N-1));
}
for(int x = 0; x < 500; x++)
{
// iterator
for(int i = 0; i < count; i++)
{
// for each node to compute the distance with centers
int min = N+1;
for(int j = 0; j < k; j++)
{
int distance = BloomFilter<N>::Distance(nodes[i].record.bfilter,centers[j]);
if(distance < min)
{
min = distance;
classLabel[i] = j;
}
}
}
// recompute the centers
// compute the distance between each node in the same class label
for(int i = 0; i < count; i++)
{
totalDistance[i] = 0;
for(int j=0; j < count; j++)
{
if(i != j && classLabel[i] == classLabel[j])
{
// the same class but not itself
totalDistance[i] += BloomFilter<N>::Distance(nodes[i].record.bfilter,nodes[j].record.bfilter);
}
}
}
// compute the distances between the center and each node
for(int i = 0; i < count; i++)
{
cdistance[classLabel[i]]+=BloomFilter<N>::Distance(centers[classLabel[i]],nodes[i].record.bfilter);
}
int flag = false;
for(int i = 0; i < count; i++)
{
if(cdistance[classLabel[i]] > totalDistance[i])
{
centers[i].Set(nodes[i].record.bfilter);
flag = true;
cdistance[classLabel[i]] = totalDistance[i];
}
}
// no update the center then break the loop
if(!flag)
{
break;
}
}
right = new FPN();
left = new FPN();
for(int i = 0; i < count; i++)
{
if(classLabel[i] == 0)
{
// insert left
left->Insert(nodes[i].record,nodes[i].branch);
}
else
{
// insert right
right->Insert(nodes[i].record,nodes[i].branch);
}
}
delete current;
delete [] nodes;
delete [] centers;
delete [] totalDistance;
delete [] classLabel;
delete [] cdistance;
}
  • 本文作者: 帐前卒
  • 本文链接: http://chillyc.info/2009/4104044/
  • 版权声明: 本博客所有文章除特别声明外,只能复制超链接地址,且必须注明出处!