谱聚类 java_谱聚类 – 张朝阳 – 博客园

  • Post author:
  • Post category:java


广义上来说,任何在算法中用到SVD/特征值分解的,都叫SpectralAlgorithm。顺便说一下,对于任意矩阵只存在奇异值分解,不存在特征值分解。对于正定的对称矩阵,奇异值就是特征值,奇异向量就是特征向量。

传统的聚类算法,如K-Means、EM算法都是建立在凸球形样本空间上,当样本空间不为凸时,算法会陷入局部最优,最终结果受初始参数的选择影响比较大。而谱聚类可以在任意形状的样本空间上聚类,且收敛于全局最优解。

谱聚类和CHAMELEON聚类很像,都是把样本点的相似度放到一个带权无向图中,采用“图划分”的方法进行聚类。只是谱聚类算法在进行图划分的时候发现计算量很大,转而求特征值去了,而且最后还在几个小特征向量组成的矩阵上进行了K-Means聚类。

Simply speaking,谱聚类算法分为3步:

构造一个N×N的权值矩阵W,Wij表示样本i和样本j的相似度,显然W是个对称矩阵。相似度的计算方法很多了,你可以用欧拉距离、街区距离、向量夹角、皮尔森相关系数等。并不是任意两个点间的相似度都要表示在图上,我们希望的权值图是比较稀疏的,有2种方法:权值小于阈值的认为是0;K最邻近方法,即每个点只和跟它最近的k个点连起来,CHAMELEON算法的第1阶段就是这么干的。再构造一个对角矩阵D,Dii为W第i列元素之和。最后构造矩阵L=D-W。可以证明L是个半正定和对称矩阵。

求L的前K小特征值对应的特征向量(这要用到奇异值分解了)。把K个特征向量放在一起构造一个N×K的矩阵M。

把M的每一行当成一个新的样本点,对这N个新的样本点进行K-Means聚类。

从文件读入样本点,最终算得矩阵L

#include

#include

#include”matrix.h”

#include”svd.h”

#define N 19//样本点个数

#define K 4//K-Means算法中的K

#define T 0.1//样本点之间相似度的阈值

double sample[N][2];//存放所有样本点的坐标(2维的)

void readSample(char *filename){

FILE *fp;

if((fp=fopen(filename,”r”))==NULL){

perror(“fopen”);

exit(0);

}

char buf[50]={0};

int i=0;

while(fgets(buf,sizeof(buf),fp)!=NULL){

char *w=strtok(buf,” \t”);

double x=atof(w);

w=strtok(NULL,” \t”);

double y=atof(w);

sample[i][0]=x;

sample[i][1]=y;

i++;

memset(buf,0x00,sizeof(buf));

}

assert(i==N);

fclose(fp);

}

double** getSimMatrix(){

//为二维矩阵申请空间

double **matrix=getMatrix(N,N);

//计算样本点两两之间的相似度,得到矩阵W

int i,j;

for(i=0;i

matrix[i][i]=1;

for(j=i+1;j

double dist=sqrt(pow(sample[i][0]-sample[j][0],2)+pow(sample[i][1]-sample[j][1],2));

double sim=1.0/(1+dist);

if(sim>T){

matrix[j][i]=sim;

matrix[i][j]=sim;

}

}

}

//计算L=D-W

for(j=0;j

double sum=0;

for(i=0;i

sum+=matrix[i][j];

if(i!=j)

matrix[i][j]=0-matrix[i][j];

}

matrix[j][j]=matrix[j][j]-sum;

}

return matrix;

}

int main(){

char *file=”/home/orisun/data”;

readSample(file);

double **L=getSimMatrix();

printMatrix(L,N,N);

double **M=singleVector(L,N,N,5);

printMatrix(M,N,5);

freeMatrix(L,N);

return 0;

}

L已是对称矩阵,直接奇异值分解的得到的就是特征向量

#ifndef _MATRIX_H

#define _MATRIX_H

#include

#include

#include

//初始化一个二维矩阵

double** getMatrix(int rows,int columns){

double **rect=(double**)calloc(rows,sizeof(double*));

int i;

for(i=0;i

rect[i]=(double*)calloc(columns,sizeof(double));

return rect;

}

//返回一个单位矩阵

double** getIndentityMatrix(int rows){

double** IM=getMatrix(rows,rows);

int i;

for(i=0;i

IM[i][i]=1.0;

return IM;

}

//返回一个矩阵的副本

double** copyMatrix(double** matrix,int rows,int columns){

double** rect=getMatrix(rows,columns);

int i,j;

for(i=0;i

for(j=0;j

rect[i][j]=matrix[i][j];

return rect;

}

//从一个一维矩阵得到一个二维矩阵

void getFromArray(double** matrix,int rows,int columns,double *arr){

int i,j,k=0;

for(i=0;i

for(j=0;j

matrix[i][j]=arr[k++];

}

}

}

//打印二维矩阵

void printMatrix(double** matrix,int rows,int columns){

int i,j;

for(i=0;i

for(j=0;j

printf(“%-10f\t”,matrix[i][j]);

}

printf(“\n”);

}

}

//释放二维矩阵

void freeMatrix(double** matrix,int rows){

int i;

for(i=0;i

free(matrix[i]);

free(matrix);

}

//获取二维矩阵的某一行

double* getRow(double **matrix,int rows,int columns,int index){

assert(index

double *rect=(double*)calloc(columns,sizeof(double));

int i;

for(i=0;i

rect[i]=matrix[index][i];

return rect;

}

//获取二维矩阵的某一列

double* getColumn(double **matrix,int rows,int columns,int index){

assert(index

double *rect=(double*)calloc(rows,sizeof(double));

int i;

for(i=0;i

rect[i]=matrix[i][index];

return rect;

}

//设置二维矩阵的某一列

void setColumn(double **matrix,int rows,int columns,int index,double *arr){

assert(index

int i;

for(i=0;i

matrix[i][index]=arr[i];

}

//交换矩阵的某两列

void exchangeColumn(double **matrix,int rows,int columns,int i,int j){

assert(i

assert(j

int row;

for(row=0;row

double tmp=matrix[row][i];

matrix[row][i]=matrix[row][j];

matrix[row][j]=tmp;

}

}

//得到矩阵的转置

double** getTranspose(double **matrix,int rows,int columns){

double **rect=getMatrix(columns,rows);

int i,j;

for(i=0;i

for(j=0;j

rect[i][j]=matrix[j][i];

}

}

return rect;

}

//计算两向量内积

double vectorProduct(double *vector1,double *vector2,int len){

double rect=0.0;

int i;

for(i=0;i

rect+=vector1[i]*vector2[i];

return rect;

}

//两个矩阵相乘

double** matrixProduct(double **matrix1,int rows1,int columns1,double **matrix2,int columns2){

double **rect=getMatrix(rows1,columns2);

int i,j;

for(i=0;i

for(j=0;j

double *vec1=getRow(matrix1,rows1,columns1,i);

double *vec2=getColumn(matrix2,columns1,columns2,j);

rect[i][j]=vectorProduct(vec1,vec2,columns1);

free(vec1);

free(vec2);

}

}

return rect;

}

//得到某一列元素的平方和

double getColumnNorm(double** matrix,int rows,int columns,int index){

assert(index

double* vector=getColumn(matrix,rows,columns,index);

double norm=vectorProduct(vector,vector,rows);

free(vector);

return norm;

}

//打印向量

void printVector(double* vector,int len){

int i;

for(i=0;i

printf(“%-15.8f\t”,vector[i]);

printf(“\n”);

}

#endif

#include”matrix.h”

#define ITERATION 100//单边Jacobi最大迭代次数

#define THREASHOLD 0.1

//符号函数

int sign(double number) {

if(number<0)

return -1;

else

return 1;

}

//两个向量进行单边Jacobi正交变换

void orthogonalVector(double *Ci,double *Cj,int len1,double *Vi,double *Vj,int len2,int *pass){

double ele=vectorProduct(Ci,Cj,len1);

if(fabs(ele)

return; //如果两列已经正交,不需要进行变换,则返回true

*pass=0;

double ele1=vectorProduct(Ci,Ci,len1);

double ele2=vectorProduct(Cj,Cj,len1);

double tao=(ele1-ele2)/(2*ele);

double tan=sign(tao)/(fabs(tao)+sqrt(1+pow(tao,2)));

double cos=1/sqrt(1+pow(tan,2));

double sin=cos*tan;

int row;

for(row=0;row

double var1=Ci[row]*cos+Cj[row]*sin;

double var2=Cj[row]*cos-Ci[row]*sin;

Ci[row]=var1;

Cj[row]=var2;

}

for(row=0;row

double var1=Vi[row]*cos+Vj[row]*sin;

double var2=Vj[row]*cos-Vi[row]*sin;

Vi[row]=var1;

Vj[row]=var2;

}

}

//矩阵的两列进行单边Jacobi正交变换。V是方阵,行/列数为columns

void orthogonal(double **matrix,int rows,int columns,int i,int j,int *pass,double **V){

assert(i

double* Ci=getColumn(matrix,rows,columns,i);

double* Cj=getColumn(matrix,rows,columns,j);

double* Vi=getColumn(V,columns,columns,i);

double* Vj=getColumn(V,columns,columns,j);

orthogonalVector(Ci,Cj,rows,Vi,Vj,columns,pass);

int row;

for(row=0;row

matrix[row][i]=Ci[row];

matrix[row][j]=Cj[row];

}

for(row=0;row

V[row][i]=Vi[row];

V[row][j]=Vj[row];

}

free(Ci);

free(Cj);

free(Vi);

free(Vj);

}

//循环正交,进行奇异值分解

void hestens_jacobi(double **matrix,int rows,int columns,double **V)

{

int iteration = ITERATION;

while (iteration– > 0) {

int pass = 1;

int i,j;

for (i = 0; i < columns; ++i) {

for (j = i+1; j < columns; ++j) {

orthogonal(matrix,rows,columns,i,j,&pass,V); //经过多次的迭代正交后,V就求出来了

}

}

if (pass==1) //当任意两列都正交时退出迭代

break;

}

printf(“迭代次数:%d\n”,ITERATION – iteration);

}

//获取矩阵前n小的奇异向量

double **singleVector(double **A,int rows,int columns,int n){

double **V=getIndentityMatrix(columns);

hestens_jacobi(A,rows,columns,V);

double *singular=(double*)calloc(columns,sizeof(double));//特征值

int i,j;

for(i=0;i

double *vector=getColumn(A,rows,columns,i);

double norm=sqrt(vectorProduct(vector,vector,rows));

singular[i]=norm;

}

int *sort=(int*)calloc(columns,sizeof(int));

for(i=0;i

sort[i]=i;

for(i=0;i

int minIndex=i;

int minValue=singular[i];

for(j=i+1;j

if(singular[j]

minValue=singular[j];

minIndex=j;

}

}

//交换sigular的第i个和第minIndex个元素

singular[minIndex]=singular[i];

singular[i]=minValue;

//交换sort的第i个和第minIndex个元素

int tmp=sort[minIndex];

sort[minIndex]=sort[i];

sort[i]=tmp;

}

double **rect=getMatrix(rows,n);

for(i=0;i

for(j=0;j

rect[i][j]=V[i][sort[j]];

}

}

freeMatrix(V,columns);

free(sort);

free(singular);

return rect;

}

最后是运行KMeans的Java代码

package ai;

public class Global {

//计算两个向量的欧氏距离

public static double calEuraDist(double[] arr1,double[] arr2,int len){

double result=0.0;

for(int i=0;i

result+=Math.pow(arr1[i]-arr2[i],2.0);

}

return Math.sqrt(result);

}

}

package ai;

public class DataObject {

String docname;

double[] vector;

int cid;

boolean visited;

public DataObject(int len){

vector=new double[len];

}

public String getName() {

return docname;

}

public void setName(String docname) {

this.docname = docname;

}

public double[] getVector() {

return vector;

}

public void setVector(double[] vector) {

this.vector = vector;

}

public int getCid() {

return cid;

}

public void setCid(int cid) {

this.cid = cid;

}

public boolean isVisited() {

return visited;

}

public void setVisited(boolean visited) {

this.visited = visited;

}

}

package ai;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileReader;

import java.io.IOException;

import java.util.ArrayList;

import java.util.Iterator;

public class DataSource {

ArrayList objects;

int row;

int col;

public void readMatrix(File dataFile) {

try {

FileReader fr = new FileReader(dataFile);

BufferedReader br = new BufferedReader(fr);

String line = br.readLine();

String[] words = line.split(“\\s+”);

row = Integer.parseInt(words[0]);

// row=1000;

col = Integer.parseInt(words[1]);

objects = new ArrayList(row);

for (int i = 0; i < row; i++) {

DataObject object = new DataObject(col);

line = br.readLine();

words = line.split(“\\s+”);

for (int j = 0; j < col; j++) {

object.getVector()[j] = Double.parseDouble(words[j]);

}

objects.add(object);

}

br.close();

} catch (IOException e) {

e.printStackTrace();

}

}

public void readRLabel(File file) {

try {

FileReader fr = new FileReader(file);

BufferedReader br = new BufferedReader(fr);

String line = null;

for (int i = 0; i < row; i++) {

line = br.readLine();

objects.get(i).setName(line.trim());

}

} catch (IOException e) {

e.printStackTrace();

}

}

public void printResult(ArrayList objects, int n) {

//DBScan是从第1类开始,K-Means是从第0类开始

//for (int i =0; i

for(int i=1;i<=n;i++){

System.out.println(“=============属于第”+i+”类的有:===========================”);

Iterator iter = objects.iterator();

while (iter.hasNext()) {

DataObject object = iter.next();

int cid=object.getCid();

if(cid==i){

System.out.println(object.getName());

//switch(Integer.parseInt(object.getName())/1000){

//case 0:

//System.out.println(0);

//break;

//case 1:

//System.out.println(1);

//break;

//case 2:

//System.out.println(2);

//break;

//case 3:

//System.out.println(3);

//break;

//case 4:

//System.out.println(4);

//break;

//case 5:

//System.out.println(5);

//break;

//default:

//System.out.println(“Go Out”);

//break;

//}

}

}

}

}

}

package ai;

import java.io.File;

import java.util.ArrayList;

import java.util.Iterator;

import java.util.Random;

public class KMeans {

int k; // 指定划分的簇数

double mu; // 迭代终止条件,当各个新质心相对于老质心偏移量小于mu时终止迭代

double[][] center; // 上一次各簇质心的位置

int repeat; // 重复运行次数

double[] crita; // 存放每次运行的满意度

public KMeans(int k, double mu, int repeat, int len) {

this.k = k;

this.mu = mu;

this.repeat = repeat;

center = new double[k][];

for (int i = 0; i < k; i++)

center[i] = new double[len];

crita = new double[repeat];

}

// 初始化k个质心,每个质心是len维的向量,每维均在left–right之间

public void initCenter(int len, ArrayList objects) {

Random random = new Random(System.currentTimeMillis());

int[] count = new int[k]; // 记录每个簇有多少个元素

Iterator iter = objects.iterator();

while (iter.hasNext()) {

DataObject object = iter.next();

int id = random.nextInt(10000)%k;

count[id]++;

for (int i = 0; i < len; i++)

center[id][i] += object.getVector()[i];

}

for (int i = 0; i < k; i++) {

for (int j = 0; j < len; j++) {

center[i][j] /= count[i];

}

}

}

// 把数据集中的每个点归到离它最近的那个质心

public void classify(ArrayList objects) {

Iterator iter = objects.iterator();

while (iter.hasNext()) {

DataObject object = iter.next();

double[] vector = object.getVector();

int len = vector.length;

int index = 0;

double neardist = Double.MAX_VALUE;

for (int i = 0; i < k; i++) {

double dist = Global.calEuraDist(vector, center[i], len); // 使用欧氏距离

if (dist < neardist) {

neardist = dist;

index = i;

}

}

object.setCid(index);

}

}

// 重新计算每个簇的质心,并判断终止条件是否满足,如果不满足更新各簇的质心,如果满足就返回true.len是数据的维数

public boolean calNewCenter(ArrayList objects, int len) {

boolean end = true;

int[] count = new int[k]; // 记录每个簇有多少个元素

double[][] sum = new double[k][];

for (int i = 0; i < k; i++)

sum[i] = new double[len];

Iterator iter = objects.iterator();

while (iter.hasNext()) {

DataObject object = iter.next();

int id = object.getCid();

count[id]++;

for (int i = 0; i < len; i++)

sum[id][i] += object.getVector()[i];

}

for (int i = 0; i < k; i++) {

if (count[i] != 0) {

for (int j = 0; j < len; j++) {

sum[i][j] /= count[i];

}

}

// 簇中不包含任何点,及时调整质心

else {

int a=(i+1)%k;

int b=(i+3)%k;

int c=(i+5)%k;

for (int j = 0; j < len; j++) {

center[i][j] = (center[a][j]+center[b][j]+center[c][j])/3;

}

}

}

for (int i = 0; i < k; i++) {

// 只要有一个质心需要移动的距离超过了mu,就返回false

if (Global.calEuraDist(sum[i], center[i], len) >= mu) {

end = false;

break;

}

}

if (!end) {

for (int i = 0; i < k; i++) {

for (int j = 0; j < len; j++)

center[i][j] = sum[i][j];

}

}

return end;

}

// 计算各簇内数据和方差的加权平均,得出本次聚类的满意度.len是数据的维数

public double getSati(ArrayList objects, int len) {

double satisfy = 0.0;

int[] count = new int[k];

double[] ss = new double[k];

Iterator iter = objects.iterator();

while (iter.hasNext()) {

DataObject object = iter.next();

int id = object.getCid();

count[id]++;

for (int i = 0; i < len; i++)

ss[id] += Math.pow(object.getVector()[i] – center[id][i], 2.0);

}

for (int i = 0; i < k; i++) {

satisfy += count[i] * ss[i];

}

return satisfy;

}

public double run(int round, DataSource datasource, int len) {

System.out.println(“第” + round + “次运行”);

initCenter(len,datasource.objects);

classify(datasource.objects);

while (!calNewCenter(datasource.objects, len)) {

classify(datasource.objects);

}

datasource.printResult(datasource.objects, k);

double ss = getSati(datasource.objects, len);

System.out.println(“加权方差:” + ss);

return ss;

}

public static void main(String[] args) {

DataSource datasource = new DataSource();

datasource.readMatrix(new File(“/home/orisun/test/dot.mat”));

datasource.readRLabel(new File(“/home/orisun/test/dot.rlabel”));

int len = datasource.col;

// 划分为4个簇,质心移动小于1E-8时终止迭代,重复运行7次

KMeans km = new KMeans(4, 1E-10, 7, len);

int index = 0;

double minsa = Double.MAX_VALUE;

for (int i = 0; i < km.repeat; i++) {

double ss = km.run(i, datasource, len);

if (ss < minsa) {

minsa = ss;

index = i;

}

}

System.out.println(“最好的结果是第” + index + “次。”);

}

}



版权声明:本文为weixin_42109178原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。