clustering.cpp 6.47 KB

#include <iostream>
#include <fstream>
#include <float.h>

#include <Eigen/Core>
#include <Eigen/Eigenvalues>


#include "clustering.h"
#include "utils.h"


void complexClusterSpec (std::vector < Complexe > &comps,
                         std::vector < uint > &centroidIdsSpec,
                         std::vector < std::vector < uint > > &clusterIdsSpec) {

    // Clustering of the complexes
    // Create a file with the graphs of each complex
    std::ofstream graphFile;
    graphFile.open ("graphNSPDK.txt");
    for(size_t i = 0, size = comps.size(); i != size; i++) {
          graphFile << comps[i].to_graph(uint(i));
    }
    graphFile.close();


    // SPECTRAL CLUSTERING
    // Call NSPDK for kernel matrix computation
    system("~/Programme/test/crcpred10/src/NSPDK/NSPDK -R 3 -D 3 -gt DIRECTED -ok -fg graphNSPDK.txt > NSPDK_output.txt");

    // Recover kernel matrix
    dlib::matrix<double> K;
    readKernelMatrix(K, uint(comps.size()));

    std::vector < double > data;
    for(long i = 0; i < K.nr(); i++)
        for (long j = 0; j < K.nc(); j++)
            data.push_back(K(i,j));

    // Choose epsilon here
    // median
    double epsilon = 0;
    std::nth_element(data.begin(), data.begin() + (data.size() / 2), data.end());
    epsilon = data[(data.size() / 2)];

    // Choose the number of cluster
    long num_clusters = chooseKSpec(K, epsilon);

    // Perform Spectral clustering
    if(num_clusters > 1) {
        clusteringSpectral(K, num_clusters, centroidIdsSpec, clusterIdsSpec);
    } else {
        centroidIdsSpec.push_back(0);
        std::vector < uint > v = std::vector < uint > (0, K.nr());
        clusterIdsSpec.push_back(v);
    }
}

long chooseKSpec(const dlib::matrix<double> &K, double epsilon) {
    uint i, size, j, size2;
    // Choice of the number of clusters
    std::vector < std::vector < double > > adjacencyMatrix = std::vector < std::vector < double > > (ulong(K.nr()));
    std::vector < double > diagMatrix = std::vector < double > (ulong(K.nr()));
    // initialize adjacency matrix
    for(long i = 0; i < K.nr(); i++) {
        adjacencyMatrix[ulong(i)] = std::vector < double > (ulong(K.nr()));
    }
    // Build a epsilon graph from the data
    double degree = 0;
    for(long i = 0; i < K.nr(); i++) {
        degree = 0;
        for (long j = 0; j < K.nc(); j++) {
            if (i != j and K(i,j)*1000 > epsilon*1000){
                adjacencyMatrix[ulong(i)][ulong(j)] = K(i,j);
                adjacencyMatrix[ulong(j)][ulong(i)] = K(i,j);
                degree += K(i,j);
            } else if (i == j) {
                adjacencyMatrix[ulong(i)][ulong(j)] = 0;
            }
        }
        diagMatrix[ulong(i)] = degree;
    }
    // Laplacian matrix = diagonal matrix - adjacency matrix
    // diagonal matrix : degree of each node on the diagonal and 0 otherwise
    Eigen::MatrixXd laplacian(K.nr(), K.nr());
    for(i = 0, size = uint(adjacencyMatrix.size()); i != size; i++) {
        for (j = 0, size2 = uint(adjacencyMatrix[i].size()); j != size2; j++) {
            if (i == j) {
                laplacian(i,j) = diagMatrix[i] - adjacencyMatrix[i][j];
            } else {
                laplacian(i,j) = - adjacencyMatrix[i][j];
            }
        }
    }

    // Eigen values computation
    Eigen::EigenSolver < Eigen::MatrixXd > es;
    es.compute(laplacian, /* computeEigenvectors = */ false);
    std::vector < double > eigenValues;
    for (long i = 0, size = es.eigenvalues().size(); i != size; i++) {
        eigenValues.push_back(es.eigenvalues()(i).real());
    }
    /* sort the eigenValues */
    std::sort(eigenValues.begin(), eigenValues.end());
    /* find the largest gap between two consecutive eigenValues */
    double bestGap = 0;
    long num_clusters = 0;
    for (size_t i = 1, size = eigenValues.size(); i != size; i++) {
        if (std::abs(eigenValues[i-1] - eigenValues[i]) > bestGap) {
            bestGap = std::abs(eigenValues[i-1] - eigenValues[i]);
            num_clusters = long(i);
        }
    }
    return num_clusters;
}

void clusteringSpectral(dlib::matrix<double> &K, long num_clusters,
                        std::vector < uint > &centroidIdsSpec,
                        std::vector < std::vector < uint > > &clusterIdsSpec) {


    for (long r = 0; r < K.nr(); ++r)
        K(r,r) = 0;

    dlib::matrix<double,0,1> D(K.nr());
    for (long r = 0; r < K.nr(); ++r)
        D(r) = sum(rowm(K,r));
    D = sqrt(reciprocal(D));
    K = diagm(D)*K*diagm(D);

    dlib::matrix < double > u,w,v;
    // Use the normal SVD routine unless the matrix is really big, then use the fast
    // approximate version.
    if (K.nr() < 1000)
        svd3(K,u,w,v);
    else
        svd_fast(K,u,w,v, ulong(num_clusters)+100, 5);
    // Pick out the eigenvectors associated with the largest eigenvalues.
    rsort_columns(v,w);
    v = colm(v, dlib::range(0,num_clusters-1));
    // Now build the normalized spectral vectors, one for each input vector.
    std::vector<dlib::matrix<double,0,1> > spec_samps, centersSpec;
    for (long r = 0; r < v.nr(); ++r) {
        spec_samps.push_back(trans(rowm(v,r)));
        const double len = length(spec_samps.back());
        if (len != 0.0)
            spec_samps.back() /= len;
    }
    // Finally do the K-means clustering
    pick_initial_centers(num_clusters, centersSpec, spec_samps);
    find_clusters_using_kmeans(spec_samps, centersSpec);

    //std::cout << "apres spectral clustering, classes : " << std::endl;
    // And then compute the cluster assignments based on the output of K-means.
    clusterIdsSpec = std::vector < std::vector < uint > > (ulong(num_clusters));
    for (unsigned long i = 0; i < spec_samps.size(); ++i) {
        //std::cout << "Complex " << i << " class " << nearest_center(centersSpec, spec_samps[i]) << "\n";
        clusterIdsSpec[nearest_center(centersSpec, spec_samps[i])].push_back(uint(i));
    }

    //std::cout << "centroides : " << std::endl;
    // Pick the best representent of the cluster
    // The nearest point to each center
    for (size_t i = 0; i < centersSpec.size(); ++i) {
        double best_dist = length_squared(centersSpec[i] - spec_samps[0]);
        uint best_idx = 0;
        for (unsigned long j = 1; j < spec_samps.size(); ++j) {
            const double dist = length_squared(centersSpec[i] - spec_samps[j]);
            if (dist < best_dist) {
                best_dist = dist;
                best_idx = uint(j);
            }
        }
        centroidIdsSpec.push_back(best_idx);
        //std::cout << "Centroid #" << i << " Graph #" << best_idx << std::endl;
    }

}