// algos/src/k_clustering_big.rs

use std::collections::HashMap;
/// Graph described only by its node labels: entry `i` is the `u32` label of node `i`.
///
/// No edges are stored. `k_clustering_big` relates nodes to one another by
/// comparing their labels bit by bit.
pub type ImpliciteGraph = Vec<u32>;

/// Returns every value obtainable from `a` by flipping at most `n` of its low
/// 24 bits.
///
/// `a` itself is always part of the result. The returned vector is sorted in
/// ascending order and contains no duplicates.
fn neighbors(a: u32, n: usize) -> Vec<u32> {
    let mut reachable = vec![a];
    for _ in 0..n {
        // One round applies a single additional bit flip to every value found
        // so far.
        let mut expanded: Vec<u32> = reachable
            .iter()
            .flat_map(|&value| (0..24).map(move |bit| value ^ (1u32 << bit)))
            .collect();
        // Keep the values from earlier rounds, since fewer flips also count.
        expanded.extend_from_slice(&reachable);
        // Different flip orders reach the same value; collapse the repeats.
        expanded.sort_unstable();
        expanded.dedup();
        reachable = expanded;
    }
    reachable
}

/// Counts the clusters that remain after merging all nodes whose labels are
/// close to each other.
///
/// Every node starts in its own cluster. For each node, all labels produced by
/// `neighbors(label, 2)` (the label itself plus every label reachable by
/// flipping at most two bits) are looked up among the graph's labels, and the
/// clusters of the matching nodes are merged with the node's cluster. Nodes
/// that share an identical label therefore always end up in the same cluster.
///
/// # Arguments
///
/// * `g` - the node labels; index `i` holds the label of node `i`.
///
/// # Returns
///
/// The number of distinct clusters left once no further merge is possible.
/// An empty graph yields `0`.
pub fn k_clustering_big(g: &ImpliciteGraph) -> usize {
    // cluster_of[node] is the id of the cluster that currently holds `node`.
    let mut cluster_of: Vec<usize> = (0..g.len()).collect();
    // members[cluster] lists the nodes in that cluster. A cluster that has
    // been absorbed into another one is left behind as an empty vector.
    let mut members: Vec<Vec<usize>> = (0..g.len()).map(|id| vec![id]).collect();
    let mut cluster_count = g.len();

    // Index node ids by label so a candidate label resolves with one lookup.
    // Several nodes may carry the same label, hence the Vec of ids.
    let mut ids_by_label: HashMap<u32, Vec<usize>> = HashMap::with_capacity(g.len());
    for (node_id, &label) in g.iter().enumerate() {
        ids_by_label.entry(label).or_default().push(node_id);
    }

    for (node_a_id, &label) in g.iter().enumerate() {
        // Candidate labels are those two or fewer bit flips away from this
        // node's label; zero flips is included so equal labels are merged.
        for candidate in neighbors(label, 2) {
            if let Some(node_b_ids) = ids_by_label.get(&candidate) {
                for &node_b_id in node_b_ids {
                    // Re-read on every iteration: an earlier merge in this
                    // loop may have moved node a into a different cluster.
                    let cluster_a = cluster_of[node_a_id];
                    let cluster_b = cluster_of[node_b_id];
                    if cluster_a == cluster_b {
                        continue;
                    }

                    // Always fold the smaller cluster into the larger one so
                    // each node is relabelled only O(log n) times in total.
                    let (kept, absorbed) = if members[cluster_a].len() >= members[cluster_b].len() {
                        (cluster_a, cluster_b)
                    } else {
                        (cluster_b, cluster_a)
                    };

                    let mut moved = std::mem::take(&mut members[absorbed]);
                    for &node_id in &moved {
                        cluster_of[node_id] = kept;
                    }
                    members[kept].append(&mut moved);
                    cluster_count -= 1;
                }
            }
        }
    }

    cluster_count
}