## ---- message=FALSE, warning=FALSE, class.source = 'fold-hide'----------------

## Source required libraries
library(data.table)
library(tidyverse)
library(ggthemes)
library(ggrepel)
library(harmony)
library(patchwork)
library(tidyr)

## Useful util functions

cosine_normalize <- function(X, margin) {
    if (margin == 1) {
        res <- sweep(as.matrix(X), 1, sqrt(rowSums(X ^ 2)), '/')
        row.names(res) <- row.names(X)
        colnames(res) <- colnames(X)        
    } else {
        res <- sweep(as.matrix(X), 2, sqrt(colSums(X ^ 2)), '/')
        row.names(res) <- row.names(X)
        colnames(res) <- colnames(X)
    }
    return(res)
}

onehot <- function(vals) {
    t(model.matrix(~0 + as.factor(vals)))
}


colors_use <- c(`jurkat` = rgb(129, 15, 124, maxColorValue=255),
                `t293` = rgb(208, 158, 45, maxColorValue=255),
                `half` = rgb(0, 109, 44, maxColorValue=255))


do_scatter <- function(umap_use, meta_data, label_name, no_guides = TRUE, do_labels = TRUE, nice_names, 
                       palette_use = colors_use,
                       pt_size = 4, point_size = .5, base_size = 10, do_points = TRUE, do_density = FALSE, h = 4, w = 8) {
    umap_use <- umap_use[, 1:2]
    colnames(umap_use) <- c('X1', 'X2')
    plt_df <- umap_use %>% data.frame() %>% 
        cbind(meta_data) %>% 
        dplyr::sample_frac(1L) 
    plt_df$given_name <- plt_df[[label_name]]
    
    if (!missing(nice_names)) {
        plt_df %<>%
            dplyr::inner_join(nice_names, by = "given_name") %>% 
            subset(nice_name != "" & !is.na(nice_name))

        plt_df[[label_name]] <- plt_df$nice_name        
    }
        
    plt <- plt_df %>% 
        ggplot(aes(X1, X2, colour = .data[[label_name]], fill = .data[[label_name]])) + 
        theme_tufte(base_size = base_size) + 
        theme(panel.background = element_rect(fill = NA, color = "black")) + 
        guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4)), alpha = FALSE) +
        scale_color_manual(values = palette_use) + 
        scale_fill_manual(values = palette_use) +    
        theme(plot.title = element_text(hjust = .5)) + 
        labs(x = "UMAP 1", y = "UMAP 2") 
    
    if (do_points) 
        plt <- plt + geom_point(size = 0.2)
    if (do_density) 
        plt <- plt + geom_density_2d()    
        

    if (no_guides)
        plt <- plt + guides("none")
    
    if (do_labels) 
        plt <- plt + geom_label_repel(data = data.table(plt_df)[, .(X1 = mean(X1), X2 = mean(X2)), by = label_name], label.size = NA,
                                      aes(label = .data[[label_name]]), color = "white", size = pt_size, alpha = 1, segment.size = 0) + 
        guides(col = FALSE, fill = FALSE)
    return(plt)
}


## -----------------------------------------------------------------------------
data(cell_lines)
V <- cell_lines$scaled_pcs
V_cos <- cosine_normalize(V, 1)
meta_data <- cell_lines$meta_data

## ---- warning=FALSE, fig.width=5, fig.height=3, fig.align="center"------------
do_scatter(V, meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
    labs(title = 'Colored by dataset', x = 'PC1', y = 'PC2') +
do_scatter(V, meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) + 
    labs(title = 'Colored by cell type', x = 'PC1', y = 'PC2') +
NULL

## -----------------------------------------------------------------------------

set.seed(1)
harmonyObj <- harmony::RunHarmony(
    data_mat = V, ## PCA embedding matrix of cells
    meta_data = meta_data, ## dataframe with cell labels
    theta = 1, ## cluster diversity enforcement
    vars_use = 'dataset', ## variable to integrate out
    nclust = 5, ## number of clusters in Harmony model
    max_iter = 0, ## stop after initialization
    return_object = TRUE ## return the full Harmony model object
)


## ---- fig.width=5, fig.height=3, fig.align="center"---------------------------
do_scatter(t(harmonyObj$Z_orig), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
    labs(title = 'Z_orig', subtitle = 'Euclidean distance', x = 'PC1', y = 'PC2') +
do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
    labs(title = 'Z_cos', subtitle = 'Induced Cosine distance', x = 'PC1', y = 'PC2')


## ---- fig.width=8, fig.height=3, out.width="100%"-----------------------------

harmonyObj$Z_cos %>% t %>% data.frame() %>% 
    cbind(meta_data) %>% 
    tidyr::gather(key, val, X1:X20) %>% 
    ggplot(aes(reorder(gsub('X', 'PC', key), as.integer(gsub('X', '', key))), val)) + 
        geom_boxplot(aes(color = dataset)) + 
        scale_color_manual(values = colors_use) + 
        labs(x = 'PC number', y = 'PC embedding value', title = 'Z_cos (unit scaled PCA embeddings) for all 20 PCs') + 
        theme_tufte(base_size = 10) + geom_rangeframe() + 
        theme(axis.text.x = element_text(angle = 45, hjust = 1))

## ---- fig.width=4, fig.height=3, fig.align="center"---------------------------

cluster_centroids <- harmonyObj$Y

do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = FALSE, do_labels = FALSE) + 
    labs(title = 'Initial kmeans cluster centroids', subtitle = '', x = 'PC1', y = 'PC2') +
    geom_point(
        data = data.frame(t(cluster_centroids)), 
        color = 'black', fill = 'black', alpha = .8,
        shape = 21, size = 6
    ) +
NULL


## -----------------------------------------------------------------------------
cluster_assignment_matrix <- harmonyObj$R


## ---- fig.height=5, fig.width=5-----------------------------------------------
t(harmonyObj$Z_cos) %>% data.frame() %>%
    cbind(meta_data) %>% 
    tibble::rowid_to_column('id') %>% 
    dplyr::inner_join(
        cluster_assignment_matrix %>% t() %>% data.table() %>% 
            tibble::rowid_to_column('id') %>%
            tidyr::gather(cluster, r, -id) %>% 
            dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)), 
        by = 'id'
    ) %>% 
    dplyr::sample_frac(1L) %>% 
    ggplot(aes(X1, X2, color = r)) + 
        geom_point(size=0.2) + 
        theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 
        facet_grid(cluster ~ dataset) + 
        scale_color_gradient(low = 'lightgrey', breaks = seq(0, 1, .1)) + 
        labs(x = 'Scaled PC1', y = 'Scaled PC2', title = 'Initial probabilistic cluster assignments')

## -----------------------------------------------------------------------------
observed_counts <- with(harmonyObj, R %*% t(as.matrix(Phi)))
round(observed_counts)


## -----------------------------------------------------------------------------
## observed counts
round(harmonyObj$O)

## observed counts
round(harmonyObj$E)


## -----------------------------------------------------------------------------
phi_celltype <- onehot(meta_data$cell_type) 
observed_cell_counts <- harmonyObj$R %*% t(phi_celltype)
round(observed_cell_counts)


## -----------------------------------------------------------------------------
harmonyObj$max_iter_kmeans

## -----------------------------------------------------------------------------
## we can specify how many rounds of clustering to do
harmonyObj$max_iter_kmeans <- 10
harmonyObj$cluster_cpp()

## -----------------------------------------------------------------------------
round(harmonyObj$O)

## ---- fig.height=5, fig.width=5-----------------------------------------------
new_cluster_assignment_matrix <- harmonyObj$R

t(harmonyObj$Z_cos) %>% data.frame() %>%
    cbind(meta_data) %>% 
    tibble::rowid_to_column('id') %>% 
    dplyr::inner_join(
        new_cluster_assignment_matrix %>% t() %>% data.table() %>% 
            tibble::rowid_to_column('id') %>%
            tidyr::gather(cluster, r, -id) %>% 
            dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)), 
        by = 'id'
    ) %>% 
    dplyr::sample_frac(1L) %>% 
    ggplot(aes(X1, X2, color = r)) + 
        geom_point(shape = '.') + 
        theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 
        facet_grid(cluster ~ dataset) + 
        scale_color_gradient(low = 'lightgrey', breaks = seq(0, 1, .1)) + 
        labs(x = 'Scaled PC1', y = 'Scaled PC2', title = 'New probabilistic cluster assignments')

## -----------------------------------------------------------------------------
phi_celltype <- onehot(meta_data$cell_type)
observed_cell_counts <- harmonyObj$R %*% t(phi_celltype)
round(observed_cell_counts)

## -----------------------------------------------------------------------------
round(apply(prop.table(observed_cell_counts, 1), 1, min) * 100, 3)

## -----------------------------------------------------------------------------

with(harmonyObj, {
    distance_matrix <- 2 * (1 - t(Y) %*% Z_cos)
    distance_score <- exp(-distance_matrix / as.numeric(sigma))
    diversity_score <- sweep(E / O, 2, theta, '/') %*% as.matrix(Phi)
    ## new assignments are based on distance and diversity
    R_new <- distance_score * diversity_score  
    ## normalize R so each cell sums to 1
    R_new <- prop.table(R_new, 2)    
})


## -----------------------------------------------------------------------------
## with theta = 0
with(harmonyObj, {
    (E / O) ^ 0
})

## -----------------------------------------------------------------------------
## with theta = 1
with(harmonyObj, {
    round((E / O) ^ 1, 2)
})


## -----------------------------------------------------------------------------
## as theta approach infinity
with(harmonyObj, {
    round((E / O) ^ 1e6, 2)
})


## -----------------------------------------------------------------------------
Y_unscaled <- with(harmonyObj, Z_cos %*% t(R))

## -----------------------------------------------------------------------------
Y_new <- cosine_normalize(Y_unscaled, 2)

## -----------------------------------------------------------------------------
harmonyObj$moe_correct_ridge_cpp()

## ---- fig.width=5, fig.height=3, fig.align="center"---------------------------

do_scatter(cosine_normalize(t(harmonyObj$Z_orig), 1), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
    labs(title = 'Z_cos before MoE', x = 'PC1', y = 'PC2') +
do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
    labs(title = 'Z_cos after MoE', x = 'PC1', y = 'PC2')

## ---- fig.width=8, fig.height=3, fig.align="center", out.width="100%"---------

do_scatter(t(harmonyObj$Z_orig), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
    labs(title = 'Z_orig', subtitle = 'Original PCA embeddings', x = 'PC1', y = 'PC2') +
do_scatter(t(harmonyObj$Z_corr), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
    labs(title = 'Z_corr', subtitle = '= Z_orig - correction_factors', x = 'PC1', y = 'PC2') +
do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
    labs(title = 'Z_cos', subtitle = '= Unit_scaled(Z_corr)', x = 'Scaled PC1', y = 'Scaled PC2') +
NULL

## ---- fig.width=5, fig.height=3, fig.align="center"---------------------------

plt <- data.table(PC1_After = harmonyObj$Z_corr[1, ], PC1_Before = harmonyObj$Z_orig[1, ]) %>% 
    cbind(meta_data) %>% 
    dplyr::sample_frac(1L) %>% 
    ggplot(aes(PC1_Before, PC1_After)) + 
        geom_abline(slope = 1, intercept = 0) + 
        theme_tufte(base_size = 10) + geom_rangeframe() + 
        scale_color_tableau() + 
        guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4))) + 
        NULL

plt + geom_point(shape = '.', aes(color = dataset)) + 
        labs(x = 'PC1 before correction', y = 'PC1 after correction', 
             title = 'PC1 correction for each cell', subtitle = 'Colored by Dataset') + 
plt + geom_point(shape = '.', aes(color = cell_type)) + 
        labs(x = 'PC1 before correction', y = 'PC1 after correction', 
             title = 'PC1 correction for each cell', subtitle = 'Colored by Cell Type') + 
NULL


## ---- echo=TRUE---------------------------------------------------------------

W <- list()
## Convert sparse data structures to dense matrix
Phi.moe <- as.matrix(harmonyObj$Phi_moe)
lambda <- diag(c(harmonyObj$lambda))
## Get beta coeeficients for all the clusters
for (k in 1:harmonyObj$K) {
    W[[k]] <- solve(Phi.moe %*% diag(harmonyObj$R[k, ]) %*% t(Phi.moe) + lambda) %*% (Phi.moe %*% diag(harmonyObj$R[k, ])) %*% t(harmonyObj$Z_orig)
}


## ---- fig.width=5, fig.height=5-----------------------------------------------

cluster_assignment_matrix <- harmonyObj$R

t(harmonyObj$Z_orig) %>% data.frame() %>%
    cbind(meta_data) %>% 
    tibble::rowid_to_column('id') %>% 
    dplyr::inner_join(
        cluster_assignment_matrix %>% t() %>% data.table() %>% 
            tibble::rowid_to_column('id') %>%
            tidyr::gather(cluster, r, -id) %>% 
            dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)), 
        by = 'id'
    ) %>% 
    dplyr::sample_frac(1L) %>% 
    ggplot(aes(X1, X2, color = r)) + 
        geom_point(shape = 0.2) + 
        theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 
        facet_grid(cluster ~ dataset) + 
        scale_color_gradient(low = 'grey', breaks = seq(0, 1, .2)) + 
        labs(x = 'PC1', y = 'PC2', title = 'Cluster assigned in original PCA space (Z_orig)')


## -----------------------------------------------------------------------------
plt_list <- lapply(1:harmonyObj$K, function(k) {
    plt_df <- W[[k]] %>% data.frame() %>% 
        dplyr::select(X1, X2)
    ## Append n
    plt_df <- plt_df %>% 
        cbind(
            data.frame(t(matrix(unlist(c(c(0, 0), rep(plt_df[1, ], 3))), nrow = 2))) %>% 
                dplyr::rename(x0 = X1, y0 = X2) 
        ) %>%
        cbind(type = c('intercept', unique(meta_data$dataset)))
    plt <- plt_df %>% 
        ggplot() + 
            geom_point(aes(X1, X2),
                       data = t(harmonyObj$Z_orig) %>% data.frame(),
                       size = 0.5,
                       color = 'grey'
            ) + 
            geom_segment(aes(x = x0, y = y0, xend = X1 + x0, yend = X2 + y0, color = type), linewidth=1) + 
            scale_color_manual(values = c('intercept' = 'black', colors_use)) + 
            theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 
            labs(x = 'PC 1', y = 'PC 2', title = sprintf('Cluster %d', k))
    plt <- plt + guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16)))    
    # if (k == harmonyObj$K) {
    # } else {
    #     plt <- plt + guides(color = FALSE)
    # }
    plt
})


## ---- fig.height=6, fig.width=6-----------------------------------------------
Reduce(`+`, plt_list) + 
  patchwork::plot_annotation(title = 'Mixture of experts beta terms before correction (Z_orig)') + 
  plot_layout(ncol = 2)

## ---- fig.width=4, fig.height=3, fig.align="center"---------------------------

plt_list <- lapply(1:harmonyObj$K, function(k) {
    plt_df <- W[[k]] %>% data.frame() %>% 
        dplyr::select(X1, X2)

    plt_df <- plt_df %>% 
        cbind(
            data.frame(t(matrix(unlist(c(c(0, 0), rep(plt_df[1, ], 3))), nrow = 2))) %>% 
                dplyr::rename(x0 = X1, y0 = X2) 
        ) %>%
        cbind(type = c('intercept', unique(meta_data$dataset))) 

    plt <- plt_df %>% 
        ggplot() + 
            geom_point(aes(X1, X2),
                data = t(harmonyObj$Z_corr) %>% data.frame(),
                shape = '.', 
                color = 'grey'
            ) + 
            geom_segment(aes(x = x0, y = y0, xend = X1 + x0, yend = X2 + y0, color = type), linewidth=1) + 
            scale_color_manual(values = c('intercept' = 'black', colors_use)) + 
            theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 
            labs(x = 'PC 1', y = 'PC 2', title = sprintf('Cluster %d', k))
    plt <- plt + guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16)))
    plt
})


## ---- fig.height=6, fig.width=6-----------------------------------------------
Reduce(`+`, plt_list) + 
  patchwork::plot_annotation(title = 'Mixture of experts beta terms after correction (Z_corr)') + 
  plot_layout(ncol = 2)

## ---- echo=TRUE---------------------------------------------------------------

Z_i <- harmonyObj$Z_orig[, 5]
Z_i_pred <- Reduce(`+`, lapply(1:harmonyObj$K, function(k) {
    W[[k]] * harmonyObj$Phi_moe[, 5] * harmonyObj$R[k, 5]
})) %>% colSums


## ---- fig.width=4, fig.height=3, fig.align="center"---------------------------
data.table(obs = Z_i, pred = Z_i_pred) %>% 
    tibble::rowid_to_column('PC') %>% 
    ggplot(aes(obs, pred)) + 
        geom_point(shape = 21) + 
        geom_label_repel(aes(label = PC)) + 
        geom_abline(slope = 1, intercept = 0) + 
        theme_tufte() + geom_rangeframe() + 
        labs(x = 'Observed PC score', 'Predicted PC score', title = 'Observed and predicted values of PC scores\nfor cell 5') + 
        NULL        

## -----------------------------------------------------------------------------
delta <- Reduce(`+`, lapply(1:harmonyObj$K, function(k) {
    W[[k]][2:4, ] * harmonyObj$Phi[, 5] * harmonyObj$R[k, 5]
})) %>% colSums

Z_corrected <- harmonyObj$Z_orig[, 5] - delta


## ---- fig.width=3, fig.height=3, fig.align="center"---------------------------


harmonyObj$Z_orig %>% t %>% data.frame() %>% 
    ggplot(aes(X1, X2)) + 
        geom_point(shape = '.') + 
        geom_point(
            data = data.frame(t(harmonyObj$Z_orig[, 5, drop = FALSE])), 
            color = 'red'
        ) + 
        geom_segment(
            data = data.table(x0 = harmonyObj$Z_orig[1, 5], 
                              y0 = harmonyObj$Z_orig[2, 5], 
                              x1 = Z_corrected[1],
                              y1 = Z_corrected[2]), 
            aes(x = x0, y = y0, xend = x1, yend = y1),
            linewidth = 1,
            color = 'red', 
            arrow = arrow(length = unit(0.05, "npc"), type = 'closed')            
        ) + 
        theme_tufte(base_size = 10) + geom_rangeframe() + 
        labs(x = 'PC1', y = 'PC2', title = 'Correction of cell #5')


## -----------------------------------------------------------------------------

harmonyObj <- RunHarmony(
    data_mat = V, ## PCA embedding matrix of cells
    meta_data = meta_data, ## dataframe with cell labels
    theta = 1, ## cluster diversity enforcement
    vars_use = 'dataset', ## (list of) variable(s) we'd like to Harmonize out
    nclust = 50, ## number of clusters in Harmony model
    max_iter = 0, ## don't actually run Harmony, stop after initialization
    return_object = TRUE ## return the full Harmony model object, not just the corrected PCA matrix
)


## ---- message=FALSE, fig.width=5, fig.height=3, fig.align="center"------------

i <- 0

do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
    labs(title = sprintf('Round %d', i), subtitle = 'Colored by dataset', x = 'Scaled PC1', y = 'Scaled PC2') +
do_scatter(t(harmonyObj$Z_cos), meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) + 
    labs(title = sprintf('Round %d', i), subtitle = 'Colored by cell type', x = 'Scaled PC1', y = 'Scaled PC2') +
NULL

## ---- fig.width=5, fig.height=3, fig.align="center", message=FALSE------------

for (i in 1:2) {
    harmony:::harmonize(harmonyObj, 1)
    plt <- do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
        labs(title = sprintf('Round %d', i), subtitle = 'Colored by dataset', x = 'Scaled PC1', y = 'Scaled PC2') +
    do_scatter(t(harmonyObj$Z_cos), meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) + 
        labs(title = sprintf('Round %d', i), subtitle = 'Colored by cell type', x = 'Scaled PC1', y = 'Scaled PC2') +
    NULL
    plot(plt)
}
    

## -----------------------------------------------------------------------------
sessionInfo()