## ----cache, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(cache=TRUE)

## ----echo=FALSE, message=FALSE, warning=FALSE---------------------------------
library(BiocStyle)

## ----load-libs, message=FALSE,  warning=FALSE---------------------------------
library(dplyr)
library(purrr)
library(tidyr)
library(scater)
library(muscat)
library(ggplot2)
library(patchwork)

## ----load-data, message=FALSE-------------------------------------------------
library(ExperimentHub)
eh <- ExperimentHub()
query(eh, "Kang")
(sce <- eh[["EH2259"]])

## ----prep-data----------------------------------------------------------------
sce <- sce[rowSums(counts(sce) > 0) > 0, ]
qc <- perCellQCMetrics(sce)
sce <- sce[, !isOutlier(qc$detected, nmads=2, log=TRUE)]
sce <- sce[rowSums(counts(sce) > 1) >= 10, ]
sce$id <- paste0(sce$stim, sce$ind)
sce <- prepSCE(sce, "cell", "id", "stim")
table(sce$cluster_id, sce$group_id)
table(sce$sample_id)

## ----pbs-det------------------------------------------------------------------
pb_sum <- aggregateData(sce,
    assay="counts", fun="sum",
    by=c("cluster_id", "sample_id"))
pb_det <- aggregateData(sce,
    assay="counts", fun="num.detected",
    by=c("cluster_id", "sample_id"))
t(head(assay(pb_det)))

## ----pbs-mds, fig.width=8, fig.height=4, fig.cap="Pseudobulk-level multidimensional scaling (MDS) plot based on (A) sum of counts and (B) sum of binarized counts (i.e., counting the number of detected features) in each cluster-sample."----
pbMDS(pb_sum) + ggtitle("Σ counts") +
pbMDS(pb_det) + ggtitle("# detected") +
plot_layout(guides="collect") +
plot_annotation(tag_levels="A") &
theme(legend.key.size=unit(0.5, "lines"))

## ----pbDD---------------------------------------------------------------------
res_DD <- pbDD(pb_det, min_cells=0, filter="none", verbose=FALSE)

## -----------------------------------------------------------------------------
tbl <- res_DD$table[[1]]
# one data.frame per cluster
names(tbl)

## -----------------------------------------------------------------------------
# view results for 1st cluster
k1 <- tbl[[1]]
head(format(k1[, -ncol(k1)], digits = 2))

## -----------------------------------------------------------------------------
# filter FDR < 5%, |logFC| > 1 & sort by adj. p-value
tbl_fil <- lapply(tbl, \(u)
    filter(u, 
        p_adj.loc < 0.05, 
        abs(logFC) > 1) |>
        arrange(p_adj.loc))

# nb. of DS genes & % of total by cluster
n_de <- vapply(tbl_fil, nrow, numeric(1))
p_de <- format(n_de / nrow(sce) * 100, digits = 3)
data.frame("#DD" = n_de, "%DD" = p_de, check.names = FALSE)

## -----------------------------------------------------------------------------
library(UpSetR)
de_gs_by_k <- map(tbl_fil, "gene")
upset(fromList(de_gs_by_k))

## ----pbDS---------------------------------------------------------------------
res_DS <- pbDS(pb_sum, min_cells=0, filter="none", verbose=FALSE)

## -----------------------------------------------------------------------------
res <- stagewise_DS_DD(res_DS, res_DD, verbose=FALSE)
head(res[[1]][[1]]) # results for 1st cluster

## -----------------------------------------------------------------------------
# for each approach, get adjusted p-values across clusters
ps <- map_depth(res, 2, \(df) {
    data.frame(
        df[, c("gene", "cluster_id")],
        p_adj.stagewise=df$p_adj,
        p_adj.DS=df$res_DS$p_adj.loc,
        p_adj.DD=df$res_DD$p_adj.loc)
}) |> 
    lapply(do.call, what=rbind) |>
    do.call(what=rbind) |>
    data.frame(row.names=NULL)
head(ps)

## ----fig.width=12, fig.height=4-----------------------------------------------
# for each approach & cluster, count number 
# of genes falling below 5% FDR threshold
ns <- lapply(seq(0, 0.2, 0.005), \(th) {
    ps |>
        mutate(th=th) |>
        group_by(cluster_id, th) |>
        summarise(
            .groups="drop",
            across(starts_with("p_"), 
            \(.) sum(. < th, na.rm=TRUE)))
}) |> 
    do.call(what=rbind) |>
    pivot_longer(starts_with("p_"))
ggplot(ns, aes(th, value, col=name)) + 
    geom_line(linewidth=0.8, key_glyph="point") +
    geom_vline(xintercept=0.05, lty=2, linewidth=0.4) +
    guides(col=guide_legend(NULL, override.aes=list(size=3))) +
    labs(x="FDR threshold", y="number of significantly\ndifferential genes") +
    facet_wrap(~cluster_id, scales="free_y", nrow=2) + 
    theme_bw() + theme(
        panel.grid.minor=element_blank(),
        legend.key.size=unit(0.5, "lines"))

## ----upset, fig.width = 5, fig.height = 3, fig.cap = "Upset plot of differential findings (FDR < 0.05) across DS, DD, and stagewise analysis for an exemplary cluster; shown are the 50 most frequent interactions."----
# subset adjuster p-values for cluster of interest
qs <- ps[grep("CD4", ps$cluster_id), grep("p_", names(ps))]
# for each approach, extract genes at 5% FDR threshold
gs <- apply(qs, 2, \(.) ps$gene[. < 0.05])
# visualize set intersections between approaches
UpSetR::upset(UpSetR::fromList(gs), order.by="freq")

## -----------------------------------------------------------------------------
# extract genes unique to stagewise testing
sw <- grep("stagewise", names(gs))
setdiff(gs[[sw]], unlist(gs[-sw]))

## ----session-info-------------------------------------------------------------
sessionInfo()