Resolution of curse of dimensionality (RECODE) is a noise reduction method for single-cell sequencing data. This vignette briefly demonstrates how to run RECODE on single-cell RNA-seq data and integrate the result into Seurat analysis pipeline.
if (!require(RECODE)) devtools::install_github("yusuke-imoto-lab/RECODE/R")
Though we use Seurat v4 in this tutorial, we have checked Seurat v3 also works.
library(Seurat)
library(Matrix)
library(RECODE)
library(ggplot2)
theme_set(theme(text = element_text(size = 18),
panel.background = element_rect(fill = "white", color = "gray20"),
panel.grid = element_line(color = "gray92"),
legend.key = element_rect(colour = NA, fill = NA)))
Sample data: For this tutorial, we use sample 10k Human PBMCs, 3’ v3.1, Chromium Controller (11,485 cells and 36,601 genes) in 10X Genomics Datasets. The test data is deposited as Feature / cell matrix HDF5 (filterd)
here (registration required).
seurat <- Read10X_h5("10k_PBMC_3p_nextgem_Chromium_Controller_filtered_feature_bc_matrix.h5")
## Warning in sparseMatrix(i = indices[] + 1, p = indptr[], x = as.numeric(x = counts[]), : 'giveCsparse' has been deprecated; setting 'repr = "T"' for you
seurat <- CreateSeuratObject(seurat)
seurat[["percent.mt"]] <- PercentageFeatureSet(seurat, pattern = "^MT-")
We apply RECODE on count matrix where each row is a gene and each column is a cell. The output is a denoised count matrix.
data <- as.matrix(seurat[["RNA"]]@counts)
data_RECODE <- RECODE(data)
## --START RECODE--
## I. Normalizing
## II. Projecting to PCA space
## III. Modifying eigenvalues
## ell = 166
## Reducing noise
## VI. Reversing to original space
## --END RECODE--
We store the denoised count matrix in seurat[["RECODE"]]@counts
.
seurat[["RECODE"]] <- CreateAssayObject(counts = Matrix(data_RECODE, sparse = TRUE))
DefaultAssay(seurat) <- "RECODE"
We remove low-quality cells using the following thresholds.
ngene_low <- 700
ngene_high <- 6000
mito_high <- 15
numi_low <- 1000
numi_high <- 25000
meta.data <- seurat@meta.data
meta.data$quality <- rep("Good", nrow(meta.data))
meta.data$quality[meta.data$nFeature_RNA<ngene_low|ngene_high<meta.data$nFeature_RNA] <- "Bad"
meta.data$quality[meta.data$nCount_RNA<numi_low|numi_high<meta.data$nCount_RNA] <- "Bad"
meta.data$quality[meta.data$percent.mt>mito_high] <- "Bad"
meta.data$quality <- factor(meta.data$quality)
p1 <- ggplot(meta.data, aes(x = nFeature_RNA, y = percent.mt, color = quality))+
geom_point(size = 0.1, show.legend = F)+
scale_color_manual(values = c("black", "red"))+
xlab("nGene")
p2 <- ggplot(meta.data, aes(x = nCount_RNA, y = nFeature_RNA, color = quality))+
geom_point(size = 0.1)+
scale_color_manual(values = c("black", "red"))+
xlab("nUMI")+
ylab("nGene")+
guides(color = guide_legend(override.aes = list(size = 5)))
p1+p2
VlnPlot(seurat, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
ncol = 3,
pt.size = 0)
seurat <- subset(seurat, subset = nFeature_RNA > ngene_low & nFeature_RNA < ngene_high & percent.mt < mito_high & nCount_RNA > numi_low & nCount_RNA < numi_high)
dim(seurat)
## [1] 36601 11005
Then, we perform library size normalization and run PCA on 2000 highly variable genes. To retain as much biological information as possible, We do not scale gene expression variance to 1 in this tutorial.
seurat <- NormalizeData(seurat)
seurat <- FindVariableFeatures(seurat, selection.method = "vst", nfeatures = 2000)
seurat <- ScaleData(seurat, features = rownames(seurat), do.scale = FALSE, do.center = TRUE)
## Centering data matrix
seurat <- RunPCA(seurat, features = VariableFeatures(seurat))
## PC_ 1
## Positive: CCL5, CTSW, GZMM, NKG7, CD247, GNLY, GZMA, KLRK1, CD8B, CST7
## KLRB1, PRF1, CD8A, KLRD1, HOPX, GZMH, KLRG1, SAMD3, C12orf75, LINC02446
## FGFBP2, PYHIN1, GZMK, TRDC, TRGC2, GZMB, LAIR2, MATK, NCR3, GRAP2
## Negative: S100A9, LYZ, S100A8, CST3, FOS, AIF1, VCAN, IFI30, HLA-DRA, LST1
## FTL, SERPINA1, SAT1, FCER1G, CD74, S100A12, TNFSF10, FTH1, MARCKS, HLA-DPA1
## CFD, HLA-DRB1, WARS, IFITM3, CPVL, AC020916.1, CALHM6, LGALS2, GBP1, MAFB
## PC_ 2
## Positive: HLA-DRA, IGHM, IGKC, CD74, HLA-DQA1, HLA-DRB1, CD79A, MS4A1, HLA-DPA1, HLA-DPB1
## CD79B, HLA-DRB5, HLA-DQB1, BANK1, IGHD, RALGPS2, BCL11A, TCL1A, HLA-DMA, LINC00926
## TNFRSF13C, IGLC2, AFF3, VPREB3, CD22, FCER2, TCF4, LINC02397, SWAP70, SPIB
## Negative: S100A9, S100A8, NKG7, GNLY, LYZ, CCL5, GZMA, VCAN, S100A12, FOS
## CTSW, CST7, GZMM, CD247, PRF1, KLRD1, KLRK1, GZMH, FCER1G, FGFBP2
## KLRB1, AC020916.1, KLRG1, AIF1, TRDC, HOPX, MEGF9, PLBD1, SPON2, CYP1B1
## PC_ 3
## Positive: S100A8, S100A12, S100A9, VCAN, LINC02446, CYP1B1, PADI4, FOS, SLC40A1, BEX3
## PLBD1, CKAP4, RBP7, MEGF9, AC020916.1, CD8B, THBS1, MGST1, RETN, F5
## BST1, QPCT, MCEMP1, S100B, CDA, RNASE2, CLEC4E, CRISPLD2, NFE2, GRAP2
## Negative: GNLY, NKG7, CCL5, GZMA, CST7, PRF1, FCGR3A, KLRD1, GZMH, FGFBP2
## GZMB, HLA-DPA1, HOPX, FCER1G, CTSW, HLA-DPB1, SPON2, CD74, CCL4, HLA-DRA
## HLA-DRB1, KLRF1, CLIC3, TRDC, KLRG1, LAIR2, ABI3, IFITM3, HLA-DQA1, KLRB1
## PC_ 4
## Positive: S100A8, S100A9, S100A12, VCAN, LYZ, FOS, IGKC, IGHM, AC020916.1, GNLY
## CYP1B1, ALOX5AP, NKG7, MS4A1, PLBD1, MEGF9, CD79A, RNASE6, CCL5, RBP7
## GZMA, BANK1, CES1, THBS1, CST7, IGHD, GAPT, PADI4, CKAP4, BASP1
## Negative: IFITM3, FCGR3A, CDKN1C, MS4A7, AIF1, WARS, TCF7L2, TNFSF10, GBP1, LST1
## APOBEC3A, CSF1R, CALHM6, TUBA1B, MAFB, SAT1, HLA-DPA1, HES4, FTH1, SERPINA1
## ISG15, SMIM25, ABI3, CFD, RRAS, CTSL, GPBAR1, LILRB1, HMOX1, CXCL10
## PC_ 5
## Positive: CD79A, MS4A1, CD79B, IGHM, TNFSF10, IGHD, LINC00926, MARCKS, BANK1, RALGPS2
## IFIT3, IFIT2, ISG15, TNFRSF13C, VPREB3, MS4A7, FCGR3A, IFITM3, FCER2, GBP1
## IGLC2, CALHM6, MTSS1, SWAP70, CD22, CCL5, PLPP5, IGKC, SAT1, IFIT1
## Negative: GZMB, PPP1R14B, CST3, PLD4, ITM2C, CCDC50, JCHAIN, LILRA4, UGCG, C12orf75
## IRF8, IRF7, TCF4, CLIC3, SERPINF1, CLEC4C, FCER1G, TPM2, MZB1, ALOX5AP
## SCT, APP, IL3RA, PTGDS, RNASE6, DNASE1L3, SMPD3, MAPKAPK2, SELENOS, LRRC26
seurat <- FindNeighbors(seurat, dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
seurat <- FindClusters(seurat, resolution = 1.2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 11005
## Number of edges: 390955
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8509
## Number of communities: 20
## Elapsed time: 0 seconds
seurat <- RunUMAP(seurat, dims = 1:30, seed.use = 42)
## 01:50:31 UMAP embedding parameters a = 0.9922 b = 1.112
## 01:50:31 Read 11005 rows and found 30 numeric columns
## 01:50:31 Using Annoy for neighbor search, n_neighbors = 30
## 01:50:31 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 01:50:32 Writing NN index file to temp file C:\Users\ashbi\AppData\Local\Temp\Rtmp4WvUPI\file600c40a814ba
## 01:50:32 Searching Annoy index using 1 thread, search_k = 3000
## 01:50:34 Annoy recall = 100%
## 01:50:35 Commencing smooth kNN distance calibration using 1 thread
## 01:50:36 Initializing from normalized Laplacian + noise
## 01:50:36 Commencing optimization for 200 epochs, with 477388 positive edges
## 01:50:45 Optimization finished
DimPlot(seurat, reduction = "umap", label = TRUE, label.size = 5)
DefaultAssay(seurat) <- "RECODE"
FeaturePlot(seurat, features = c("CD3D", "CD4", "CD8B", "CCR7", "GZMK", "PRF1", "KLRB1", "TRDC", "NKG7", "CD14", "FCGR3A", "CD19", "TCL1A", "FCER1A", "LILRA4"), ncol = 4)
use.switch <- function(x)
{
sapply(x, function(y){
switch(y,
"1" = "Naive CD8+ T cells",
"0" = "Naive CD4+ T cells",
"2" = "Memory CD4+ T cells",
"11" = "Central memory CD8+ T cells",
"8" = "Effector memory CD8+ T cells",
"15" = "gd T cells",
"10" = "Natural Killer cells",
"9" = "Memory B cells",
"12" = "Naive B cells",
"18" = "Naive B cells",
"13" = "Naive B cells",
"3" = "CD14+ monocytes",
"4" = "CD14+ monocytes",
"5" = "CD14+ monocytes",
"6" = "CD14+ monocytes",
"14" = "CD14+ monocytes",
"7" = "FCGR3A+ monocytes",
"16" = "Myeloid dendritic cells",
"17" = "Plasmacytoid dendritic cells",
"19" = "Doublets",
y
)
})
}
seurat$cluster <- as.character(seurat$seurat_clusters)
seurat$cluster <- use.switch(seurat$cluster)
seurat$cluster <- factor(seurat$cluster, levels = c("Naive CD8+ T cells", "Naive CD4+ T cells", "Memory CD4+ T cells", "Central memory CD8+ T cells", "Effector memory CD8+ T cells", "gd T cells", "Natural Killer cells", "Memory B cells", "Naive B cells", "CD14+ monocytes", "FCGR3A+ monocytes", "Myeloid dendritic cells", "Plasmacytoid dendritic cells", "Doublets"))
DimPlot(seurat, group.by = "cluster")
seurat <- NormalizeData(seurat, assay = "RNA")
genes <- c("CD14", "FCGR3A")
# Check gene expression value before RECODE
dat <- data.frame(A = seurat[["RNA"]]@data[genes[1], ],
B = seurat[["RNA"]]@data[genes[2], ],
cluster = seurat$cluster)
dat <- dat[(grep("monocytes", dat$cluster)), ]
p1 <- ggplot(dat, aes(x = A, y = B, color = cluster))+
geom_point(size = 0.7)+
xlab(genes[1])+
ylab(genes[2])+
guides(color = guide_legend(override.aes = list(size = 5)))+
ggtitle("Original")
# Check gene expression value after RECODE
dat <- data.frame(A = seurat[["RECODE"]]@data[genes[1], ],
B = seurat[["RECODE"]]@data[genes[2], ],
cluster = seurat$cluster)
dat <- dat[(grep("monocytes", dat$cluster)), ]
p2 <- ggplot(dat, aes(x = A, y = B, color = cluster))+
geom_point(size = 0.7)+
xlab(genes[1])+
ylab(genes[2])+
guides(color = guide_legend(override.aes = list(size = 5)))+
ggtitle("RECODE")
p1/p2
The distribution of two monocyte populations (CD14+ cells and FCGR3A+ cells) is clearly captured with RECODE.
genes <- c("CCR7", "S100A4")
# Check gene expression value before RECODE
dat <- data.frame(A = seurat[["RNA"]]@data[genes[1], ],
B = seurat[["RNA"]]@data[genes[2], ],
cluster = seurat$cluster)
dat <- dat[(grep("CD8", dat$cluster)), ]
p1 <- ggplot(dat, aes(x = A, y = B, color = cluster))+
geom_point(size = 0.7)+
xlab(genes[1])+
ylab(genes[2])+
guides(color = guide_legend(override.aes = list(size = 5)))+
ggtitle("Original")
# Check gene expression value after RECODE
dat <- data.frame(A = seurat[["RECODE"]]@data[genes[1], ],
B = seurat[["RECODE"]]@data[genes[2], ],
cluster = seurat$cluster)
dat <- dat[(grep("CD8", dat$cluster)), ]
p2 <- ggplot(dat, aes(x = A, y = B, color = cluster))+
geom_point(size = 0.7)+
xlab(genes[1])+
ylab(genes[2])+
guides(color = guide_legend(override.aes = list(size = 5)))+
ggtitle("RECODE")
p1/p2
This data includes three CD8+ T cell subsets: naive cells (CCR7+, S100A4-), central memory cells (CCR7+, S100A4+), and effector memory cells (CCR7-, S100A4+). This expression pattern is visible with RECODE.
While the original expression values are sparse and discontinuous due to random sampling effects, RECODE can reconstruct the continuous denoised data, which more accurately reflects the biological information. These two simple examples demonstrate the successful noise reduction by RECODE.
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=Japanese_Japan.932 LC_CTYPE=Japanese_Japan.932 LC_MONETARY=Japanese_Japan.932 LC_NUMERIC=C LC_TIME=Japanese_Japan.932
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_3.3.5 Matrix_1.3-4 SeuratObject_4.0.2 Seurat_4.0.4 RECODE_1.0.0 rmarkdown_2.8 RevoUtils_11.0.2
## [8] RevoUtilsMath_11.0.0
##
## loaded via a namespace (and not attached):
## [1] plyr_1.8.6 igraph_1.2.5 lazyeval_0.2.2 splines_4.0.2 listenv_0.8.0 scattermore_0.7 usethis_2.0.1
## [8] digest_0.6.25 htmltools_0.5.2 fansi_0.4.1 magrittr_2.0.1 memoise_2.0.0 tensor_1.5 cluster_2.1.0
## [15] ROCR_1.0-11 remotes_2.4.0 globals_0.14.0 matrixStats_0.61.0 spatstat.sparse_2.0-0 prettyunits_1.1.1 colorspace_1.4-1
## [22] ggrepel_0.8.2 xfun_0.23 dplyr_1.0.7 callr_3.7.0 crayon_1.4.1 jsonlite_1.7.0 spatstat.data_2.1-0
## [29] survival_3.1-12 zoo_1.8-8 glue_1.4.1 polyclip_1.10-0 gtable_0.3.0 leiden_0.3.9 pkgbuild_1.2.0
## [36] future.apply_1.8.1 abind_1.4-5 scales_1.1.1 DBI_1.1.1 miniUI_0.1.1.1 Rcpp_1.0.7 viridisLite_0.3.0
## [43] xtable_1.8-4 reticulate_1.16 spatstat.core_2.3-0 bit_4.0.4 htmlwidgets_1.5.4 httr_1.4.2 RColorBrewer_1.1-2
## [50] ellipsis_0.3.2 ica_1.0-2 pkgconfig_2.0.3 farver_2.0.3 sass_0.4.0 uwot_0.1.10 deldir_0.2-10
## [57] utf8_1.2.2 tidyselect_1.1.1 labeling_0.4.2 rlang_0.4.11 reshape2_1.4.4 later_1.3.0 munsell_0.5.0
## [64] tools_4.0.2 cachem_1.0.6 cli_3.0.1 generics_0.1.0 devtools_2.4.2 ggridges_0.5.3 evaluate_0.14
## [71] stringr_1.4.0 fastmap_1.1.0 yaml_2.2.1 goftest_1.2-2 processx_3.5.2 knitr_1.34 bit64_4.0.5
## [78] fs_1.5.0 fitdistrplus_1.1-5 purrr_0.3.4 RANN_2.6.1 pbapply_1.5-0 future_1.22.1 nlme_3.1-148
## [85] mime_0.11 hdf5r_1.3.4 compiler_4.0.2 plotly_4.9.4.1 png_0.1-7 testthat_3.0.4 spatstat.utils_2.2-0
## [92] tibble_3.0.3 bslib_0.3.0 stringi_1.4.6 highr_0.9 ps_1.6.0 RSpectra_0.16-0 desc_1.3.0
## [99] lattice_0.20-41 vctrs_0.3.8 pillar_1.6.2 lifecycle_1.0.0 spatstat.geom_2.2-2 lmtest_0.9-37 jquerylib_0.1.4
## [106] RcppAnnoy_0.0.18 data.table_1.12.8 cowplot_1.1.1 irlba_2.3.3 httpuv_1.6.3 patchwork_1.0.1 R6_2.4.1
## [113] promises_1.2.0.1 KernSmooth_2.23-17 gridExtra_2.3 parallelly_1.28.1 sessioninfo_1.1.1 codetools_0.2-16 MASS_7.3-51.6
## [120] assertthat_0.2.1 pkgload_1.2.2 rprojroot_2.0.2 withr_2.4.2 sctransform_0.3.2 mgcv_1.8-31 parallel_4.0.2
## [127] grid_4.0.2 rpart_4.1-15 tidyr_1.1.0 Rtsne_0.15 shiny_1.5.0