Seurat单细胞处理流程之三:降维&注释

rm(list = ls())
setwd("/mnt/DEV_8T/zhaozm/seurat全流程/降维注释")
## 设置保存的目录
## 数据目录
data_dir <- "/mnt/DEV_8T/zhaozm/seurat全流程/降维注释/data"
if (!dir.exists(data_dir)) {
  dir.create(data_dir, recursive = TRUE)
}
## 图片目录
img_dir <- "/mnt/DEV_8T/zhaozm/seurat全流程/降维注释/img"

if (!dir.exists(img_dir)) {
  dir.create(img_dir, recursive = TRUE)
}
library(Seurat)
library(dplyr)
library(readr)
library(Matrix)
library(ggplot2)
library(patchwork)
library(tidyverse) 
library(tidydr) 
library(RColorBrewer) 
library(scales) 
library(ggpubr) 
library(ggplotify) 
library(pheatmap) 
library(reshape2) 
library(gplots)
set.seed(123)
载入需要的程序包:SeuratObject

载入需要的程序包:sp


载入程序包:‘SeuratObject’


The following objects are masked from ‘package:base’:

    intersect, t



载入程序包:‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ lubridate 1.9.4     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ tidyr::expand() masks Matrix::expand()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ tidyr::pack()   masks Matrix::pack()
✖ tidyr::unpack() masks Matrix::unpack()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

载入程序包:‘scales’


The following object is masked from ‘package:purrr’:

    discard


The following object is masked from ‘package:readr’:

    col_factor



载入程序包:‘reshape2’


The following object is masked from ‘package:tidyr’:

    smiths



载入程序包:‘gplots’


The following object is masked from ‘package:stats’:

    lowess
# 读取上一步的rds文件
pbmc <- read_rds(file = "../数据质控/pbmc_f.rds")
#将表达数据标准化
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)

#寻找高变基因
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)

top10 <- head(VariableFeatures(pbmc), 10)#查看十个程度最强的高变基因

plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)

plot1 + plot2

Normalizing layer: counts

Finding variable features for layer counts

When using repel, set xnudge and ynudge to 0 for optimal results

Warning message in scale_x_log10():
“log-10 transformation introduced infinite values.”
Warning message in scale_x_log10():
“log-10 transformation introduced infinite values.”

sc_ann_2_1.png

#scale全部基因
pbmc <- ScaleData(pbmc, features = rownames(pbmc))
Centering and scaling data matrix
## 执行PCA降维
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
## 保存
write_rds(x = pbmc,file = "./data/pbmc_pca.rds")
PC_ 1 
Positive:  CST3, TYROBP, LST1, AIF1, FTL, FTH1, LYZ, FCN1, S100A9, TYMP 
	   FCER1G, CFD, LGALS1, S100A8, CTSS, LGALS2, SERPINA1, IFITM3, SPI1, CFP 
	   PSAP, IFI30, SAT1, COTL1, S100A11, NPC2, GRN, LGALS3, GSTP1, PYCARD 
Negative:  MALAT1, LTB, IL32, IL7R, CD2, B2M, ACAP1, CD27, STK17A, CTSW 
	   CD247, GIMAP5, AQP3, CCL5, SELL, TRAF3IP3, GZMA, MAL, CST7, ITM2A 
	   MYC, GIMAP7, HOPX, BEX2, LDLRAP1, GZMK, ETS1, ZAP70, TNFAIP8, RIC3 
PC_ 2 
Positive:  CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1, HLA-DRA, LINC00926, CD79B, HLA-DRB1, CD74 
	   HLA-DMA, HLA-DPB1, HLA-DQA2, CD37, HLA-DRB5, HLA-DMB, HLA-DPA1, FCRLA, HVCN1, LTB 
	   BLNK, P2RX5, IGLL5, IRF8, SWAP70, ARHGAP24, FCGR2B, SMIM14, PPP1R14A, C16orf74 
Negative:  NKG7, PRF1, CST7, GZMB, GZMA, FGFBP2, CTSW, GNLY, B2M, SPON2 
	   CCL4, GZMH, FCGR3A, CCL5, CD247, XCL2, CLIC3, AKR1C3, SRGN, HOPX 
	   TTC38, APMAP, CTSC, S100A4, IGFBP7, ANXA1, ID2, IL32, XCL1, RHOC 
PC_ 3 
Positive:  HLA-DQA1, CD79A, CD79B, HLA-DQB1, HLA-DPB1, HLA-DPA1, CD74, MS4A1, HLA-DRB1, HLA-DRA 
	   HLA-DRB5, HLA-DQA2, TCL1A, LINC00926, HLA-DMB, HLA-DMA, CD37, HVCN1, FCRLA, IRF8 
	   PLAC8, BLNK, MALAT1, SMIM14, PLD4, P2RX5, IGLL5, LAT2, SWAP70, FCGR2B 
Negative:  PPBP, PF4, SDPR, SPARC, GNG11, NRGN, GP9, RGS18, TUBB1, CLU 
	   HIST1H2AC, AP001189.4, ITGA2B, CD9, TMEM40, PTCRA, CA2, ACRBP, MMD, TREML1 
	   NGFRAP1, F13A1, SEPT5, RUFY1, TSC22D1, MPP1, CMTM5, RP11-367G6.3, MYL9, GP1BA 
PC_ 4 
Positive:  HLA-DQA1, CD79B, CD79A, MS4A1, HLA-DQB1, CD74, HIST1H2AC, HLA-DPB1, PF4, SDPR 
	   TCL1A, HLA-DRB1, HLA-DPA1, HLA-DQA2, PPBP, HLA-DRA, LINC00926, GNG11, SPARC, HLA-DRB5 
	   GP9, AP001189.4, CA2, PTCRA, CD9, NRGN, RGS18, CLU, TUBB1, GZMB 
Negative:  VIM, IL7R, S100A6, IL32, S100A8, S100A4, GIMAP7, S100A10, S100A9, MAL 
	   AQP3, CD2, CD14, FYB, LGALS2, GIMAP4, ANXA1, CD27, FCN1, RBP7 
	   LYZ, S100A11, GIMAP5, MS4A6A, S100A12, FOLR3, TRABD2A, AIF1, IL8, IFI6 
PC_ 5 
Positive:  GZMB, NKG7, S100A8, FGFBP2, GNLY, CCL4, CST7, PRF1, GZMA, SPON2 
	   GZMH, S100A9, LGALS2, CCL3, CTSW, XCL2, CD14, CLIC3, S100A12, RBP7 
	   CCL5, MS4A6A, GSTP1, FOLR3, IGFBP7, TYROBP, TTC38, AKR1C3, XCL1, HOPX 
Negative:  LTB, IL7R, CKB, VIM, MS4A7, AQP3, CYTIP, RP11-290F20.3, SIGLEC10, HMOX1 
	   LILRB2, PTGES3, MAL, CD27, HN1, CD2, GDI2, CORO1B, ANXA5, TUBA1B 
	   FAM110A, ATP1A1, TRADD, PPA1, CCDC109B, ABRACL, CTD-2006K23.1, WARS, VMO1, FYB 
# 定义数据集的“维度”
#NOTE: This process can take a long time for big datasets, comment out for expediency. 
#More approximate techniques such as those implemented in ElbowPlot() can be used to reduce computation time
pbmc <- JackStraw(pbmc, num.replicate = 100,dims = 30)
pbmc <- ScoreJackStraw(pbmc, dims = 1:30)
pdf("./img/elbow_score.pdf",width = 12,height = 8)
JackStrawPlot(pbmc, dims = 1:30)
dev.off()

## JackStraw函数运行时间特别久
Warning message:
“Removed 50411 rows containing missing values or values outside the scale range
(`geom_point()`).”

pdf: 2


#通过筛选贡献小于5%方差和累计贡献90%方差的PC截断点作为曲线拐点
#连续PC之间的差异方差贡献百分比变化小于0.1%的点作为曲线拐点
pct <-pbmc[["pca"]]@stdev /sum(pbmc[["pca"]]@stdev) * 100
cumu <- cumsum(pct)

#选择
pc.use <-min(which(cumu >90&pct < 5)[1],
             sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)])>0.1),decreasing = T)[1]+1)
# 保存为 PDF 文件
pdf("./img/elbow_score2.pdf", width = 12, height = 8)


# 绘制 Elbow Plot 并调整字体大小
ElbowPlot(pbmc, ndims = 25)$data %>% 
  ggplot() +
  geom_point(aes(x = dims, y = stdev)) +
  geom_vline(xintercept = pc.use, color = "#5399C4") +
  theme_bw() +
  labs(
    title = "Elbow plot: quantitative approach", 
    x = "Dimensions", 
    y = "Standard Deviation"
  ) +
  theme(
    plot.title = element_text(size = 18, hjust = 0.5),  # 标题字体大小并居中
    axis.title.x = element_text(size = 16),  # X轴标题字体大小
    axis.title.y = element_text(size = 16),  # Y轴标题字体大小
    axis.text.x = element_text(size = 14),   # X轴刻度字体大小
    axis.text.y = element_text(size = 14)    # Y轴刻度字体大小
  )

# 关闭 PDF 文件
dev.off()


#可以看每个pc的方差
Stdev(pbmc)
## 保存这一步的文件
write_rds(pbmc,file = "./data/dim_res_pbmc.rds")

结果如下: image.png

####调试resolution####
# a、尝试不同分辨率下的细胞分群
library(clustree)
set.seed(123)

pbmc <- FindNeighbors(pbmc, reduction = "pca", dims = 1:15)
pbmc <- FindClusters(
  object = pbmc,
  resolution = c( seq( 0.1, 1, 0.1) ) # 分辨率从 0.1,-1,间隔 0.1 一档
)


# 可视化
clustree(pbmc@meta.data, prefix = "RNA_snn_res.")
## 确定为0.5较为合理
载入需要的程序包:ggraph


载入程序包:‘ggraph’


The following object is masked from ‘package:sp’:

    geometry


Computing nearest neighbor graph

Computing SNN



Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 2638
Number of edges: 106339

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9615
Number of communities: 4
Elapsed time: 0 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 2638
Number of edges: 106339

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9338
Number of communities: 7
Elapsed time: 0 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 2638
Number of edges: 106339

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9097
Number of communities: 8
Elapsed time: 0 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 2638
Number of edges: 106339

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8871
Number of communities: 8
Elapsed time: 0 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 2638
Number of edges: 106339

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8680
Number of communities: 9
Elapsed time: 0 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 2638
Number of edges: 106339

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8525
Number of communities: 9
Elapsed time: 0 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 2638
Number of edges: 106339

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8370
Number of communities: 9
Elapsed time: 0 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 2638
Number of edges: 106339

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8215
Number of communities: 9
Elapsed time: 0 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 2638
Number of edges: 106339

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8059
Number of communities: 9
Elapsed time: 0 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 2638
Number of edges: 106339

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.7932
Number of communities: 11
Elapsed time: 0 seconds

sc_ann_7_2.png

## 按照最佳的维度和分辨率进行筛选
## 读取之前pca的结果
pbmc <- read_rds(file.path(data_dir,  "pbmc_pca.rds"))
pbmc <- FindNeighbors(pbmc, dims = 1:8)
pbmc <- FindClusters(pbmc, resolution = 0.5)

# umap可视化
pbmc <- RunUMAP(pbmc, dims = 1:15)
p=DimPlot(pbmc, reduction = "umap",label = T)
p

Computing nearest neighbor graph

Computing SNN



Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 2638
Number of edges: 91842

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8813
Number of communities: 9
Elapsed time: 0 seconds


Warning message:
“The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session”
15:40:34 UMAP embedding parameters a = 0.9922 b = 1.112

15:40:34 Read 2638 rows and found 15 numeric columns

15:40:34 Using Annoy for neighbor search, n_neighbors = 30

15:40:34 Building Annoy index with metric = cosine, n_trees = 50

0%   10   20   30   40   50   60   70   80   90   100%

[----|----|----|----|----|----|----|----|----|----|

*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
|

15:40:34 Writing NN index file to temp file /tmp/RtmpzRaEd4/file4242620826088

15:40:34 Searching Annoy index using 1 thread, search_k = 3000

15:40:35 Annoy recall = 100%

15:40:35 Commencing smooth kNN distance calibration using 1 thread
 with target n_neighbors = 30

15:40:36 Initializing from normalized Laplacian + noise (using RSpectra)

15:40:36 Commencing optimization for 500 epochs, with 107666 positive edges

15:40:39 Optimization finished

sc_ann_8_3.png


# 寻找marker基因并对cluster进行重命名
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE,
                               min.pct = 0.25, logfc.threshold = 0.25)
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC)

# top5 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 5, wt = avg_log2FC)
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
# top20 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 20, wt = avg_log2FC)
# top50 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 50, wt = avg_log2FC)
# write.csv(top50, file.path(data_dir, "cellmarkertop50.csv"), row.names = FALSE)
# write.csv(top20, file.path(data_dir, "cellmarkertop20.csv"), row.names = FALSE)
write.csv(top10, file.path(data_dir, "cellmarkertop10.csv"), row.names = FALSE)

Calculating cluster 0

Calculating cluster 1

Calculating cluster 2

Calculating cluster 3

Calculating cluster 4

Calculating cluster 5

Calculating cluster 6

Calculating cluster 7

Calculating cluster 8
A grouped_df: 18 × 7
p_valavg_log2FCpct.1pct.2p_val_adjclustergene
<dbl><dbl><dbl><dbl><dbl><fct><chr>
3.518916e-81 2.3816470.4290.109 4.825841e-770CCR7
8.349070e-50 2.1102390.3330.102 1.144991e-450LEF1
3.643533e-138 7.2735170.2980.0044.996741e-1341FOLR3
1.448440e-120 6.7304050.2750.0061.986390e-1161S100A12
3.535948e-57 2.1024400.4230.113 4.849199e-532AQP3
1.572168e-38 1.9881280.2770.068 2.156071e-342CD40LG
2.397625e-272 7.3797570.5640.0093.288103e-2683LINC00926
2.745016e-237 7.1350510.4880.0073.764515e-2333VPREB3
1.200580e-167 4.4367810.5910.0561.646476e-1634GZMK
1.895709e-90 3.5700350.4310.061 2.599775e-864GZMH
6.034160e-214 5.4383280.5090.0108.275247e-2105CDKN1C
6.153970e-169 5.8907280.3730.0058.439554e-1655CKB
1.830897e-267 6.0117860.9860.0702.510892e-2636GZMB
1.867633e-181 6.1972120.4900.0142.561272e-1776AKR1C3
1.476034e-221 8.1207900.5330.0022.024232e-2177SERPINF1
1.010540e-200 7.6005200.8000.0121.385855e-1967FCER1A
0.000000e+0014.2511430.5710.000 0.000000e+008LY6G6F
4.359585e-20613.8185290.3570.0005.978735e-2028RP11-879F14.2
# 保存这一步的pbmc数据
write_rds(pbmc, file = file.path(data_dir, "pbmcumap.rds"))
## 通过网站确定pbmc的注释结果
# 读取上一步的rds文件
pbmc <- read_rds(file.path(data_dir, "pbmcumap.rds"))
unique(pbmc$RNA_snn_res.0.5)
p=DimPlot(pbmc, reduction = "umap",label = T)
p

# 进一步细胞注释
celltype=data.frame(ClusterID=0:16,
                    celltype='unkown')
celltype[celltype$ClusterID %in% c(0),2]='Naive CD4 T'
celltype[celltype$ClusterID %in% c(1),2]='CD14+ Mono'
celltype[celltype$ClusterID %in% c(2),2]='Memory CD4 T'
celltype[celltype$ClusterID %in% c(3),2]='B'
celltype[celltype$ClusterID %in% c(4),2]='CD8 T'
celltype[celltype$ClusterID %in% c(5),2]='FCGR3A+ Mono'
celltype[celltype$ClusterID %in% c(6),2]='NK'
celltype[celltype$ClusterID %in% c(7),2]='DC'
celltype[celltype$ClusterID %in% c(8),2]='Platelet'


celltype
table(celltype$celltype)

#先加一列celltype所有值为空
pbmc@meta.data$celltype = "NA"
###注释
for(i in 1:nrow(celltype)){
  pbmc@meta.data[which(pbmc@meta.data$seurat_clusters == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}

table(pbmc@meta.data$celltype)

Idents(pbmc) <- "seurat_clusters"
Idents(pbmc) <- "celltype"

## 查看dim图
p = DimPlot(pbmc, reduction = "umap",label = T)
p
DimPlot(pbmc, reduction = "umap",label = T,split.by = "group")

## 保存注释之后的文件
write_rds(pbmc ,file = "./data/pbmc注释后.rds")