scater로 cell type 결정하기 (ref)
추천글 : 【생물정보학】 생물정보학 분석 목차
1. 패키지 설치하기 [본문]
2. 데이터 셋팅 [본문]
3. quality control [본문]
4. expression calculation [본문]
5. 차원축소기법 [본문]
6. 커스토마이징 [본문]
a. Cell Type Classification Pipeline
install.packages("BiocManager")
BiocManager::install("scRNAseq")
browseVignettes("scRNAseq")
# starting httpd help server ... done
BiocManager::install("scater")
browseVignettes("scater")
# starting httpd help server ... done
2. 데이터 셋팅 [목차]
library(scRNAseq)
example_sce <- ZeiselBrainData()
example_sce
# class: SingleCellExperiment
# dim: 20006 3005
# metadata(0):
# assays(1): counts
# rownames(20006): Tspan12 Tshz1 ... mt-Rnr1 mt-Nd4l
# rowData names(1): featureType
# colnames(3005): 1772071015_C02 1772071017_G12 ... 1772066098_A12
# 1772058148_F03
# colData names(10): tissue group # ... level1class level2class
# reducedDimNames(0):
# spikeNames(0):
# altExpNames(2): ERCC repeat
str(counts(example_sce))
# int [1:20006, 1:3005] 0 3 3 0 1 0 0 11 1 0 ...
# - attr(*, "dimnames")=List of 2
# ..$ : chr [1:20006] "Tspan12" "Tshz1" "Fnbp1l" "Adamts15" ...
# ..$ : chr [1:3005] "1772071015_C02" "1772071017_G12" "1772071017_A05" "1772071014_B06" ...
example_sce$whee <- sample(LETTERS, ncol(example_sce), replace=TRUE)
colData(example_sce)
# DataFrame with 3005 rows and 11 columns
# tissue group # total mRNA mol well sex
# <character> <numeric> <numeric> <numeric> <numeric>
# 1772071015_C02 sscortex 1 1221 3 3
# 1772071017_G12 sscortex 1 1231 95 1
# 1772071017_A05 sscortex 1 1652 27 1
# 1772071014_B06 sscortex 1 1696 37 3
# 1772067065_H06 sscortex 1 1219 43 3
# ... ... ... ... ... ...
# 1772067059_B04 ca1hippocampus 9 1997 19 1
# 1772066097_D04 ca1hippocampus 9 1415 21 1
# 1772063068_D01 sscortex 9 1876 34 3
# 1772066098_A12 ca1hippocampus 9 1546 88 1
# 1772058148_F03 sscortex 9 1970 15 3
# age diameter cell_id level1class
# <numeric> <numeric> <character> <character>
# 1772071015_C02 2 1 1772071015_C02 interneurons
# 1772071017_G12 1 353 1772071017_G12 interneurons
# 1772071017_A05 1 13 1772071017_A05 interneurons
# 1772071014_B06 2 19 1772071014_B06 interneurons
# 1772067065_H06 6 12 1772067065_H06 interneurons
# ... ... ... ... ...
# 1772067059_B04 4 382 1772067059_B04 endothelial-mural
# 1772066097_D04 7 12 1772066097_D04 endothelial-mural
# 1772063068_D01 7 268 1772063068_D01 endothelial-mural
# 1772066098_A12 7 324 1772066098_A12 endothelial-mural
# 1772058148_F03 7 6 1772058148_F03 endothelial-mural
# level2class whee
# <character> <character>
# 1772071015_C02 Int10 V
# 1772071017_G12 Int10 K
# 1772071017_A05 Int6 N
# 1772071014_B06 Int10 H
# 1772067065_H06 Int9 Z
# ... ... ...
# 1772067059_B04 Peric O
# 1772066097_D04 Vsmc K
# 1772063068_D01 Vsmc X
# 1772066098_A12 Vsmc H
# 1772058148_F03 Vsmc N
rowData(example_sce)$stuff <- runif(nrow(example_sce))
rowData(example_sce)
# DataFrame with 20006 rows and 2 columns
# featureType stuff
# <character> <numeric>
# Tspan12 endogenous 0.62180774891749
# Tshz1 endogenous 0.473351755877957
# Fnbp1l endogenous 0.710615682648495
# Adamts15 endogenous 0.189241249114275
# Cldn12 endogenous 0.211544606601819
# ... ... ...
# mt-Co2 mito 0.916890940628946
# mt-Co1 mito 0.747661913745105
# mt-Rnr2 mito 0.51556776673533
# mt-Rnr1 mito 0.917173949768767
# mt-Nd4l mito 0.74331742990762
3. quality control [목차]
⑴ cell level quality control
library(scater)
per.cell <- perCellQCMetrics(example_sce, subsets = list(Mito = grep("mt-", rownames(example_sce))))
summary(per.cell$sum)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 2574 8130 12913 14954 19284 63505
summary(per.cell$detected)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 785 2484 3656 3777 4929 8167
summary(per.cell$subsets_Mito_percent)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.000 3.992 6.653 7.956 10.290 56.955
colData(example_sce) <- cbind(colData(example_sce), per.cell)
plotColData(example_sce, x = "sum", y="detected", colour_by="tissue")
plotColData(example_sce, x = "sum", y="subsets_Mito_percent", other_fields="tissue") + facet_wrap(~tissue)
keep.total <- example_sce$sum > 1e5
keep.n <- example_sce$detected > 500
filtered <- example_sce[,keep.total & keep.n]
keep.total <- isOutlier(per.cell$sum, type="lower", log=TRUE)
filtered <- example_sce[,keep.total]
qc.stats <- quickPerCellQC(per.cell, percent_subsets="subsets_Mito_percent")
colSums(as.matrix(qc.stats))
# low_lib_size low_n_features
# 0 3
# high_subsets_Mito_percent discard
# 128 131
filtered <- example_sce[,!qc.stats$discard]
⑵ feature level quality control
per.feat <- perFeatureQCMetrics(example_sce, subsets = list(Empty = 1:10))
summary(per.feat$mean)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.0007 0.0097 0.1338 0.7475 0.5763 732.1524
summary(per.feat$detected)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.03328 0.76539 9.01830 18.87800 31.24792 99.96672
summary(per.feat$subsets_Empty_ratio)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.000 0.000 0.601 1.872 2.016 300.500
ave <- calculateAverage(example_sce)
summary(ave)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.0002 0.0109 0.1443 0.7475 0.5674 850.6880
summary(nexprs(example_sce, byrow = TRUE))
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1.0 23.0 271.0 567.3 939.0 3004.0
plotHighestExprs(example_sce, exprs_values = "counts")
keep_feature <- nexprs(example_sce, byrow=TRUE) > 0
example_sce <- example_sce[keep_feature,]
⑶ variable level quality control
example_sce <- logNormCounts(example_sce)
vars <- getVarianceExplained(example_sce, variables = c("tissue", "total mRNA mol", "sex", "age"))
head(vars)
# tissue total mRNA mol sex age
# Tspan12 0.02207262 0.074086504 0.146344996 0.09472155
# Tshz1 3.36083014 0.003846487 0.001079356 0.31262288
# Fnbp1l 0.43597185 0.421086301 0.003071630 0.64964174
# Adamts15 0.54233888 0.005348505 0.030821621 0.01393787
# Cldn12 0.03506751 0.309128294 0.008341408 0.02363737
# Rxfp1 0.18559637 0.016290703 0.055646799 0.02128006
plotExplanatoryVariables(vars)
4. expression calculation [목차]
example_sce <- logNormCounts(example_sce)
assayNames(example_sce)
# [1] "counts" "logcounts"
summary(librarySizeFactors(example_sce))
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.1721 0.5437 0.8635 1.0000 1.2895 4.2466
cpm(example_sce) <- calculateCPM(example_sce)
assay(example_sce, "normed") <- normalizeCounts(example_sce, size_factors=runif(ncol(example_sce)), pseudo_count=1.5)
plotExpression(example_sce, rownames(example_sce)[1:6], x = "level1class")
plotExpression(example_sce, rownames(example_sce)[1:6], x = rownames(example_sce)[10])
plotExpression(example_sce, rownames(example_sce)[1:6], x = "level1class", colour_by="tissue")
plotExpression(example_sce, rownames(example_sce)[1:6])
5. 차원축소기법 : PCA와 tSNE 중 하나를 선택하면 됨 [목차]
⑴ PCA(principal components analysis)
example_sce <- runPCA(example_sce)
str(reducedDim(example_sce, "PCA"))
# num [1:3005, 1:50] 15.4 15 17.2 16.9 18.4 ...
# - attr(*, "dimnames")=List of 2
# ..$ : chr [1:3005] "1772071015_C02" "1772071017_G12" "1772071017_A05" "1772071014_B06" ...
# ..$ : chr [1:50] "PC1" "PC2" "PC3" "PC4" ...
# - attr(*, "percentVar")= num [1:50] 39.72 9.38 4.25 3.9 2.76 ...
example_sce <- runPCA(example_sce, name="PCA2", subset_row=rownames(example_sce)[1:1000], ncomponents=25)
str(reducedDim(example_sce, "PCA2"))
# num [1:3005, 1:25] 20 21 23 23.7 21.5 ...
# - attr(*, "dimnames")=List of 2
# ..$ : chr [1:3005] "1772071015_C02" "1772071017_G12" "1772071017_A05" "1772071014_B06" ...
# ..$ : chr [1:25] "PC1" "PC2" "PC3" "PC4" ...
# - attr(*, "percentVar")= num [1:25] 22.3 5.11 3.42 1.69 1.58 ...
plotReducedDim(example_sce, dimred = "PCA", colour_by = "level1class")
Figure. 1. PCA 결과
⑵ tSNE
example_sce <- runUMAP(example_sce)
head(reducedDim(example_sce, "UMAP"))
# [,1] [,2]
# 1772071015_C02 -10.85709 -6.258108
# 1772071017_G12 -10.94470 -6.291569
# 1772071017_A05 -10.80176 -6.240728
# 1772071014_B06 -10.93094 -6.276943
# 1772067065_H06 -10.94264 -6.278843
# 1772071017_E02 -10.96068 -6.285830
plotTSNE(example_sce, colour_by = "Snap25")
Figure. 2. tSNE 결과
6. 커스토마이징 : 자기 데이터를 scater로 분석하고 싶은 경우 [목차]
입력 : 2019.12.20 13:44
'▶ 자연과학 > ▷ 생물정보학' 카테고리의 다른 글
【생물정보학】 데이터 분석 : Kaplan-Meier 생존 곡선 (0) | 2021.04.13 |
---|---|
【생물정보학】 MIA 분석의 이해 및 실행 (0) | 2021.01.02 |
【생물정보학】 Seurat로 cell type 결정하기 (7) | 2019.11.25 |
【생물정보학】 Cell Type Classification Pipeline (0) | 2019.11.22 |
【생물정보학】 TCGA DATA 얻는 법 (2) | 2019.08.26 |
최근댓글