본문 바로가기

Contact English

【생물정보학】 scater로 cell type 결정하기

 

scater로 cell type 결정하기 (ref)

 

추천글 : 【생물정보학】 생물정보학 분석 목차 


1. 패키지 설치하기 [본문]

2. 데이터 셋팅 [본문]

3. quality control [본문]

4. expression calculation [본문]

5. 차원축소기법 [본문]

6. 커스토마이징 [본문]


a. Cell Type Classification Pipeline

b. Seurat로 cell type 결정하기

c. scanpy로 cell type 결정하기


 

1. 패키지 설치하기 (ref1, ref2) [목차]

 

install.packages("BiocManager")
BiocManager::install("scRNAseq")
browseVignettes("scRNAseq")
# starting httpd help server ... done
BiocManager::install("scater")
browseVignettes("scater")
# starting httpd help server ... done

 

 

2. 데이터 셋팅 [목차]

 

library(scRNAseq)
example_sce <- ZeiselBrainData()
example_sce
# class: SingleCellExperiment 
# dim: 20006 3005 
# metadata(0):
# assays(1): counts
# rownames(20006): Tspan12 Tshz1 ... mt-Rnr1 mt-Nd4l
# rowData names(1): featureType
# colnames(3005): 1772071015_C02 1772071017_G12 ... 1772066098_A12
#   1772058148_F03
# colData names(10): tissue group # ... level1class level2class
# reducedDimNames(0):
# spikeNames(0):
# altExpNames(2): ERCC repeat

str(counts(example_sce))
#  int [1:20006, 1:3005] 0 3 3 0 1 0 0 11 1 0 ...
#  - attr(*, "dimnames")=List of 2
#   ..$ : chr [1:20006] "Tspan12" "Tshz1" "Fnbp1l" "Adamts15" ...
#   ..$ : chr [1:3005] "1772071015_C02" "1772071017_G12" "1772071017_A05" "1772071014_B06" ...

example_sce$whee <- sample(LETTERS, ncol(example_sce), replace=TRUE)
colData(example_sce)
# DataFrame with 3005 rows and 11 columns
#                        tissue   group # total mRNA mol      well       sex
#                   <character> <numeric>      <numeric> <numeric> <numeric>
# 1772071015_C02       sscortex         1           1221         3         3
# 1772071017_G12       sscortex         1           1231        95         1
# 1772071017_A05       sscortex         1           1652        27         1
# 1772071014_B06       sscortex         1           1696        37         3
# 1772067065_H06       sscortex         1           1219        43         3
# ...                       ...       ...            ...       ...       ...
# 1772067059_B04 ca1hippocampus         9           1997        19         1
# 1772066097_D04 ca1hippocampus         9           1415        21         1
# 1772063068_D01       sscortex         9           1876        34         3
# 1772066098_A12 ca1hippocampus         9           1546        88         1
# 1772058148_F03       sscortex         9           1970        15         3
#                      age  diameter        cell_id       level1class
#                <numeric> <numeric>    <character>       <character>
# 1772071015_C02         2         1 1772071015_C02      interneurons
# 1772071017_G12         1       353 1772071017_G12      interneurons
# 1772071017_A05         1        13 1772071017_A05      interneurons
# 1772071014_B06         2        19 1772071014_B06      interneurons
# 1772067065_H06         6        12 1772067065_H06      interneurons
# ...                  ...       ...            ...               ...
# 1772067059_B04         4       382 1772067059_B04 endothelial-mural
# 1772066097_D04         7        12 1772066097_D04 endothelial-mural
# 1772063068_D01         7       268 1772063068_D01 endothelial-mural
# 1772066098_A12         7       324 1772066098_A12 endothelial-mural
# 1772058148_F03         7         6 1772058148_F03 endothelial-mural
#                level2class        whee
#                <character> <character>
# 1772071015_C02       Int10           V
# 1772071017_G12       Int10           K
# 1772071017_A05        Int6           N
# 1772071014_B06       Int10           H
# 1772067065_H06        Int9           Z
# ...                    ...         ...
# 1772067059_B04       Peric           O
# 1772066097_D04        Vsmc           K
# 1772063068_D01        Vsmc           X
# 1772066098_A12        Vsmc           H
# 1772058148_F03        Vsmc           N

rowData(example_sce)$stuff <- runif(nrow(example_sce))
rowData(example_sce)
# DataFrame with 20006 rows and 2 columns
#          featureType             stuff
#          <character>         <numeric>
# Tspan12   endogenous  0.62180774891749
# Tshz1     endogenous 0.473351755877957
# Fnbp1l    endogenous 0.710615682648495
# Adamts15  endogenous 0.189241249114275
# Cldn12    endogenous 0.211544606601819
# ...              ...               ...
# mt-Co2          mito 0.916890940628946
# mt-Co1          mito 0.747661913745105
# mt-Rnr2         mito  0.51556776673533
# mt-Rnr1         mito 0.917173949768767
# mt-Nd4l         mito  0.74331742990762

 

 

3. quality control [목차]

⑴ cell level quality control 

 

library(scater)
per.cell <- perCellQCMetrics(example_sce, subsets = list(Mito = grep("mt-", rownames(example_sce))))
summary(per.cell$sum)
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#    2574    8130   12913   14954   19284   63505
summary(per.cell$detected)
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#     785    2484    3656    3777    4929    8167 
summary(per.cell$subsets_Mito_percent)
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#   0.000   3.992   6.653   7.956  10.290  56.955 
colData(example_sce) <- cbind(colData(example_sce), per.cell)
plotColData(example_sce, x = "sum", y="detected", colour_by="tissue") 
plotColData(example_sce, x = "sum", y="subsets_Mito_percent", other_fields="tissue") + facet_wrap(~tissue)

keep.total <- example_sce$sum > 1e5
keep.n <- example_sce$detected > 500
filtered <- example_sce[,keep.total & keep.n]
keep.total <- isOutlier(per.cell$sum, type="lower", log=TRUE)
filtered <- example_sce[,keep.total]
qc.stats <- quickPerCellQC(per.cell, percent_subsets="subsets_Mito_percent")
colSums(as.matrix(qc.stats))
#              low_lib_size            low_n_features 
#                         0                         3 
# high_subsets_Mito_percent                   discard 
#                       128                       131
filtered <- example_sce[,!qc.stats$discard]

 

⑵ feature level quality control 

 

per.feat <- perFeatureQCMetrics(example_sce, subsets = list(Empty = 1:10))
summary(per.feat$mean)
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#   0.0007   0.0097   0.1338   0.7475   0.5763 732.1524
summary(per.feat$detected)
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#  0.03328  0.76539  9.01830 18.87800 31.24792 99.96672
summary(per.feat$subsets_Empty_ratio)
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#   0.000   0.000   0.601   1.872   2.016 300.500
ave <- calculateAverage(example_sce)
summary(ave)
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#   0.0002   0.0109   0.1443   0.7475   0.5674 850.6880
summary(nexprs(example_sce, byrow = TRUE))
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#     1.0    23.0   271.0   567.3   939.0  3004.0

plotHighestExprs(example_sce, exprs_values = "counts")
keep_feature <- nexprs(example_sce, byrow=TRUE) > 0
example_sce <- example_sce[keep_feature,]

 

⑶ variable level quality control

 

example_sce <- logNormCounts(example_sce)
vars <- getVarianceExplained(example_sce, variables = c("tissue", "total mRNA mol", "sex", "age"))
head(vars)
#              tissue total mRNA mol         sex        age
# Tspan12  0.02207262    0.074086504 0.146344996 0.09472155
# Tshz1    3.36083014    0.003846487 0.001079356 0.31262288
# Fnbp1l   0.43597185    0.421086301 0.003071630 0.64964174
# Adamts15 0.54233888    0.005348505 0.030821621 0.01393787
# Cldn12   0.03506751    0.309128294 0.008341408 0.02363737
# Rxfp1    0.18559637    0.016290703 0.055646799 0.02128006
plotExplanatoryVariables(vars)

 

 

4. expression calculation [목차]

 

example_sce <- logNormCounts(example_sce)
assayNames(example_sce)
# [1] "counts"    "logcounts"
summary(librarySizeFactors(example_sce))
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#  0.1721  0.5437  0.8635  1.0000  1.2895  4.2466
cpm(example_sce) <- calculateCPM(example_sce)
assay(example_sce, "normed") <- normalizeCounts(example_sce, size_factors=runif(ncol(example_sce)), pseudo_count=1.5)

plotExpression(example_sce, rownames(example_sce)[1:6], x = "level1class")
plotExpression(example_sce, rownames(example_sce)[1:6], x = rownames(example_sce)[10])
plotExpression(example_sce, rownames(example_sce)[1:6], x = "level1class", colour_by="tissue")
plotExpression(example_sce, rownames(example_sce)[1:6])

 

 

5. 차원축소기법 : PCA와 tSNE 중 하나를 선택하면 됨 [목차]

PCA(principal components analysis)

 

example_sce <- runPCA(example_sce)
str(reducedDim(example_sce, "PCA"))
#  num [1:3005, 1:50] 15.4 15 17.2 16.9 18.4 ...
#  - attr(*, "dimnames")=List of 2
#   ..$ : chr [1:3005] "1772071015_C02" "1772071017_G12" "1772071017_A05" "1772071014_B06" ...
#   ..$ : chr [1:50] "PC1" "PC2" "PC3" "PC4" ...
#  - attr(*, "percentVar")= num [1:50] 39.72 9.38 4.25 3.9 2.76 ...

example_sce <- runPCA(example_sce, name="PCA2", subset_row=rownames(example_sce)[1:1000], ncomponents=25)
str(reducedDim(example_sce, "PCA2"))
#  num [1:3005, 1:25] 20 21 23 23.7 21.5 ...
#  - attr(*, "dimnames")=List of 2
#   ..$ : chr [1:3005] "1772071015_C02" "1772071017_G12" "1772071017_A05" "1772071014_B06" ...
#   ..$ : chr [1:25] "PC1" "PC2" "PC3" "PC4" ...
#  - attr(*, "percentVar")= num [1:25] 22.3 5.11 3.42 1.69 1.58 ...

plotReducedDim(example_sce, dimred = "PCA", colour_by = "level1class")

 

Figure. 1. PCA 결과

 

tSNE 

 

example_sce <- runUMAP(example_sce)
head(reducedDim(example_sce, "UMAP"))
#                     [,1]      [,2]
# 1772071015_C02 -10.85709 -6.258108
# 1772071017_G12 -10.94470 -6.291569
# 1772071017_A05 -10.80176 -6.240728
# 1772071014_B06 -10.93094 -6.276943
# 1772067065_H06 -10.94264 -6.278843
# 1772071017_E02 -10.96068 -6.285830

plotTSNE(example_sce, colour_by = "Snap25")

 

Figure. 2. tSNE 결과

 

 

6. 커스토마이징 : 자기 데이터를 scater로 분석하고 싶은 경우 [목차]

 

입력 : 2019.12.20 13:44