Single Cell RNA-Seq Script and Data
I've made available some publicly available data containing single cell RNA-Seq data for 5k immune cells as well as a script that runs the Seurat workflow to define cell-type clusters in this data.
You can get the data, R script and results from stampede2 here:
/work2/projects/BioITeam/projects/courses/rnaseq_course/day_5_single_cell_data Script: seuratTotalScript5k.mod.R Data (after preprocessing by cellRanger): filtered_feature_bc_matrix Results generated: results #You can copy over the data to your directory: cds cd my_rnaseq_course cp -r /work2/projects/BioITeam/projects/courses/rnaseq_course/day_5_single_cell_data .
The script we are going to run is an older version (Because Seurat on TACC is an older version). You can kick it off by doing the following:
module load seurat-scripts/ctr-0.0.5--r341_0 #OPEN AN IDEV SESSION: idev -m 120 -q normal -A OTH21164 -r RNAseq2 R CMD BATCH seuratTotalScript5k.mod.3.4.R &
The script does the following (newer versions of the functions given here):
Load Data and Create Seurat Object
Load data and create Seurat objectpbmc_data <- Read10X(data.dir = "filtered_feature_bc_matrix") pbmc <- CreateSeuratObject(counts = pbmc_data)
- Check what the counts matrix looks likeLook at counts matrix
pbmc.data[c("CD3D", "TCL1A", "MS4A1"), 1:30] ##output ## CD3D 4 . 10 . . 1 2 3 1 . . 2 7 1 . . 1 3 . 2 3 . . . . . 3 4 1 5 ## TCL1A . . . . . . . . 1 . . . . . . . . . . . . 1 . . . . . . . . ## MS4A1 . 6 . . . . . . 1 1 1 . . . . . . . . . 36 1 2 . . 2 . . . .
Normalize Data Using SCTransform and Regress out Genes Related to Cell Cycle
Normalize datas.genes <- cc.genes$s.genes g2m.genes <- cc.genes$g2m.genes pbmc <- CellCycleScoring(pbmc, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE) pbmc$CC.Difference <- pbmc$S.Score - pbmc$G2M.Score pbmc <- SCTransform(pbmc, vars.to.regress = "CC.Difference")
Identify Highly Variable Features
Identify Highly Variable Features and Generate Plotspbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000) top10 <- head(VariableFeatures(pbmc), 10) pdf("variableFeaturesPlot.pdf") plot1 <- VariableFeaturePlot(pbmc) plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE) CombinePlots(plots = list(plot1, plot2)) dev.off()
Scale the Data
Scale dataall.genes <- rownames(pbmc) pbmc <- ScaleData(pbmc, features = all.genes)
Select Dimensions using PCA (using variable features)
Perform PCA for Dimensionality Reduction#Perform PCA for Dimensionality Reduction pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc)) #Print and Examine PCA Results in Multiple Formats pdf("findDimensions.pdf") print(pbmc[["pca"]], dims = 1:5, nfeatures = 5) VizDimLoadings(pbmc, dims = 1:2, reduction = "pca") DimPlot(pbmc, reduction = "pca") #Generate Principal Component Heatmaps through PC_30 DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE) DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE) DimHeatmap(pbmc, dims = 16:30, cells = 500, balanced = TRUE) #Create Elbow Plot of Principal Components through PC_30 ElbowPlot(pbmc, ndims = 30, reduction = "pca") dev.off()
Cluster the cells (using the selected number of dimensions)
Cluster the Cells for the selected numeber of Principal Components#Cluster the Cells for the selected numeber of Principal Components (we selected 14 for 5k Data) pbmc <- FindNeighbors(pbmc, dims = 1:14) pbmc <- FindClusters(pbmc, resolution = 0.5) #View the Cluster IDs of the First 5 Cells head(Idents(pbmc), 5)
Visualize the clusters using tSNE or UMAP
Visualize using tSNEpdf("tsne.pdf") pbmcTSNE <- RunTSNE(pbmc, dims = 1:14) DimPlot(pbmcTSNE, reduction = "tsne") dev.off()
Find markers for each cluster
Find Markers for Every Cluster and Print Top 2 Per Clusterpbmc.markers <- FindAllMarkers(pbmcTSNE, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC) write.csv(pbmc.markers, file = "markersAll.csv") #Generate Heatmap of Top 10 Markers per Cluster top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC) DoHeatmap(pbmcTSNE, features = top10$gene, size = 3) + theme(axis.text.y = element_text(size = 5)) + NoLegend()
Identify cells expressing genes of interest
Identify Clusters for Marker Genes of InterestmarkerGenes <- c("CD27", "CCR7", "CD8A", "CD8B", "IL7R", "GZMK", "CD79A", "CD37", "CD160", "NKG7", "GNL$ seuratClusters <- function(output, genes) { for (gene in genes) { print(output[output$gene == gene,]) } } seuratClusters(pbmc.markers, markerGenes) #Visualize Feature Expression on TSNE Plot of Top Gene per Cluster FeaturePlot(pbmcTSNE, features = c("LTB","S100A8","CCL5","NOSIP")) FeaturePlot(pbmcTSNE, features = c("LINC02446","IGKC","GNLY","MT-CO3")) FeaturePlot(pbmcTSNE, features = c("CST3","NKG7","KLRB1","LST1")) FeaturePlot(pbmcTSNE, features = c("PPBP","AC084033.3")) #Visualize Expression Probability Distribution Across Clusters for Top Gene per Cluster VlnPlot(pbmcTSNE, features = c("LTB","S100A8","CCL5","NOSIP")) VlnPlot(pbmcTSNE, features = c("LINC02446","IGKC","GNLY","MT-CO3")) VlnPlot(pbmcTSNE, features = c("CST3","NKG7","KLRB1","LST1")) VlnPlot(pbmcTSNE, features = c("PPBP","AC084033.3"))
- Label clusters as cell types based on expression of known marker genes.
new.cluster.ids <- c("CD4+ Memory Cells", "CD16+ and CD14+ Monocytes", "Regulatory T Cells", "CD8+ T Cells", "N/A (Cluster 4)", "NK Cells", "B Cells", "Monocyte Derived Dendritic Cells", "N/A (Cluster 8)", "N/A (Cluster 9)", "Monocyte Derived Dendritic Cells", "N/A (Cluster 11)", "N/A (Cluster 12)", "Megakaryocyte Progenitors") names(new.cluster.ids) <- levels(pbmcTSNE) pbmcTSNE <- RenameIdents(pbmcTSNE, new.cluster.ids) DimPlot(pbmcTSNE, reduction = "tsne", label = TRUE)
Let's look at the results:
Download this zip file and unzip it on your computer to view the files (Most computers will automatically unzip files).
Welcome to the University Wiki Service! Please use your IID (yourEID@eid.utexas.edu) when prompted for your email address during login or click here to enter your EID. If you are experiencing any issues loading content on pages, please try these steps to clear your browser cache.