Single Cell RNA-Seq Script and Data


I've made available some publicly available data containing single cell RNA-Seq data for 5k immune cells as well as a script that runs the Seurat workflow to define cell-type clusters in this data.

You can get the data, R script and results from stampede2 here:

Script and Data locations
/work2/projects/BioITeam/projects/courses/rnaseq_course/day_5_single_cell_data
	Script: seuratTotalScript5k.mod.R
	Data (after preprocessing by cellRanger): filtered_feature_bc_matrix
	Results generated: results

#You can copy over the data to your directory:
cds
cd my_rnaseq_course
cp -r /work2/projects/BioITeam/projects/courses/rnaseq_course/day_5_single_cell_data .


The script we are going to run is an older version (Because Seurat on TACC is an older version). You can kick it off by doing the following:

Load modules and execute R script
module load seurat-scripts/ctr-0.0.5--r341_0

#OPEN AN IDEV SESSION:
idev -m 120 -q normal -A OTH21164 -r RNAseq2

R CMD BATCH seuratTotalScript5k.mod.3.4.R &

The script does the following (newer versions of the functions given here):

  • Load Data and Create Seurat Object

    Load data and create Seurat object
    pbmc_data <- Read10X(data.dir = "filtered_feature_bc_matrix")
    pbmc <- CreateSeuratObject(counts = pbmc_data)
  • Check what the counts matrix looks like
    Look at counts matrix
    pbmc.data[c("CD3D", "TCL1A", "MS4A1"), 1:30]
    
    ##output                                                                   
    ## CD3D  4 . 10 . . 1 2 3 1 . . 2 7 1 . . 1 3 . 2  3 . . . . . 3 4 1 5
    ## TCL1A . .  . . . . . . 1 . . . . . . . . . . .  . 1 . . . . . . . .
    ## MS4A1 . 6  . . . . . . 1 1 1 . . . . . . . . . 36 1 2 . . 2 . . . .
  • Normalize Data Using SCTransform and Regress out Genes Related to Cell Cycle

    Normalize data
    s.genes <- cc.genes$s.genes
    g2m.genes <- cc.genes$g2m.genes
    pbmc <- CellCycleScoring(pbmc, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
    pbmc$CC.Difference <- pbmc$S.Score - pbmc$G2M.Score
    pbmc <- SCTransform(pbmc, vars.to.regress = "CC.Difference")
  • Identify Highly Variable Features

    Identify Highly Variable Features and Generate Plots
    pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
    top10 <- head(VariableFeatures(pbmc), 10)
    pdf("variableFeaturesPlot.pdf")
    plot1 <- VariableFeaturePlot(pbmc)
    plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
    CombinePlots(plots = list(plot1, plot2))
    dev.off()
  • Scale the Data

    Scale data
    all.genes <- rownames(pbmc)
    pbmc <- ScaleData(pbmc, features = all.genes)
  • Select Dimensions using PCA (using variable features)

    Perform PCA for Dimensionality Reduction
    #Perform PCA for Dimensionality Reduction
    pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
    
    #Print and Examine PCA Results in Multiple Formats
    pdf("findDimensions.pdf")
    print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
    VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
    DimPlot(pbmc, reduction = "pca")
    
    #Generate Principal Component Heatmaps through PC_30
    DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
    DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)
    DimHeatmap(pbmc, dims = 16:30, cells = 500, balanced = TRUE)
    
    #Create Elbow Plot of Principal Components through PC_30
    ElbowPlot(pbmc, ndims = 30, reduction = "pca")
    dev.off()
  • Cluster the cells (using the selected number of dimensions)

    Cluster the Cells for the selected numeber of Principal Components
    #Cluster the Cells for the selected numeber of Principal Components (we selected 14 for 5k Data)
    pbmc <- FindNeighbors(pbmc, dims = 1:14)
    pbmc <- FindClusters(pbmc, resolution = 0.5)
    
    #View the Cluster IDs of the First 5 Cells
    head(Idents(pbmc), 5)
  • Visualize the clusters using tSNE or UMAP

    Visualize using tSNE
    pdf("tsne.pdf")
    pbmcTSNE <- RunTSNE(pbmc, dims = 1:14)
    DimPlot(pbmcTSNE, reduction = "tsne")
    dev.off()
  • Find markers for each cluster

    Find Markers for Every Cluster and Print Top 2 Per Cluster
    pbmc.markers <- FindAllMarkers(pbmcTSNE, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
    pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC)
    write.csv(pbmc.markers, file = "markersAll.csv")
    
    #Generate Heatmap of Top 10 Markers per Cluster
    top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
    DoHeatmap(pbmcTSNE, features = top10$gene, size = 3) + theme(axis.text.y = element_text(size = 5)) + NoLegend()
  • Identify cells expressing genes of interest 

    Identify Clusters for Marker Genes of Interest
    markerGenes <- c("CD27", "CCR7", "CD8A", "CD8B", "IL7R", "GZMK", "CD79A", "CD37", "CD160", "NKG7", "GNL$
    seuratClusters <- function(output, genes) {
            for (gene in genes) {
            print(output[output$gene == gene,]) }
            }
    seuratClusters(pbmc.markers, markerGenes)
    
    #Visualize Feature Expression on TSNE Plot of Top Gene per Cluster
    FeaturePlot(pbmcTSNE, features = c("LTB","S100A8","CCL5","NOSIP"))
    FeaturePlot(pbmcTSNE, features = c("LINC02446","IGKC","GNLY","MT-CO3"))
    FeaturePlot(pbmcTSNE, features = c("CST3","NKG7","KLRB1","LST1"))
    FeaturePlot(pbmcTSNE, features = c("PPBP","AC084033.3"))
    
    #Visualize Expression Probability Distribution Across Clusters for Top Gene per Cluster
    VlnPlot(pbmcTSNE, features = c("LTB","S100A8","CCL5","NOSIP"))
    VlnPlot(pbmcTSNE, features = c("LINC02446","IGKC","GNLY","MT-CO3"))
    VlnPlot(pbmcTSNE, features = c("CST3","NKG7","KLRB1","LST1"))
    VlnPlot(pbmcTSNE, features = c("PPBP","AC084033.3")) 
  • Label clusters as cell types based on expression of known marker genes.
Generate Labeled TSNE Plot
new.cluster.ids <- c("CD4+ Memory Cells", "CD16+ and CD14+ Monocytes", "Regulatory T Cells", "CD8+ T Cells", "N/A (Cluster 4)", "NK Cells", "B Cells", "Monocyte Derived Dendritic Cells", "N/A (Cluster 8)", "N/A (Cluster 9)", "Monocyte Derived Dendritic Cells", "N/A (Cluster 11)", "N/A (Cluster 12)", "Megakaryocyte Progenitors")
names(new.cluster.ids) <- levels(pbmcTSNE)
pbmcTSNE <- RenameIdents(pbmcTSNE, new.cluster.ids)
DimPlot(pbmcTSNE, reduction = "tsne", label = TRUE)

Let's look at the results: 

Download this zip file and unzip it on your computer to view the files (Most computers will automatically unzip files).

results.zip