...
Code Block | ||
---|---|---|
| ||
cds cd my_rnaseq_course cd day_3_partA/gene_expression_exercise exercise # you should have already copied this over |
...
DESeq2 Input:
DESeq2 takes as input count data in several forms: a table form, with each column representing a biological replicate/biological condition. DESEQ2 can also read data directly from htseq results, so we can use the 6 files we generated using htseq as input for DESeq2. The count data must be raw counts of sequencing reads, not already normalized data.
Example:
C1_R1 C1_R2 C1_R3 C2_R1 C2_R2 C2_R2
FBgn0000003 0 0 0 0 0 0
FBgn0000008 92 161 76 70 140 88
FBgn0000014 5 1 0 0 4 0
DESeq2 (with the use of an additional packages called tximport and readr) can read data directly from kallisto abundance files. We will need to provide the location of the 6 abundance files, the sample names associated to each file and a Sample Table that gives the mapping between sample and condtion.
SampleName Condition
DESeq2 Scripts:
- DESeq2 script to work with kallisto count output is provided below.
Code Block | ||
---|---|---|
| ||
R #load libraries library(tximport) library("DESeq2") #Import a file called file_list with all the locations of the abundance.tsv files #eg below: #/stor/SCRATCH/sample1/abundances.tsv #/stor/SCRATCH/sample2/abundances.tsv #/stor/SCRATCH/sample3/abundances.tsv #/stor/SCRATCH/sample4/abundances.tsv files<-as.character(read.table("file_list", header=FALSE)$V1) #look at the data structures files #Import a file called samples with the sample names corresponding to each file in the file_list #eg below: #sample1 #sample2 #sample3 #sample4 #look at the data structures files samples<-as.character(read.table("samples",header=FALSE)$V1) names(files)<-samples #look at the data structures samples files #Import a file called sampletable which is a tab-delimited file that contains each samplename along with the condition #eg below: #samples condition #sample1 alc #sample2 alc #sample3 con #sample4 con sampleTable <-read.table("sampleTable",header=TRUE, row.names=1) #look at the data structure head(sampleTable) #IMPORTANT: MAKE SURE THE SAMPLES AND FILE_LIST ARE IN THE SAME ORDER- SAMPLES SHOULD MATCH UP WITH FILES samples==rownames(sampleTable) #should return TRUE for all #Import a file called tx2gene.csv which a csv file that contains the transcript id to gene id mapping #For Drosophila, this is located at: tx2gene.csv tx2gene <- read.csv("tx2gene.csv") #look at this data structure tx2gene #read in kallisto abundance files, summarizing by gene txi <- tximport(files, type = "kallisto", tx2gene = tx2gene) names(txi) #make a deseq2 object from the kallisto summarized counts ddsMatrix <- DESeqDataSetFromTximport(txi, sampleTable, ~condition) ddsMatrix #Optionally, if you want to save this count matrix as a file write.csv(assay(ddsMatrix), file="genecounts.raw.csv", quote=FALSE) #Optionally, if you want to save normalized counts matrix as a file ddsMatrix<- estimateSizeFactors(ddsMatrix) normalized_counts <- counts(ddsMatrix, normalized=TRUE) write.csv(normalized_counts, file="genecounts.normalized.csv", quote=FALSE) #Optionally, if you want to do variance stabilizing transformation or regularized log transformation on this count matrix and then save as a file: This can become input to things like wgcna, pca vsd<- vst(ddsMatrix, blind=TRUE) write.csv(assay(vsd), file="genecounts.variancestabilized.csv", quote=FALSE) #estimate size factors, dispersion, normalize and perform negative binomial test to compare across conditions dds<-DESeq(ddsMatrix) #collect results from the statistical testing, order by adjusted pvalue and write into an output file res<-results(dds) resOrdered <- res[order(res$padj),] summary(res) write.csv(resOrdered, "deseq2_kallisto_C1_vs_C2.csv") #generate MA plot pdf('MAPlot.pdf') plotMA(dds,ylim=c(-2,2),main="DESeq2") dev.off() #save the R data and history save.image(file="deseq2.kallisto.Rdata") savehistory(file="deseq2.kallisto.Rhistory") |
Code Block | ||
---|---|---|
| ||
#IF YOU DIDNT WANT TO RUN THE SCRIPT LINE BY LINE R CMD BATCH deseq2.kallisto.R |
2. DESeq2 script to work with Htseq count output
Code Block | ||
---|---|---|
| ||
library("DESeq2") #GET HTSEQ COUNTS AND SET UP SAMPLE TABLE directory<-(getwd()) samples <- c("C1_count1.gff", "C1_count2.gff", "C1_count3.gff", "C2_count4.gff", "C2_count5.gff", "C2_count6.gff") conditions <- c("control", "control", "control", "treated","treated","treated") sampleTable<-data.frame(sampleName=samples, fileName=samples, condition=conditions) sampleTable #BUILD A DESEQ2 OBJECT FROM THE HTSEQ COUNTS DATA ddsHTSeq<-DESeqDataSetFromHTSeqCount(sampleTable=sampleTable, directory=directory, design=~condition) colData(ddsHTSeq)$condition<-factor(colData(ddsHTSeq)$condition, levels=c("control", "treated")) #LOOK AT THE DESEQ2 OBJECT WE'VE CREATED BY READING IN HTSEQ COUNT FILES ddsHTSeq Optionally, if you want to save this count matrix as a file write.csv(assay(ddsHTSeq), file="genecounts.raw.csv", quote=FALSE) #Optionally, if you want to save normalized counts matrix as a file ddsHTSeq<- estimateSizeFactors(ddsHTSeq) normalized_counts <- counts(ddsHTSeq, normalized=TRUE) write.csv(normalized_counts, file="genecounts.normalized.csv", quote=FALSE) #Optionally, if you want to do variance stabilizing transformation or regularized log transformation on this count matrix and then save as a file: This can become input to things like wgcna, pca vsd<- vst(ddsHTSeq) write.csv(assay(vsd), file="genecounts.variancestabilized.csv") #RUN THE STATISTICAL TEST IN ONE GO- NORMALIZATION, ESTIMATE DISPERSION/VARIANCE AND DO TEST FOR DIFFERENTIAL EXPRESSION dds<-DESeq(ddsHTSeq) res<-results(dds) res<-res[order(res$padj),] mcols(res,use.names=TRUE) summary(res) #GENERATE MA PLOT png('MAPlot_htseq.png') plotMA(dds,ylim=c(-2,2),main="DESeq2") dev.off() #WRITE RESULTS INTO FILE write.csv(as.data.frame(res),file="deseq2_htseq_C1_vs_C2.csv") |
Code Block | ||
---|---|---|
| ||
#IF YOU DIDNT WANT TO RUN THE SCRIPT LINE BY LINE R CMD BATCH deseq2.htseq.R |
If you have counts table *(not from htseq or Kallisto), here's a deseq2 script you could use deseq2.counts.R
...