Introduction to RNA Seq Short Course Commands

Class Slides


Idev session and getting data

Setting up
ssh <username>@stampede.tacc.utexas.edu
 
cds
cp -r /corral-repl/utexas/BioITeam/short_rnaseq_course/ my_short_rnaseq_course 

cd my_short_rnaseq_course/


Step 1:  Evaluate and manipulate raw data

Count and Fastqc
cd quality_exercise
 
head data/Sample1_R1.fastq 

wc -l data/Sample1_R1.fastq 
grep -c '^@HWI' data/Sample1_R1.fastq 

module spider fastqc
module load fastqc
fastqc -h
#DONT RUN ON HEAD NODE:
fastqc data/Sample1_R1.fastq


Look at Fastqc results:

Ideal dataset: http://www.bioinformatics.babraham.ac.uk/projects/fastqc/good_sequence_short_fastqc.html

Our dataset: http://web.corral.tacc.utexas.edu/BioITeam/rnaseq_course/fastqc_exercise/Sample1_R1_fastqc/fastqc_report.html


manipulate fastq files- quality trimming and filtering
module spider fastx
module load fastx_toolkit
#DONT RUN ON HEAD NODE:
fastx_trimmer -i data/Sample1_R1.fastq -l 90 -Q 33 -o Sample1_R1.trimmed.fastq
#DONT RUN ON HEAD NODE:
fastq_quality_filter -q 20 -p 80 -i data/Sample1_R1.fastq -Q 33 -o Sample1_R1.filtered.fastq


grep -c '^@HWI' results/Sample1_R1.trimmed.fastq 
grep -c '^@HWI' results/Sample1_R1.filtered.fastq 

Step 2: Map Raw Data to Reference, Assess results

Map with BWA
cd ..
cd mapping_exercise
module spider bwa
module load bwa/0.7.12


#DONT RUN ON HEAD NODE:
bwa index -a bwtsw reference/transcripts.fasta

bwa mem

#DONT RUN ON HEAD NODE:
bwa mem ../reference/transcripts.fasta ../data/GSM794483_C1_R1_1.fq data/GSM794483_C1_R1_2.fq > C1_R1.mem.sam

Map with Tophat
module spider hisat

#DONT RUN ON HEAD NODE
hisat2-build reference/genome.fa reference/genome.fa

 
#DONT RUN ON HEAD NODE:
hisat2 -x ../reference/genome.fa -1 ../data/GSM794483_C1_R1_1.fq -2 ../data/GSM794483_C1_R1_2.fq -S GSM794483_C1.sam --phred33 --novel-splicesite-outfile GSM794483_C1.junctions --rna-strandness RF --dta -t


Assess Mapping Results
module load samtools

#SYNTAX:
samtools view -b -S samfile > bamfile
samtools sort bamfile sortedbamfile
samtools index sortedbamfile


#BWA RESULTS
samtools flagstat bwa_mem_results_transcriptome/C1_R1.mem.bam
samtools idxstats bwa_mem_results_transcriptome/C1_R1.mem.bam


#How is hisat2 result different from bwa
head -20 hisat_exercise/results/GSM794483_C1.sam
grep -v '^@' bwa_exercise/bwa_mem_results_transcriptome/C1_R1.mem.sam|head|cut -f 6
grep -v '^@' hisat_exercise/results/GSM794483_C1.sam|head|cut -f 6


#DONT RUN ON HEAD NODE
grep -v '^@' hisat_exercise/results/GSM794483_C1.sam|cut -f 6|wc -l

Step 3 and Step 4 :  Quantify transcripts and ID DEGs
Look at bedtools and DESeq results
cd deg_exercise/
 
module swap intel gcc/4.7.1
module load bedtools
bedtools
bedtools multicov
#DONT RUN ON HEAD NODE
bedtools multicov -bams hisat_results/C1_R1.sorted.bam hisat_results/C1_R2.sorted.bam hisat_results/C1_R3.sorted.bam hisat_results/C2_R1.sorted.bam hisat_results/C2_R2.sorted.bam hisat_results/C2_R3.sorted.bam -bed ../reference/genes.formatted.gtf > gene_counts.gff

#What's the difference between bedtools and htseq results?
 
#First look at the annotation file
head ../mapping_exercise/reference/genes.formatted.gtf 

#bedtools result 
head gene_counts.gff 
wc -l gene_counts.gff
 
#htseq result
head gene_counts.htseq.gff 
wc -l gene_counts.htseq.gff
 
#DESEQ2 script in:https://wikis.utexas.edu/display/bioiteam/Testing+for+Differential+Expression
#DESEQ2 result:
head deseq_results/DESeq-C1-vs-C2.csv 
 
#Find the top 10 upregulated genes
sed 's/,/\t/g' deseq_results/DESeq-C1-vs-C2.csv|sort -n -r -k7,7|cut -f 2,7|head
#If there's an idiosyncracy with sort 
sed 's/,/\t/g' deseq_results/DESeq-C1-vs-C2.csvsort -n -r -k7,7|grep -v 'e-0'|cut -f 2,7|head
 
#Find the top 10 downregulated genes
sed 's/,/\t/g' deseq_results/DESeq-C1-vs-C2.csv|sort -n -k7,7|cut -f 2,7|head
#If there's an idiosyncracy with sort 
sed 's/,/\t/g' deseq_results/DESeq-C1-vs-C2.csv|sort -n -k7,7|grep -v 'e-0'|cut -f 2,7|head
 
#Count the DEGs
sed 's/,/\t/g' deseq_results/DESeq-C1-vs-C2.csv|awk '{if ((($7>=1)||($7<=-1))&&($9<=0.05)) print}'|wc -l