This is a template script written by Scott Hunicke-Smith to illustrate how to run exome analysis much faster on Lonestar. It requires only two fastq files (a read 1 / read 2 pair) and two parameters ($splitSize and $batchSize). It is NOT optimized, nor is it highly robust.
This bash script needs to be run on a head node, somewhere it won't be killed, since it polls qstat until each round of submitted jobs finishes:
fastExon.sh
#!/bin/bash
# Copyright 2012 Scott Hunicke-Smith and the University of Texas at Austin

module load python
module load bwa
module load samtools
module load java64

r1file=$1
r2file=$2
splitSize=$3
batchSize=$4
queue="normal"

echo "Starting: `date`"

# 1. Split input fastq's as one job; store job #
echo "split -d -l $splitSize $r1file r1.
split -d -l $splitSize $r2file r2.
" > split.script
/home1/01057/sphsmith/local/bin/launcher_creator.py -q $queue -j split.script -l split.sge -a DNAdenovo -n split -t 1:00:00
qsub split.sge &> split.sge.sublog
splitJID=`tail -1 split.sge.sublog | awk '{print $3}'`
echo "Submitted $splitJID to split input files at `date`"
echo "Waiting for split to finish"
while qstat | grep $splitJID ; do
    echo `date`
    sleep 30
done

# 2. Move a set of splits into their own directory
i=0
fileList=""
subdirList=""
for file in $( ls r1.* ) ; do
    fileExt="${file##*.}"
    if [ `expr $i % $batchSize` -eq `expr $batchSize - 1` ]
    then
        mkdir b.$i
        subdirList="$subdirList b.$i"
        fileList="$fileList $fileExt"
        for dataFiles in $fileList ; do
            mv r2.$dataFiles b.$i
            mv r1.$dataFiles b.$i
        done
        fileList=""
    else
        fileList="$fileList $fileExt"
    fi
    i=`expr $i + 1`
done
# And the residual set, if any:
for file in $( ls r1.* ) ; do
    mkdir b.$i
    subdirList="$subdirList b.$i"
    mv r1.* b.$i
    mv r2.* b.$i
done

# 3. Launch exome_step1.sh on each split within it's own directory; store job numbers;
#    launch exome_step2.sh to combine chr files
mapJIDs=""
for subdir in $subdirList ; do
    cd $subdir
    echo "Creating launcher for all files in $subdir: `date`"
    rm -f map.sge
    for file in $( ls r1.* ) ; do
        fileExt="${file##*.}"
        echo "Run exome_step1.sh on r1.$fileExt and r2.$fileExt"
        echo "/home1/01057/sphsmith/local/bin/exome_step1.bash r1.$fileExt r2.$fileExt /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta mapped.$fileExt >& mapped.$fileExt.log" >> map.script
    done
    /home1/01057/sphsmith/local/bin/launcher_creator.py -q $queue -j map.script -l map.sge -a DNAdenovo -n map.$subdir -t 1:00:00 -w 2
    qsub map.sge &> map.sge.sublog
    mapJID=`tail -1 map.sge.sublog | awk '{print $3}'`
    mapJIDs="$mapJIDs,`tail -1 map.sge.sublog | awk '{print $3}'`"
    echo "Submitted $mapJID to split input files in $subdir at `date`"
    cd ..
done
echo "Waiting for mapping to finish"
while qstat | grep $mapJID ; do
    echo `date`
    sleep 30
done
echo "Finished: `date`"

# 4. Launch job to combine final chr files across all directories
echo "Creating launcher for merging by reference sequence: `date`"
subdir=`ls -d b.* | head -1`
subdirExt="${subdir##*.}"
evalcmd="ls b.$subdirExt/*.mapped.*.sorted.bam | awk -F [./] '{print \$3}' | sort | uniq"
refList=`eval $evalcmd`
# Randomize this list so large and small reference sequences are mixed up
refList=`echo $refList | awk 'BEGIN {srand() } {for (i=1;i<=NF;i++) {print rand() "\t" $i "\n"}}' | sort -n | cut -f 2`
echo $refList
rm -f merge.script
for refs in $refList ; do
    echo "Merging $refs "
    echo "samtools merge -f $refs.sorted.bam b.*/$refs.mapped.*.sorted.bam; samtools index $refs.sorted.bam" >> merge.script
done
/home1/01057/sphsmith/local/bin/launcher_creator.py -q $queue -j merge.script -l merge.sge -a DNAdenovo -n merge -t 1:00:00 -w 4
echo "Submitting job; queue start contingent on $mapJIDs completing first"
qsub merge.sge -hold_jid $mapJIDs &> merge.sge.sublog
mergeJID=`tail -1 merge.sge.sublog | awk '{print $3}'`
echo "Submitted $mergeJID to merge output files at `date`"
echo "Waiting for merging to finish"
while qstat | grep $mergeJID ; do
    echo `date`
    sleep 30
done

# 5. Launch GATK on each reference sequences' sorted bam file
echo "Creating launcher for merging by reference sequence: `date`"
subdir=`ls -d b.* | head -1`
subdirExt="${subdir##*.}"
evalcmd="ls b.$subdirExt/*.mapped.*.sorted.bam | awk -F [./] '{print \$3}' | sort | uniq"
refList=`eval $evalcmd`
# Randomize this list so large and small reference sequences are mixed up
refList=`echo $refList | awk 'BEGIN {srand() } {for (i=1;i<=NF;i++) {print rand() "\t" $i "\n"}}' | sort -n | cut -f 2`
echo $refList
rm -f variants.script
for refs in $refList ; do
    echo "GATK via exome_step2.bash on $refs.sorted.bam"
    echo "/home1/01057/sphsmith/local/bin/exome_step2.bash $refs.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf $refs >& variants.$refs.log" >> variants.script
done
# Note that the -w 2 option here defines how many GATK's run per node - might need optimization
/home1/01057/sphsmith/local/bin/launcher_creator.py -q $queue -j variants.script -l variants.sge -a DNAdenovo -n variants -t 4:00:00 -w 2
sed -i s/'module load launcher'/'module load launcher\nmodule load java64\nmodule load samtools'/ variants.sge
qsub variants.sge -hold_jid $mergeJID &> variants.sge.sublog
variantsJID=`tail -1 variants.sge.sublog | awk '{print $3}'`
echo "Submitted $variantsJID to call variants at `date`"
echo "Waiting for variant calling to finish"
while qstat | grep $variantsJID ; do
    echo `date`
    sleep 30
done

# 6. Merge all bam files & vcf files
echo "samtools merge -f $r1file.sorted.bam *.sorted.bam" > merge2.script
/home1/01057/sphsmith/local/bin/launcher_creator.py -q $queue -j merge2.script -l merge2.sge -a DNAdenovo -n merge2 -t 1:00:00 -w 4
qsub merge2.sge -hold_jid $variantsJID &> merge2.sge.sublog
merge2JID=`tail -1 merge2.sge.sublog | awk '{print $3}'`
echo "Submitted $merge2JID to merge output files at `date`"
echo "Waiting for merging to finish"
while qstat | grep $merge2JID ; do
    echo `date`
    sleep 30
done

grep '^#' chrX.sorted.bam.snps.vcf > $r1file.snps.vcf
grep -v '^#' chr*.sorted.bam.snps.vcf >> $r1file.snps.vcf

echo "Fast exon analysis is complete at: `date`"
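A hypothetical invocation from a login node might look like the following (the file names and parameter values here are made up for illustration; $splitSize counts fastq lines, so it should be a multiple of 4, and $batchSize is the number of split-file pairs mapped per subdirectory):

nohup ./fastExon.sh sample_R1.fastq sample_R2.fastq 4000000 2 > fastExon.log 2>&1 &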
It uses the TACC "launcher" functionality to do the following:
- Create one job on one node which splits the two input fastq files into files with $splitSize lines per file, using split.sge and split.script, and wait for that job to finish.
- Create as many subdirectories as needed for the split output files to be mapped, $batchSize files per directory. Lonestar nodes have two sockets with six cores each (12 cores per node), so a good choice here is to make $batchSize two so that each mapping job can use 6 threads.
- Create a map.script in each subdirectory, with one exome_step1.bash command per pair of split files ($batchSize lines per directory, or roughly originalFileLines / $splitSize commands overall; see the sizing sketch after this list), and submit map.sge to do the mapping. Note that the embedded mapping script bwa_mapPair.sh splits the mapping output into chromosome-specific files during the bwa sampe step.
- Merge all the chromosome-specific files from these subdirectories back to the run directory using merge.sge and merge.script.
- Run GATK on sets of these chromosome-specific files, with two GATK instances per node (currently hardcoded in the script via -w 2), using variants.sge and variants.script. Since chromosomes are numbered roughly by decreasing size (chr1 is the largest), the list is randomized so that the big chromosomes don't all end up on one node (see the shuffle sketch after this list).
- Merge the final GATK chromosome-specific variant calls, both the BAM files and the VCF files, using merge2.sge and merge2.script.
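As a rough sizing sketch for the split and batch steps above (the numbers here are assumptions, not values taken from the script): an input with about 40 million read pairs has roughly 160 million lines per fastq file, so a $splitSize of 4,000,000 yields 40 split files per input and, with a $batchSize of 2, 20 mapping subdirectories:

# Assumed example sizes, not taken from the script itself
readPairs=40000000                       # roughly the benchmark data set size
linesPerFastq=`expr $readPairs \* 4`     # 4 fastq lines per read
splitSize=4000000
batchSize=2
echo `expr $linesPerFastq / $splitSize`                # 40 split files per input fastq
echo `expr $linesPerFastq / $splitSize / $batchSize`   # 20 mapping subdirectories (b.*)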
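The chromosome-list shuffle used before the merge and GATK launchers can be tried on its own; this is a minimal sketch with made-up reference names, using the same awk/sort/cut idea as the script:

echo "chr1 chr2 chr3 chrX chrM" | \
    awk 'BEGIN {srand()} {for (i=1;i<=NF;i++) print rand() "\t" $i}' | \
    sort -n | cut -f 2
# prints the same names one per line, in an order determined by the random keys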
Examples of the various .sge and .script files are shown below.
Benchmark analysis on ~40 million read pairs from a single human exome experiment shows that this script takes about 2 hours, versus about 15 hours if all of these same processes are run on a single node.