Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Code Block
titlesolution
collapsetrue
#move to the output directory we used when demultiplexing
cd sample_fastqs/


#look at size of our output files
ls -lh *.fq


#remove the empty *.rem.* files
rm *.rem.*


#check the number of paired end files we have
ls *.1.fq | wc -l
ls *.2.fq | wc -l


#Are the paired end files for each sample the same size?
wc -l *.fq

Mapping

Now we're ready to map the reads to a reference genome

Code Block
titleMapping
collapsetrue
#check out our reference
less stickleback_chrom3.fasta 


#how many sequences are there?
grep "^>" stickleback_chrom3.fasta | wc -l


#index the reference
module load bwa
bwa index stickleback_chrom3.fasta


#run mapping
bwa mem stickleback_chrom3.fasta sample_fastqs/sample_AACCA-AGCGAC.1.fq sample_fastqs/sample_AACCA-AGCGAC.2.fq > sampleA.sam
bwa mem stickleback_chrom3.fasta sample_fastqs/sample_CGATC-AGCGAC.1.fq sample_fastqs/sample_CGATC-AGCGAC.2.fq > sampleB.sam
bwa mem stickleback_chrom3.fasta sample_fastqs/sample_GCATG-AGCGAC.1.fq sample_fastqs/sample_GCATG-AGCGAC.2.fq > sampleC.sam
bwa mem stickleback_chrom3.fasta sample_fastqs/sample_TCGAT-AGCGAC.1.fq sample_fastqs/sample_TCGAT-AGCGAC.2.fq > sampleD.sam


#assess mapping efficiency
samtools flagstat sampleA.sam
samtools flagstat sampleB.sam
samtools flagstat sampleC.sam
samtools flagstat sampleD.sam


#these results are not spectacular.
#80% mapping efficiency is OK, but the low rate of properly paired reads is disapointing.


#convert the sam files to bam files
samtools sort sampleA.sam -O SAM -o sampleA.bam
samtools sort sampleB.sam -O SAM -o sampleB.bam
samtools sort sampleC.sam -O SAM -o sampleC.bam
samtools sort sampleD.sam -O SAM -o sampleD.bam