Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Code Block
titlesolution
collapsetrue
#note this may not work on a Mac, try on TACC if it doesn't
cd in_silico_digest/


cat test_reference.fasta | tr -d "\n" | grep -o -E "CATG.{100,300}AATT" | wc -l
cat test_reference.fasta | tr -d "\n" | grep -o -E "CATG.{100,350}AATT" | wc -l

...

Code Block
titlesolution
collapsetrue
#start an idev session if you have not
idev
 
#make a directory to do the analysis in
cds
mkdir rad_intro
cd rad_intro
mkdir in_silico_digest
cd in_silico_digest
 
#download the reference from NCBI
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/222/465/GCF_000222465.1_Adig_1.1/GCF_000222465.1_Adig_1.1_genomic.fna.gz
 
#decompress it
gunzip GCF_000222465.1_Adig_1.1_genomic.fna.gz

#check the command on some different max insert sizes
cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,400}CCGG" | wc -l
cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,500}CCGG" | wc -l
cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,600}CCGG" | wc -l
 
#use
#This can be done in a loop:
to
run
command#set forup athe seriessizes ofyou maximumwant insertto sizescheck
seq 400 100 1000 > maxInserts.txt
less maxInserts.txt


#build a header for your output
echo -e "insert_range\tNsites" > prediction_results.txt


#loop through your max insert sizes to get predictions for the number of sites
while read max;
 do nSites=$(cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,${max}}CCGG" | wc -l);
 echo -e "300-${max}\t${nSites}" >> prediction_results.txt;
done < maxInserts.txt
 
#look at the results
cat prediction_results.txt
 
#These values are somewhat low
#Does#What changingabout to a four-nucleotide restriction enzyme help?
#for NlaIII
cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "CATG.{300,400}CCGG" | wc -l
 

...