#start an idev session if you have not
idev
#make a directory to do the analysis in
cds
mkdir rad_intro
cd rad_intro
mkdir in_silico_digest
cd in_silico_digest
#download the reference from NCBI
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/222/465/GCF_000222465.1_Adig_1.1/GCF_000222465.1_Adig_1.1_genomic.fna.gz
#decompress it
gunzip GCF_000222465.1_Adig_1.1_genomic.fna.gz
#check the command on some different max insert sizes
cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,400}CCGG" | wc -l
cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,500}CCGG" | wc -l
cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,600}CCGG" | wc -l
#use
#This can be done in a loop:
to
run
command#set forup athe seriessizes ofyou maximumwant insertto sizescheck
seq 400 100 1000 > maxInserts.txt
less maxInserts.txt
#build a header for your output
echo -e "insert_range\tNsites" > prediction_results.txt
#loop through your max insert sizes to get predictions for the number of sites
while read max;
do nSites=$(cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,${max}}CCGG" | wc -l);
echo -e "300-${max}\t${nSites}" >> prediction_results.txt;
done < maxInserts.txt
#look at the results
cat prediction_results.txt
#These values are somewhat low
#Does#What changingabout to a four-nucleotide restriction enzyme help?
#for NlaIII
cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "CATG.{300,400}CCGG" | wc -l
|