...
Does this really work?
This fake reference has 4 RAD fragments, 3 with 100-300 bp insert sizes and 1 more with a 350 bp insert size for NlaIII and MluCI
...
Code Block | ||||
---|---|---|---|---|
| ||||
#look at the file
less test_reference.fasta
#note this may not work on a Mac, try on TACC if it doesn't
cd in_silico_digestion
cat test_reference.fasta | tr -d "\n" | grep -o -E "CATG.{100,300}AATT" | wc -l
cat test_reference.fasta | tr -d "\n" | grep -o -E "CATG.{100,350}AATT" | wc -l |
...
Code Block | ||||
---|---|---|---|---|
| ||||
#download the reference from NCBI wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/222/465/GCF_000222465.1_Adig_1.1/GCF_000222465.1_Adig_1.1_genomic.fna.gz #decompress it gunzip GCF_000222465.1_Adig_1.1_genomic.fna.gz #check the command on some different max insert sizes cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,400}CCGG" | wc -l cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,500}CCGG" | wc -l cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,600}CCGG" | wc -l #This can be done in a loop: #set up the sizes you want to check seq 400 100200 1000800 > maxInserts.txt lesscat maxInserts.txt #build a header for your output echo -e "insert_range\tNsites" > prediction_results.txt #loop through your max insert sizes to get predictions for the number of sites while read max do echo "calculating for 300-${max}..." nSites=$(cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,${max}}CCGG" | wc -l) echo -e "300-${max}\t${nSites}" >> prediction_results.txt done < maxInserts.txt #look at the results cat prediction_results.txt #These values are somewhat low #What about a four-nucleotide restriction enzyme? #for NlaIII cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "CATG.{300,400}CCGG" | wc -l |