Skip to content

Instantly share code, notes, and snippets.

@acomjean
Last active November 22, 2015 14:50
Show Gist options
  • Select an option

  • Save acomjean/ab4ef1d5eb85d28649e6 to your computer and use it in GitHub Desktop.

Select an option

Save acomjean/ab4ef1d5eb85d28649e6 to your computer and use it in GitHub Desktop.
import random
from Bio import Entrez
from Bio import SeqIO
#Author: Aram Comjean
#Date: Nov/ 2015
#Hungtington's Disease CAG count
# returns random nucleotide sequence with at least the number of "cag" repeats specified
def generate_random_seq (number_cag_repeats):
# the length of the total string will be length_k + number_cag_repeats * 3
length_k = 10000
# the cag repeats can't start before "off_ends" or within "off_ends" of the end
# eg if off_ends = 100 for a 1000 lenght_k the cag repeat will show up between 100 and 900
off_ends = 20
# get random distance from start for cag repeats.
breakpoint = random.randint(off_ends, (length_k - off_ends))
#create sequence (from Neno's comments). xrange depreciated python 3
random_sequence = ''.join(random.choice('ACTG') for i in xrange(length_k))
#string insert the cag repeats
#http://stackoverflow.com/questions/4022827/how-to-insert-some-string-in-the-given-string-at-given-index-in-python
return random_sequence[:breakpoint] + 'CAG' * number_cag_repeats + random_sequence[breakpoint:]
# returns dictionary with max # sequential CAG repeats and location of that maximum
def find_cag_repeat_info (haystack) :
#we're looking for repeats. note techique requires non- repeating search terms. CAG is one such.
search_seq = 'CAG'
search_seq_length = len(search_seq)
needle = search_seq * 2
# create list with locations of all the repeats. includes overlaps. Searches all frames
# eg [100,301,304,534]
#http://stackoverflow.com/questions/3873361/finding-multiple-occurrences-of-a-string-within-a-string-in-python
match_locations = [n for n in xrange(len(haystack)) if haystack.find(needle, n) == n]
# now look though match locations for locations that are 3 apart as those are multiple repeats one
# codon apart (lenght of search seq) apart
max_position =0
max_length = 0
current_long = 2
previous_position = match_locations.pop(0)
for current_position in match_locations:
if current_position - previous_position == search_seq_length:
current_long += 1
if current_long > max_length:
if current_long == 3:
max_position = current_position
max_length = current_long
else:
current_long = 2
previous_position = current_position
#the start of the max position needs be dialed back since we're looking for doubles
max_position -= search_seq_length
return {'max_length':max_length, 'max_start_position': max_position}
#------------------
# Main
#------------------
#get random sequence with CAGs here
rand_seq = generate_random_seq(40)
print rand_seq
repeat_summary = find_cag_repeat_info(rand_seq)
print repeat_summary
ids =['NM_002111']
for id in ids:
Entrez.email = "acomjean@genetics.med.harvard.edu"
#from what nucleotide, what type, what to return, and what id
handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=id)
#read the fasta
seq_record = SeqIO.read(handle, "gb")
print "--------------------"
print 'ID:', id
print seq_record.seq
repeat_summary = find_cag_repeat_info(seq_record.seq)
print repeat_summary
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment