acomjean/search_cag_seq2.py

## search_cag_seq2.py
import random
from Bio import Entrez
from Bio import SeqIO

#Author: Aram Comjean
#Date: Nov/ 2015
#Hungtington's Disease CAG count


# returns random nucleotide sequence with at least the number of "cag" repeats specified

def generate_random_seq (number_cag_repeats):

    # the length of the total string will be length_k + number_cag_repeats * 3
    length_k = 10000
    # the cag repeats can't start before "off_ends" or within "off_ends" of the end
    # eg if off_ends = 100 for a 1000 lenght_k the cag repeat will show up between 100 and 900
    off_ends = 20

    # get random distance from start for cag repeats.
    breakpoint = random.randint(off_ends, (length_k - off_ends))

    #create sequence (from Neno's comments).  xrange depreciated python 3
    random_sequence = ''.join(random.choice('ACTG') for i in xrange(length_k))

    #string insert the cag repeats

    #http://stackoverflow.com/questions/4022827/how-to-insert-some-string-in-the-given-string-at-given-index-in-python
    return  random_sequence[:breakpoint] + 'CAG' * number_cag_repeats + random_sequence[breakpoint:]


# returns dictionary with max # sequential CAG repeats and location of that maximum

def find_cag_repeat_info (haystack) :

    #we're looking for repeats.  note techique requires non- repeating search terms. CAG is one such.
    search_seq = 'CAG'
    search_seq_length = len(search_seq)

    needle = search_seq * 2

    # create list with locations of all the repeats.  includes overlaps. Searches all frames
    #  eg [100,301,304,534]
    #http://stackoverflow.com/questions/3873361/finding-multiple-occurrences-of-a-string-within-a-string-in-python

    match_locations = [n for n in xrange(len(haystack)) if haystack.find(needle, n) == n]

    # now look though match locations for locations that are 3 apart as those are multiple repeats one
    # codon apart (lenght of search seq) apart
    max_position =0
    max_length = 0
    current_long = 2
    previous_position = match_locations.pop(0)

    for current_position in match_locations:

        if current_position - previous_position == search_seq_length:
            current_long += 1
            if current_long > max_length:
                 if current_long == 3:
                     max_position = current_position
                 max_length = current_long
        else:
            current_long = 2
        previous_position = current_position

    #the start of the  max position needs be dialed back since we're looking for doubles

    max_position -= search_seq_length

    return {'max_length':max_length, 'max_start_position': max_position}


#------------------
# Main
#------------------

#get random sequence with CAGs here
rand_seq = generate_random_seq(40)
print rand_seq

repeat_summary = find_cag_repeat_info(rand_seq)
print repeat_summary


ids  =['NM_002111']

for id in ids:
    Entrez.email = "acomjean@genetics.med.harvard.edu"

    #from what nucleotide, what type, what to return, and what id
    handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=id)

    #read the fasta
    seq_record = SeqIO.read(handle, "gb")
    print "--------------------"
    print 'ID:', id
    print seq_record.seq
    repeat_summary = find_cag_repeat_info(seq_record.seq)
    print repeat_summary
	import random
	from Bio import Entrez
	from Bio import SeqIO

	#Author: Aram Comjean
	#Date: Nov/ 2015
	#Hungtington's Disease CAG count


	# returns random nucleotide sequence with at least the number of "cag" repeats specified

	def generate_random_seq (number_cag_repeats):

	# the length of the total string will be length_k + number_cag_repeats * 3
	length_k = 10000
	# the cag repeats can't start before "off_ends" or within "off_ends" of the end
	# eg if off_ends = 100 for a 1000 lenght_k the cag repeat will show up between 100 and 900
	off_ends = 20

	# get random distance from start for cag repeats.
	breakpoint = random.randint(off_ends, (length_k - off_ends))

	#create sequence (from Neno's comments). xrange depreciated python 3
	random_sequence = ''.join(random.choice('ACTG') for i in xrange(length_k))

	#string insert the cag repeats

	#http://stackoverflow.com/questions/4022827/how-to-insert-some-string-in-the-given-string-at-given-index-in-python
	return random_sequence[:breakpoint] + 'CAG' * number_cag_repeats + random_sequence[breakpoint:]



	# returns dictionary with max # sequential CAG repeats and location of that maximum

	def find_cag_repeat_info (haystack) :

	#we're looking for repeats. note techique requires non- repeating search terms. CAG is one such.
	search_seq = 'CAG'
	search_seq_length = len(search_seq)

	needle = search_seq * 2

	# create list with locations of all the repeats. includes overlaps. Searches all frames
	# eg [100,301,304,534]
	#http://stackoverflow.com/questions/3873361/finding-multiple-occurrences-of-a-string-within-a-string-in-python

	match_locations = [n for n in xrange(len(haystack)) if haystack.find(needle, n) == n]

	# now look though match locations for locations that are 3 apart as those are multiple repeats one
	# codon apart (lenght of search seq) apart
	max_position =0
	max_length = 0
	current_long = 2
	previous_position = match_locations.pop(0)

	for current_position in match_locations:

	if current_position - previous_position == search_seq_length:
	current_long += 1
	if current_long > max_length:
	if current_long == 3:
	max_position = current_position
	max_length = current_long
	else:
	current_long = 2
	previous_position = current_position

	#the start of the max position needs be dialed back since we're looking for doubles

	max_position -= search_seq_length

	return {'max_length':max_length, 'max_start_position': max_position}


	#------------------
	# Main
	#------------------

	#get random sequence with CAGs here
	rand_seq = generate_random_seq(40)
	print rand_seq

	repeat_summary = find_cag_repeat_info(rand_seq)
	print repeat_summary


	ids =['NM_002111']

	for id in ids:
	Entrez.email = "acomjean@genetics.med.harvard.edu"

	#from what nucleotide, what type, what to return, and what id
	handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=id)

	#read the fasta
	seq_record = SeqIO.read(handle, "gb")
	print "--------------------"
	print 'ID:', id
	print seq_record.seq
	repeat_summary = find_cag_repeat_info(seq_record.seq)
	print repeat_summary
No results found