Last active
November 22, 2015 14:50
-
-
Save acomjean/ab4ef1d5eb85d28649e6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import random | |
| from Bio import Entrez | |
| from Bio import SeqIO | |
| #Author: Aram Comjean | |
| #Date: Nov/ 2015 | |
| #Hungtington's Disease CAG count | |
| # returns random nucleotide sequence with at least the number of "cag" repeats specified | |
| def generate_random_seq (number_cag_repeats): | |
| # the length of the total string will be length_k + number_cag_repeats * 3 | |
| length_k = 10000 | |
| # the cag repeats can't start before "off_ends" or within "off_ends" of the end | |
| # eg if off_ends = 100 for a 1000 lenght_k the cag repeat will show up between 100 and 900 | |
| off_ends = 20 | |
| # get random distance from start for cag repeats. | |
| breakpoint = random.randint(off_ends, (length_k - off_ends)) | |
| #create sequence (from Neno's comments). xrange depreciated python 3 | |
| random_sequence = ''.join(random.choice('ACTG') for i in xrange(length_k)) | |
| #string insert the cag repeats | |
| #http://stackoverflow.com/questions/4022827/how-to-insert-some-string-in-the-given-string-at-given-index-in-python | |
| return random_sequence[:breakpoint] + 'CAG' * number_cag_repeats + random_sequence[breakpoint:] | |
| # returns dictionary with max # sequential CAG repeats and location of that maximum | |
| def find_cag_repeat_info (haystack) : | |
| #we're looking for repeats. note techique requires non- repeating search terms. CAG is one such. | |
| search_seq = 'CAG' | |
| search_seq_length = len(search_seq) | |
| needle = search_seq * 2 | |
| # create list with locations of all the repeats. includes overlaps. Searches all frames | |
| # eg [100,301,304,534] | |
| #http://stackoverflow.com/questions/3873361/finding-multiple-occurrences-of-a-string-within-a-string-in-python | |
| match_locations = [n for n in xrange(len(haystack)) if haystack.find(needle, n) == n] | |
| # now look though match locations for locations that are 3 apart as those are multiple repeats one | |
| # codon apart (lenght of search seq) apart | |
| max_position =0 | |
| max_length = 0 | |
| current_long = 2 | |
| previous_position = match_locations.pop(0) | |
| for current_position in match_locations: | |
| if current_position - previous_position == search_seq_length: | |
| current_long += 1 | |
| if current_long > max_length: | |
| if current_long == 3: | |
| max_position = current_position | |
| max_length = current_long | |
| else: | |
| current_long = 2 | |
| previous_position = current_position | |
| #the start of the max position needs be dialed back since we're looking for doubles | |
| max_position -= search_seq_length | |
| return {'max_length':max_length, 'max_start_position': max_position} | |
| #------------------ | |
| # Main | |
| #------------------ | |
| #get random sequence with CAGs here | |
| rand_seq = generate_random_seq(40) | |
| print rand_seq | |
| repeat_summary = find_cag_repeat_info(rand_seq) | |
| print repeat_summary | |
| ids =['NM_002111'] | |
| for id in ids: | |
| Entrez.email = "acomjean@genetics.med.harvard.edu" | |
| #from what nucleotide, what type, what to return, and what id | |
| handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=id) | |
| #read the fasta | |
| seq_record = SeqIO.read(handle, "gb") | |
| print "--------------------" | |
| print 'ID:', id | |
| print seq_record.seq | |
| repeat_summary = find_cag_repeat_info(seq_record.seq) | |
| print repeat_summary |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment