Created
September 5, 2018 14:16
-
-
Save seanlaidlaw/b863f5c3934e558aaa99b0d4ed4f3131 to your computer and use it in GitHub Desktop.
Extract DGE tag data from oriented fasta files. By default looks for CATG prefix and prints the 21 characters following it
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import re | |
| import argparse | |
| # Argument Parser | |
| parser = argparse.ArgumentParser(description='Finds DGE tags from a multifasta') | |
| parser.add_argument('-site',nargs="?",const="CATG",default="CATG",help='site the enzyme cuts at, by default CATG') | |
| parser.add_argument('-length',nargs="?",type=int,const=21,default=21,help='length of tag to find, by default 21') | |
| parser.add_argument('file', type=argparse.FileType('r')) | |
| args = parser.parse_args() | |
| # define function to invert DNA sequences | |
| def my_reverse_complement(seq): | |
| return Seq(seq).reverse_complement() | |
| enzyme_site = args.site | |
| k_length = args.length | |
| multifasta_file = args.file | |
| # Make Regex that'll detect the tags | |
| regex_length = args.length - len(enzyme_site) | |
| regex_forward = r"" + enzyme_site + "[ATCG]{" + str(regex_length) +"}" | |
| multifasta_file = multifasta_file.read() | |
| fasta_seqs = multifasta_file.split(">") | |
| for seq in fasta_seqs: | |
| header = seq.split("\n")[0] | |
| header = header.strip() | |
| seq = "".join(seq.split("\n")[1:]) | |
| seq = seq.rstrip() | |
| print(">" + header) | |
| forward_matches = re.finditer(regex_forward, seq, re.MULTILINE) | |
| for matchNum, match in enumerate(forward_matches): | |
| matchNum = matchNum + 1 | |
| print(match.group()) | |
| print(match.group()) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment