Last active
July 14, 2022 17:28
-
-
Save cory-weller/3e97a95157e2600d6e5dd0335c7680b7 to your computer and use it in GitHub Desktop.
finds a header within a given fasta file, then prints the desired nucleotide ranges to STDOUT
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| '''finds a header within a given fasta file, then prints the desired nucleotide ranges to STDOUT''' | |
| import sys | |
| import argparse | |
| import regex as re | |
| def wrap_fasta(seq): | |
| '''wraps sequence every 80 characters with newlines''' | |
| return '\n'.join([seq[x:x+80] for x in range(0,len(seq),80)]) | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument('fasta', type=str, | |
| help=""" | |
| Filename for fasta file. Can be gzipped ('.gz') | |
| or uncompressed ('.fa', 'fsa', 'fasta') | |
| """ | |
| ) | |
| parser.add_argument('start', type=int, | |
| help=""" | |
| 1-indexed start position to begin with (inclusive) | |
| """ | |
| ) | |
| parser.add_argument('stop', type=int, | |
| help=""" | |
| 1-indexed stop position to end at (inclusive) | |
| """ | |
| ) | |
| parser.add_argument('--header-search', | |
| type=str, | |
| nargs='?', | |
| const=1, | |
| default=None, | |
| help=""" | |
| Pattern to search for in headers in order to determine which | |
| sequence to use for extracting the range | |
| """ | |
| ) | |
| parser.add_argument('--header-output', | |
| type=str, | |
| nargs='?', | |
| const=1, | |
| help=""" | |
| Name to include in output header, followed by <start>-<stop> | |
| """ | |
| ) | |
| args = parser.parse_args() | |
| if args.header_search is None: | |
| header_pattern = '>' | |
| else: | |
| header_pattern = args.header_search + r"[^IVX]+" | |
| header_pattern = re.compile(header_pattern) | |
| fasta_filename = args.fasta.split("/")[-1] | |
| out_header = re.split('.fasta|.fa|.fsa', fasta_filename)[0] | |
| out_header += '_' + args.header_search +':'+ str(args.start) + '-' + str(args.stop) | |
| out_seq = '' | |
| store_seq = False | |
| found_headers = [] | |
| with open(args.fasta, 'r', encoding='utf-8') as infile: | |
| for line in infile: | |
| if line.startswith('>'): | |
| if bool(re.search(header_pattern, line)): | |
| store_seq = True | |
| found_headers.append(line.strip()) | |
| else: | |
| store_seq = False | |
| if len(found_headers) > 1: | |
| print("Found multiple matching headers:") | |
| print("\n".join(found_headers)) | |
| print("Exiting") | |
| sys.exit() | |
| continue | |
| if store_seq is True: | |
| out_seq += line.strip() | |
| if len(found_headers) == 0: | |
| print("no matches found") | |
| sys.exit() | |
| if len(out_seq) == 0: | |
| print("seq is of length 0") | |
| sys.exit() | |
| wanted_region = out_seq[(args.start-1) : args.stop] | |
| if len(wanted_region) == 0: | |
| print("extracted region is of length 0") | |
| sys.exit() | |
| if args.header_output: | |
| print('>' + args.header_output) | |
| else: | |
| print('>' + out_header) | |
| print(wrap_fasta(wanted_region)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment