Created
May 22, 2025 01:13
-
-
Save fedarko/f209ae6ebdb5eb248795e84af557c6f0 to your computer and use it in GitHub Desktop.
Compute all possible variations of a primer sequence
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # This code makes it easy to do exact matching of a primer sequence to e.g. | |
| # a 16S rRNA gene sequence -- you can use explode() on the primer sequence | |
| # to get all possible variations of this primer sequence, and then just | |
| # search through your gene to find exact matches to any of these primers. | |
| # | |
| # I know there are real tools that can do this faster (e.g. Primer Prospector), | |
| # but I just needed something quick and dirty. | |
| # Some example sequences with degenerate nucleotides: the EMP 16S primers, | |
| # per https://earthmicrobiome.org/protocols-and-standards/16s/ | |
| fwdprimer = "GTGYCAGCMGCCGCGGTAA" | |
| revprimer = "GGACTACNVGGGTWTCTAAT" | |
| def get_poss_nts(degen): | |
| """Returns all possible A/C/G/T nucleotides represented by a degenerate nucleotide. | |
| References | |
| ---------- | |
| Based on https://www.bioinformatics.org/sms/iupac.html, as of April 2025. | |
| """ | |
| if degen == "N": | |
| # we check N first because it is the most common of these in practice | |
| return "ACGT" | |
| elif degen == "R": | |
| return "AG" | |
| elif degen == "Y": | |
| return "CT" | |
| elif degen == "S": | |
| return "GC" | |
| elif degen == "W": | |
| return "AT" | |
| elif degen == "K": | |
| return "GT" | |
| elif degen == "M": | |
| return "AC" | |
| elif degen == "B": | |
| return "CGT" | |
| elif degen == "D": | |
| return "AGT" | |
| elif degen == "H": | |
| return "ACT" | |
| elif degen == "V": | |
| return "ACG" | |
| else: | |
| raise ValueError(f"Character {degen} not in IUPAC code?") | |
| def get_degens(primer): | |
| """Returns a list of (0-indexed position, degenerate nucleotide) pairs. | |
| Example | |
| ------- | |
| >>> get_degens("GTGYCAGCMGCCGCGGTAA") | |
| [(3, 'Y'), (8, 'M')] | |
| """ | |
| degens = [] | |
| for i, c in enumerate(primer): | |
| if c not in "ACGT": | |
| degens.append((i, c)) | |
| return degens | |
| def get_all_position_variations(primer): | |
| """Returns a list of lists describing all possible nucleotides at | |
| all positions with degenerate nucleotides in a sequence. | |
| Example | |
| ------- | |
| >>> get_all_position_variations("GTGYCAGCMGCCGCGGTAA") | |
| [[(3, 'C'), (3, 'T')], [(8, 'A'), (8, 'C')]] | |
| """ | |
| degens = get_degens(primer) | |
| variant_positions = [] | |
| for (dpos, dnt) in degens: | |
| poss = get_poss_nts(dnt) | |
| possibilities = [(dpos, n) for n in poss] | |
| variant_positions.append(possibilities) | |
| return variant_positions | |
| def explode(primer): | |
| """Returns a set of all possible sequences matching a | |
| sequence's degenerate nucleotides. | |
| Example | |
| ------- | |
| >>> explode("GTGYCAGCMGCCGCGGTAA") | |
| {'GTGCCAGCAGCCGCGGTAA', | |
| 'GTGCCAGCCGCCGCGGTAA', | |
| 'GTGTCAGCAGCCGCGGTAA', | |
| 'GTGTCAGCCGCCGCGGTAA'} | |
| >>> explode("WWW") | |
| {'AAA', 'AAT', 'ATA', 'ATT', 'TAA', 'TAT', 'TTA', 'TTT'} | |
| """ | |
| primers = set() | |
| vp = get_all_position_variations(primer) | |
| for variant_set in itertools.product(*vp): | |
| p = primer[:] | |
| for (vi, vn) in variant_set: | |
| # python doesn't support in-place string updates so we gotta do this jank | |
| p = p[:vi] + vn + p[vi + 1:] | |
| primers.add(p) | |
| return primers |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment