Skip to content

Instantly share code, notes, and snippets.

@fedarko
Created May 22, 2025 01:13
Show Gist options
  • Select an option

  • Save fedarko/f209ae6ebdb5eb248795e84af557c6f0 to your computer and use it in GitHub Desktop.

Select an option

Save fedarko/f209ae6ebdb5eb248795e84af557c6f0 to your computer and use it in GitHub Desktop.
Compute all possible variations of a primer sequence
# This code makes it easy to do exact matching of a primer sequence to e.g.
# a 16S rRNA gene sequence -- you can use explode() on the primer sequence
# to get all possible variations of this primer sequence, and then just
# search through your gene to find exact matches to any of these primers.
#
# I know there are real tools that can do this faster (e.g. Primer Prospector),
# but I just needed something quick and dirty.
# Some example sequences with degenerate nucleotides: the EMP 16S primers,
# per https://earthmicrobiome.org/protocols-and-standards/16s/
fwdprimer = "GTGYCAGCMGCCGCGGTAA"
revprimer = "GGACTACNVGGGTWTCTAAT"
def get_poss_nts(degen):
"""Returns all possible A/C/G/T nucleotides represented by a degenerate nucleotide.
References
----------
Based on https://www.bioinformatics.org/sms/iupac.html, as of April 2025.
"""
if degen == "N":
# we check N first because it is the most common of these in practice
return "ACGT"
elif degen == "R":
return "AG"
elif degen == "Y":
return "CT"
elif degen == "S":
return "GC"
elif degen == "W":
return "AT"
elif degen == "K":
return "GT"
elif degen == "M":
return "AC"
elif degen == "B":
return "CGT"
elif degen == "D":
return "AGT"
elif degen == "H":
return "ACT"
elif degen == "V":
return "ACG"
else:
raise ValueError(f"Character {degen} not in IUPAC code?")
def get_degens(primer):
"""Returns a list of (0-indexed position, degenerate nucleotide) pairs.
Example
-------
>>> get_degens("GTGYCAGCMGCCGCGGTAA")
[(3, 'Y'), (8, 'M')]
"""
degens = []
for i, c in enumerate(primer):
if c not in "ACGT":
degens.append((i, c))
return degens
def get_all_position_variations(primer):
"""Returns a list of lists describing all possible nucleotides at
all positions with degenerate nucleotides in a sequence.
Example
-------
>>> get_all_position_variations("GTGYCAGCMGCCGCGGTAA")
[[(3, 'C'), (3, 'T')], [(8, 'A'), (8, 'C')]]
"""
degens = get_degens(primer)
variant_positions = []
for (dpos, dnt) in degens:
poss = get_poss_nts(dnt)
possibilities = [(dpos, n) for n in poss]
variant_positions.append(possibilities)
return variant_positions
def explode(primer):
"""Returns a set of all possible sequences matching a
sequence's degenerate nucleotides.
Example
-------
>>> explode("GTGYCAGCMGCCGCGGTAA")
{'GTGCCAGCAGCCGCGGTAA',
'GTGCCAGCCGCCGCGGTAA',
'GTGTCAGCAGCCGCGGTAA',
'GTGTCAGCCGCCGCGGTAA'}
>>> explode("WWW")
{'AAA', 'AAT', 'ATA', 'ATT', 'TAA', 'TAT', 'TTA', 'TTT'}
"""
primers = set()
vp = get_all_position_variations(primer)
for variant_set in itertools.product(*vp):
p = primer[:]
for (vi, vn) in variant_set:
# python doesn't support in-place string updates so we gotta do this jank
p = p[:vi] + vn + p[vi + 1:]
primers.add(p)
return primers
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment