fedarko/explode_primer.py

## explode_primer.py
# This code makes it easy to do exact matching of a primer sequence to e.g.
# a 16S rRNA gene sequence -- you can use explode() on the primer sequence
# to get all possible variations of this primer sequence, and then just
# search through your gene to find exact matches to any of these primers.
#
# I know there are real tools that can do this faster (e.g. Primer Prospector),
# but I just needed something quick and dirty.

# Some example sequences with degenerate nucleotides: the EMP 16S primers,
# per https://earthmicrobiome.org/protocols-and-standards/16s/
fwdprimer = "GTGYCAGCMGCCGCGGTAA"
revprimer = "GGACTACNVGGGTWTCTAAT"

def get_poss_nts(degen):
    """Returns all possible A/C/G/T nucleotides represented by a degenerate nucleotide.

    References
    ----------
    Based on https://www.bioinformatics.org/sms/iupac.html, as of April 2025.
    """
    if degen == "N":
        # we check N first because it is the most common of these in practice
        return "ACGT"
    elif degen == "R":
        return "AG"
    elif degen == "Y":
        return "CT"
    elif degen == "S":
        return "GC"
    elif degen == "W":
        return "AT"
    elif degen == "K":
        return "GT"
    elif degen == "M":
        return "AC"
    elif degen == "B":
        return "CGT"
    elif degen == "D":
        return "AGT"
    elif degen == "H":
        return "ACT"
    elif degen == "V":
        return "ACG"
    else:
        raise ValueError(f"Character {degen} not in IUPAC code?")

def get_degens(primer):
    """Returns a list of (0-indexed position, degenerate nucleotide) pairs.

    Example
    -------
    >>> get_degens("GTGYCAGCMGCCGCGGTAA")
    [(3, 'Y'), (8, 'M')]
    """
    degens = []
    for i, c in enumerate(primer):
        if c not in "ACGT":
            degens.append((i, c))
    return degens

def get_all_position_variations(primer):
    """Returns a list of lists describing all possible nucleotides at
    all positions with degenerate nucleotides in a sequence.

    Example
    -------
    >>> get_all_position_variations("GTGYCAGCMGCCGCGGTAA")
    [[(3, 'C'), (3, 'T')], [(8, 'A'), (8, 'C')]]
    """
    degens = get_degens(primer)
    variant_positions = []
    for (dpos, dnt) in degens:
        poss = get_poss_nts(dnt)
        possibilities = [(dpos, n) for n in poss]
        variant_positions.append(possibilities)
    return variant_positions

def explode(primer):
    """Returns a set of all possible sequences matching a
    sequence's degenerate nucleotides.

    Example
    -------
    >>> explode("GTGYCAGCMGCCGCGGTAA")
    {'GTGCCAGCAGCCGCGGTAA',
     'GTGCCAGCCGCCGCGGTAA',
     'GTGTCAGCAGCCGCGGTAA',
     'GTGTCAGCCGCCGCGGTAA'}
    >>> explode("WWW")
    {'AAA', 'AAT', 'ATA', 'ATT', 'TAA', 'TAT', 'TTA', 'TTT'}
    """
    primers = set()
    vp = get_all_position_variations(primer)
    for variant_set in itertools.product(*vp):
        p = primer[:]
        for (vi, vn) in variant_set:
            # python doesn't support in-place string updates so we gotta do this jank
            p = p[:vi] + vn + p[vi + 1:]
        primers.add(p)
    return primers
	# This code makes it easy to do exact matching of a primer sequence to e.g.
	# a 16S rRNA gene sequence -- you can use explode() on the primer sequence
	# to get all possible variations of this primer sequence, and then just
	# search through your gene to find exact matches to any of these primers.
	#
	# I know there are real tools that can do this faster (e.g. Primer Prospector),
	# but I just needed something quick and dirty.

	# Some example sequences with degenerate nucleotides: the EMP 16S primers,
	# per https://earthmicrobiome.org/protocols-and-standards/16s/
	fwdprimer = "GTGYCAGCMGCCGCGGTAA"
	revprimer = "GGACTACNVGGGTWTCTAAT"

	def get_poss_nts(degen):
	"""Returns all possible A/C/G/T nucleotides represented by a degenerate nucleotide.

	References
	----------
	Based on https://www.bioinformatics.org/sms/iupac.html, as of April 2025.
	"""
	if degen == "N":
	# we check N first because it is the most common of these in practice
	return "ACGT"
	elif degen == "R":
	return "AG"
	elif degen == "Y":
	return "CT"
	elif degen == "S":
	return "GC"
	elif degen == "W":
	return "AT"
	elif degen == "K":
	return "GT"
	elif degen == "M":
	return "AC"
	elif degen == "B":
	return "CGT"
	elif degen == "D":
	return "AGT"
	elif degen == "H":
	return "ACT"
	elif degen == "V":
	return "ACG"
	else:
	raise ValueError(f"Character {degen} not in IUPAC code?")

	def get_degens(primer):
	"""Returns a list of (0-indexed position, degenerate nucleotide) pairs.

	Example
	-------
	>>> get_degens("GTGYCAGCMGCCGCGGTAA")
	[(3, 'Y'), (8, 'M')]
	"""
	degens = []
	for i, c in enumerate(primer):
	if c not in "ACGT":
	degens.append((i, c))
	return degens

	def get_all_position_variations(primer):
	"""Returns a list of lists describing all possible nucleotides at
	all positions with degenerate nucleotides in a sequence.

	Example
	-------
	>>> get_all_position_variations("GTGYCAGCMGCCGCGGTAA")
	[[(3, 'C'), (3, 'T')], [(8, 'A'), (8, 'C')]]
	"""
	degens = get_degens(primer)
	variant_positions = []
	for (dpos, dnt) in degens:
	poss = get_poss_nts(dnt)
	possibilities = [(dpos, n) for n in poss]
	variant_positions.append(possibilities)
	return variant_positions

	def explode(primer):
	"""Returns a set of all possible sequences matching a
	sequence's degenerate nucleotides.

	Example
	-------
	>>> explode("GTGYCAGCMGCCGCGGTAA")
	{'GTGCCAGCAGCCGCGGTAA',
	'GTGCCAGCCGCCGCGGTAA',
	'GTGTCAGCAGCCGCGGTAA',
	'GTGTCAGCCGCCGCGGTAA'}
	>>> explode("WWW")
	{'AAA', 'AAT', 'ATA', 'ATT', 'TAA', 'TAT', 'TTA', 'TTT'}
	"""
	primers = set()
	vp = get_all_position_variations(primer)
	for variant_set in itertools.product(*vp):
	p = primer[:]
	for (vi, vn) in variant_set:
	# python doesn't support in-place string updates so we gotta do this jank
	p = p[:vi] + vn + p[vi + 1:]
	primers.add(p)
	return primers
No results found