Skip to content

Instantly share code, notes, and snippets.

@etal
Created July 20, 2016 19:11
Show Gist options
  • Select an option

  • Save etal/74440a849e273a5242fa1dd92377277c to your computer and use it in GitHub Desktop.

Select an option

Save etal/74440a849e273a5242fa1dd92377277c to your computer and use it in GitHub Desktop.
Combine nearby SNVs into a single MNV
"""Find adjacent SNPs/MNPs in a VCF, and look in the BAM file to see if these
should be joined into a larger variant.
This script can only handle up to 2 internal compound ALT alleles for every
stretch of adjacent SNPs. This script is not designed to test all possible
combinations of internal joined SNPs for SNP stretches longer than 3.
This script also cannot handle multiallelic variants or indels.
"ALERT" will be added to VCF INFO columns for cases that are too complex.
"""
from __future__ import division
import copy
import operator
import subprocess
import sys
# We're splitting multiallelics now so should not find any multiallelic VCF lines anymore.
# If the adjacent variants contain some multiallelics, should add alert.
# If the adjacent variants are only multiallelics, should not add an alert.
def checkMultiallelic(variants):
seenPos = []
countSplit = 0
countComma = 0
for var in variants:
if not [var[0], var[1]] in seenPos:
seenPos.append([var[0], var[1]])
if 'SPLITMULTIALLELIC' in var[7]:
countSplit += 1
elif ',' in var[4]:
countComma += 1
if countSplit == len(variants) and len(seenPos) == 1:
# This set of variants came from a single original multiallelic variant. don't add ALERT to this set.
multiallelic = 'All'
elif countSplit > 0:
# Not everything in this set came from a single original multiallelic variant.
multiallelic = 'Some'
elif countComma > 0:
multiallelic = 'Some'
else:
multiallelic = 'None'
return multiallelic
def checkIndels(variants):
indelsPresent = False
for var in variants:
for alt in var[4].split(','):
if len(var[3]) != len(alt):
indelsPresent = True
break
return indelsPresent
def getReference(chrom, start, end, ref2bit):
# For twoBit2Fa, coords need to be 0-based, left closed, right open.
fixStart = int(start)-1
fixEnd = int(end)-1
coords = "{}:{}-{}".format(chrom, fixStart, fixEnd)
step_get_ref = subprocess.Popen(['twoBitToFa', '-noMask', ref2bit + ":" + coords, 'stdout'],
stdout=subprocess.PIPE)
step_remove_header = subprocess.Popen(['grep', '-v', '>'],
stdin=step_get_ref.stdout,
stdout=subprocess.PIPE)
step_get_ref.stdout.close()
# Stuff from twoBitToFa should only ever be a few bases, since we are
# looking for things that are "adjacent" so there shouldn't be multiple
# lines in output, but set this up so that multiple lines won't break
# things.
ref = "".join(step_remove_header.communicate()[0].rstrip('\n\r').splitlines())
print >> sys.stderr, "Intervening genomic sequence: %s" % ref
return ref
def getJoinedAlleles(variants, ref2bit):
# Initialize joined alleles
refJoined = variants[0][3]
altJoined = variants[0][4]
for i in range(1, len(variants)):
# Is there space between this and the previous variant?
# If there is, add it to the joined alleles.
currChrom = variants[i][0]
currPos = int(variants[i][1])
prevPos = int(variants[i-1][1])
prevLength = len(variants[i-1][3])
prevEnd = prevPos + prevLength - 1
if currPos - prevEnd > 1:
intervening_genomic_seq = getReference(currChrom, prevEnd+1,
currPos, ref2bit)
refJoined += intervening_genomic_seq
altJoined += intervening_genomic_seq
refJoined += variants[i][3]
altJoined += variants[i][4]
return refJoined, altJoined
def correctVcfLine(variants, newRef, newAlt):
if len(variants) > 1:
correctedLine = copy.copy(variants[0])
correctedLine[3] = newRef
correctedLine[4] = newAlt
correctedLine[7] = correctedLine[7].replace('SNP', 'MNP').replace('snp', 'mnp') + ';JOINED'
else:
correctedLine = variants[0]
return correctedLine
def alertSet(variants):
alertVars = []
for var in variants:
alertLine = copy.copy(var)
alertLine[7] += ";ALERT"
alertVars.append("\t".join(map(str, alertLine)))
return alertVars
def examineAdjacent(adjacentVar, inBam, ref2bit):
# Minimum number of reads to support an allele
min_support = 5
print >> sys.stderr, "Examining adjacent variants:\n%s" % "\n".join(map(str, adjacentVar))
correctedVars = []
if len(adjacentVar) > 3:
print >> sys.stderr, "WARNING: This is a stretch of %s adjacent variants. This script is not designed to test all possible combinations of internal joined SNPs for stretches longer than 3." % len(adjacentVar)
if checkIndels(adjacentVar) is True:
print >> sys.stderr, "WARNING: Variant stretch includes indels."
correctedVars = alertSet(adjacentVar)
if checkMultiallelic(adjacentVar) == 'Some':
print >> sys.stderr, "WARNING: Variant stretch includes multiallelic variants."
correctedVars = alertSet(adjacentVar)
elif checkMultiallelic(adjacentVar) == 'All':
print >> sys.stderr, "WARNING: This variant set comes from a single original multiallelic variant and does not represent true adjacent SNPs/MNPs. Leaving variants as is."
# correctedVars might have already been set if there are indels, so need to reset.
correctedVars = []
for var in adjacentVar:
correctedVars.append("\t".join(map(str, var)))
if len(correctedVars) == 0:
refJoined, altJoined = getJoinedAlleles(adjacentVar, ref2bit)
print >> sys.stderr, "Joined REF allele is %s" % refJoined
print >> sys.stderr, "Joined ALT allele is %s" % altJoined
# coordinates of joined region
chrom = adjacentVar[0][0]
start = int(adjacentVar[0][1])
end = int(adjacentVar[-1][1]) + len(adjacentVar[-1][4]) - 1
coords = chrom + ":" + str(start) + "-" + str(end)
# get counts of alleles from BAM file.
observed_alleles = {}
print >> sys.stderr, "Samtools view -f 2 -F 1792 %s" % coords
# don't count duplicates.
step_samtools = subprocess.Popen(['samtools', 'view', '-f', '2', '-F', '1792', inBam, coords], stdout=subprocess.PIPE)
step_cut = subprocess.Popen(['cut', '-f', '4,6,10'], stdin=step_samtools.stdout, stdout=subprocess.PIPE)
step_samtools.stdout.close()
bamData=step_cut.communicate()[0].rstrip('\n\r').split('\n')
if len(bamData[0]) > 0:
print >> sys.stderr, "Total reads at this position: %s" % len(bamData)
fullMatchCount = 0
spanFullLocusCount = 0
for read in iter(bamData):
pos, cigar, seq = read.split("\t")
fullMatchCigar = str(len(seq)) + "M"
if cigar == fullMatchCigar:
fullMatchCount += 1
# VCF is 1-based but python is 0-based
left = start - int(pos)
right = end - int(pos) + 1
allele = seq[left:right]
if len(allele) == len(altJoined):
spanFullLocusCount += 1
if allele in observed_alleles:
observed_alleles[allele] += 1
else:
observed_alleles[allele] = 1
print >> sys.stderr, "Reads with full match CIGAR (%s): %s" % (fullMatchCigar, fullMatchCount)
print >> sys.stderr, "Reads with full match CIGAR that span the full locus: %s (%.3f%% of all reads)" % (spanFullLocusCount, 100.0*spanFullLocusCount/len(bamData))
# Sort alleles by abundance. This produces a list of duples. For every duple, the first element is the allele
# and the second is the count.
observed_alleles_sorted = sorted(observed_alleles.items(), key=operator.itemgetter(1), reverse=True)
# Require at least 40% usable reads
if (len(observed_alleles.keys()) >= 1
and observed_alleles_sorted[0][1] >= min_support
and 1.0*spanFullLocusCount/len(bamData) >= 0.4):
# For now, script handles at most 2 major alleles
major_alleles = [observed_alleles_sorted[0][0]]
# major_allele_read_counts = int(observed_alleles_sorted[0][1])
if (len(observed_alleles_sorted) > 1
and (observed_alleles_sorted[1][1] >= min_support
or observed_alleles_sorted[1][1] >= 0.6*observed_alleles_sorted[0][1])):
major_alleles.append(observed_alleles_sorted[1][0])
# major_allele_read_counts += int(observed_alleles_sorted[1][1])
# Convert each major allele into a binary array showing wheter it
# matches REF or ALT at each variant position
abBinary = []
for abAllele in major_alleles:
convBin = []
for k in range(len(refJoined)):
if refJoined[k:k+1] == altJoined[k:k+1]:
# This is some intervening genomic sequence between the adjacent SNPs.
continue
elif abAllele[k:k+1] == refJoined[k:k+1]:
convBin.append(0)
else:
convBin.append(1)
abBinary.append(convBin)
print >> sys.stderr, "Major alleles: %s" % major_alleles
print >> sys.stderr, "Major alleles binary arrays: %s" % abBinary
# How many reads aren't from the major allele(s)/the reference allele?
# Only look at variant positions, not intervening genomic sequence
nonref_minor_allele_read_counts = 0
for y in range(len(major_alleles), len(observed_alleles_sorted)):
currAllele = observed_alleles_sorted[y][0]
nonRef = 0
for k in range(len(refJoined)):
if refJoined[k:k+1] == altJoined[k:k+1]:
# This is some intervening genomic sequence between the adjacent SNPs.
continue
elif refJoined[k:k+1] == currAllele[k:k+1]:
continue
else:
nonRef+=1
if nonRef > 0:
nonref_minor_allele_read_counts += int(observed_alleles_sorted[y][1])
nonref_minor_allele_percent = 100*float(nonref_minor_allele_read_counts)/float(spanFullLocusCount)
for obs in observed_alleles_sorted:
print >> sys.stderr, "Found allele %s, count %s" % (obs[0], obs[1])
print >> sys.stderr, "Reads from non-reference minor alleles are %.3f%% of total" % nonref_minor_allele_percent
if nonref_minor_allele_percent <= 5:
if ((len(major_alleles) == 2 and altJoined in major_alleles and refJoined in major_alleles)
or (len(major_alleles) == 1 and altJoined in major_alleles)
or (len(major_alleles) == 2 and [0]*len(adjacentVar) in abBinary and [1]*len(adjacentVar) in abBinary)
or (len(major_alleles) == 1 and [1]*len(adjacentVar) in abBinary)):
# Join the variants
print >> sys.stderr, "Joining variants:\n%s" % "\n".join(map(str, adjacentVar))
joinedVars = correctVcfLine(adjacentVar, refJoined, altJoined)
correctedVars.append("\t".join(map(str, joinedVars)))
else:
if len(adjacentVar) == 2 or len(major_alleles) == 1:
print >> sys.stderr, "Data does not support joining variants. Leaving them separate"
for origVar in adjacentVar:
correctedVars.append("\t".join(map(str, origVar)))
else:
# Convert major allele sequence to 0,1 for ref/alt.
print >> sys.stderr, "Full combined variant %s not found. Looking at internal subsets" % altJoined
# Check that at every position, one allele is REF and the other is ALT.
# Sum of abBinary[0][q] and abBinary[1][q] should be 1 for all q.
if all(abbin0 + abbin1 == 1
for abbin0, abbin1 in zip(abBinary[0], abBinary[1])):
# Sort orig SNPs into two sets to join.
first_set_to_join = []
second_set_to_join = []
# This puts the allele with the ALT SNP at first position first in list.
# Use this to sort orig SNPs into alleles.
abBinary.sort(reverse=True)
alleleMap = abBinary[0]
for z, amz in enumerate(alleleMap):
if amz == 1:
first_set_to_join.append(adjacentVar[z])
else:
second_set_to_join.append(adjacentVar[z])
print >> sys.stderr, "Joining variants."
print >> sys.stderr, "Set1:\n%s" % "\n".join(map(str, first_set_to_join))
refJoinedSet1, altJoinedSet1 = getJoinedAlleles(first_set_to_join, ref2bit)
joinedVarsSet1 = correctVcfLine(first_set_to_join, refJoinedSet1, altJoinedSet1)
print >> sys.stderr, "Set2:\n%s" % "\n".join(map(str, second_set_to_join))
refJoinedSet2, altJoinedSet2 = getJoinedAlleles(second_set_to_join, ref2bit)
joinedVarsSet2 = correctVcfLine(second_set_to_join, refJoinedSet2, altJoinedSet2)
correctedVars.append("\t".join(map(str, joinedVarsSet1)))
correctedVars.append("\t".join(map(str, joinedVarsSet2)))
else:
# this case might happen if one or more (but not all) SNPs is homozygous ALT, in which case that SNP would segregate with all other variants.
print >> sys.stderr, "WARNING: Two major alleles are not mutually exclusive. This may be a multiallelic locus."
correctedVars = alertSet(adjacentVar)
else:
if len(adjacentVar) == 2:
print >> sys.stderr, "Data does not support joining variants. Leaving them separate"
for origVar in adjacentVar:
correctedVars.append("\t".join(map(str, origVar)))
else:
print >> sys.stderr, "WARNING: There may be more than two internal subsets of variants."
correctedVars = alertSet(adjacentVar)
else:
print >> sys.stderr, "WARNING: Not enough reads to process variants. Poor mapping region."
# consider: don't print ALERT here? If there aren't enough reads this is just a bad mapping region, and looking in the BAM file won't help.
correctedVars = alertSet(adjacentVar)
return correctedVars
def peek_line(f):
currplace = f.tell()
nextline = f.readline()
f.seek(currplace)
return nextline
def main(distance, inFile, inBam, ref2bit, outFile):
adjacentVar = []
for line in inFile:
line = line.rstrip('\n\r')
if line.startswith("##INFO=<ID=VT,"):
# Rewrite the VT header tag
print >> outFile, '##INFO=<ID=VT,Number=1,Type=String,Description="Variant type, can be SNP, MNP, INS or DEL">'
elif line.startswith("##"):
# Copy the rest of the VCF header
print >> outFile, line
elif line.startswith("#"):
# Add custom tags to the end of the VCF header, just before #CHROM...
print >> outFile, '##joinAdjacentSNPs_command=%s' % " ".join(map(str, sys.argv))
print >> outFile, '##INFO=<ID=JOINED,Number=0,Type=Flag,Description="Variant is the result of joining adjacent SNPs/MNPs. Please use caution when interpreting sample-specific values.">'
print >> outFile, '##INFO=<ID=ALERT,Number=0,Type=Flag,Description="Variant is adjacent to other variants, but case was too complex for joinAdjacentSNPs to handle. Please review.">'
print >> outFile, line
else:
fields = line.split('\t')
chrom = fields[0]
pos = int(fields[1])
end = pos + len(fields[3]) - 1
# Check the next line to see if the variant is adjacent.
nextLine = peek_line(inFile)
if nextLine:
nextFields = nextLine.rstrip('\n\r').split('\t')
nextChrom = nextFields[0]
nextPos = int(nextFields[1])
if chrom == nextChrom and end >= nextPos - distance:
# This is adjacent to the next variant
if len(adjacentVar) == 0:
print >> sys.stderr, "----- Found adjacent variants -----"
adjacentVar.append(fields)
adjacentVar.append(nextFields)
else:
if len(adjacentVar) == 0:
print >> outFile, line
else:
# Deal with adjacent variants
correctedVars = examineAdjacent(adjacentVar, inBam, ref2bit)
print >> outFile, "\n".join(map(str, correctedVars))
# Reset
adjacentVar = []
else:
# EOF
if len(adjacentVar) == 0:
print >> outFile, line
else:
# Deal with adjacent variants
correctedVars = examineAdjacent(adjacentVar, inBam, ref2bit)
print >> outFile, "\n".join(map(str, correctedVars))
if __name__ == '__main__':
import argparse
AP = argparse.ArgumentParser(description=__doc__)
AP.add_argument('distance', type=int,
help="""Distance in bp between SNVs/MNVs to be considered
'adjacent'. In other words, POS2-POS1 must be <= [distance]
for the SNPs to be adjacent.""")
AP.add_argument('bam',
help="""BAM file of mapped reads from which the VCF calls
were made.""")
AP.add_argument('ref2bit',
help="Reference genome in UCSC 2bit format.")
AP.add_argument('-v', '--vcf',
type=argparse.FileType('r'), default=sys.stdin,
help="Variants in VCF format, sorted by genomic position.")
AP.add_argument('-o', '--output',
type=argparse.FileType('w'), default=sys.stdout,
help="Output VCF filename. [Default: standard output]")
args = AP.parse_args()
main(args.distance, args.vcf, args.bam, args.ref2bit, args.output)
@bwubb
Copy link

bwubb commented Sep 8, 2016

I am not sure how you are able to run this. I receive errors for mixing up f.next(), which is used in the 'for loop' file iterator, and the f.readline() used in peek_line(). If I changed the readline() to a next(), tell() and seek() were not behaving correctly; for which I have no explanation.

To overcome all of this I changed line 301/302

while True:
    line = inFile.readline()

and inserted
break
right after line 345.

This allowed me to keep f.readline() in peek_line.

@r-keshar
Copy link

Hi Eric,
Thank you for making such a nice script and it works well in most cases but something i really don't understand very well as i m not very good at python. Can you please explain the two cases which seems similar case but one have fixed SNP by the script and another does not.

###################

Case 1. Not worked

##################
----- Found adjacent variants -----
Examining adjacent variants:
['chr19', '8909477', 'rs75217292', 'C', 'T', '.', 'PASS', 'ECNT=2;HCNT=27;MAX_ED=1;MIN_ED=1;NLOD=148.70;TLOD=999.51;DP_HQ=945;ALT_AF=51.0;CALLER=GATK', 'GT:AD:AF:ALT_F1R2:ALT_F2R1:FOXOG:QSS:REF_F1R2:REF_F2R1', '0/1:464,481:0.500:248:233:0.484:13151,15155:235:229\n']

['chr19', '8909478', 'rs11882881', 'A', 'G', '.', 'PASS', 'ECNT=2;HCNT=56;MAX_ED=1;MIN_ED=1;NLOD=147.07;TLOD=2614.67;DP_HQ=933;ALT_AF=100.0;CALLER=GATK', 'GT:AD:AF:ALT_F1R2:ALT_F2R1:FOXOG:QSS:REF_F1R2:REF_F2R1', '0/1:1,932:1.00:478:454:0.487:4,28683:0:1']

Joined REF allele is CA
Joined ALT allele is TG
Samtools view -f 2 -F 1792 chr19:8909477-8909478
Total reads at this position: 1072
Reads with full match CIGAR (147M): 859
Reads with full match CIGAR that span the full locus: 845 (78.825% of all reads)
Major alleles: ['CG', 'TG']
Major alleles binary arrays: [[0, 1], [1, 1]]

Found allele CG, count 423
Found allele TG, count 421
Found allele CA, count 1
Reads from non-reference minor alleles are 0.000% of total
Data does not support joining variants. Leaving them separate

############################

Case 1. Worked in similar case

############################

----- Found adjacent variants -----
Examining adjacent variants:
['chr6', '29945414', 'rs2975055', 'T', 'C', '.', 'alt_allele_in_normal;clustered_events', 'ECNT=6;HCNT=24;MAX_ED=94;MIN_ED=37;NLOD=18.02;TLOD=95.79;DP_HQ=328;ALT_AF=10.0;CALLER=GATK', 'GT:AD:AF:ALT_F1R2:ALT_F2R1:FOXOG:PGT:PID:QSS:REF_F1R2:REF_F2R1', '0/1:295,33:0.126:20:13:0.606:0|1:29945394_G_A:9121,1002:143:152\n']

['chr6', '29945415', 'rs9260210', 'G', 'A', '.', 'alt_allele_in_normal;clustered_events', 'ECNT=6;HCNT=24;MAX_ED=94;MIN_ED=37;NLOD=17.32;TLOD=93.49;DP_HQ=328;ALT_AF=10.0;CALLER=GATK', 'GT:AD:AF:ALT_F1R2:ALT_F2R1:FOXOG:PGT:PID:QSS:REF_F1R2:REF_F2R1', '0/1:295,33:0.067:20:13:0.606:0|1:29945394_G_A:9634,1049:143:152']

Joined REF allele is TG
Joined ALT allele is CA
Samtools view -f 2 -F 1792 chr6:29945414-29945415
Total reads at this position: 416
Reads with full match CIGAR (151M): 286
Reads with full match CIGAR that span the full locus: 278 (66.827% of all reads)

Major alleles: ['TG', 'CA']
Major alleles binary arrays: [[0, 0], [1, 1]]
Found allele TG, count 252
Found allele CA, count 26
Reads from non-reference minor alleles are 0.000% of total

Joining variants:
['chr6', '29945414', 'rs2975055', 'T', 'C', '.', 'alt_allele_in_normal;clustered_events', 'ECNT=6;HCNT=24;MAX_ED=94;MIN_ED=37;NLOD=18.02;TLOD=95.79;DP_HQ=328;ALT_AF=10.0;CALLER=GATK', 'GT:AD:AF:ALT_F1R2:ALT_F2R1:FOXOG:PGT:PID:QSS:REF_F1R2:REF_F2R1', '0/1:295,33:0.126:20:13:0.606:0|1:29945394_G_A:9121,1002:143:152\n']
['chr6', '29945415', 'rs9260210', 'G', 'A', '.', 'alt_allele_in_normal;clustered_events', 'ECNT=6;HCNT=24;MAX_ED=94;MIN_ED=37;NLOD=17.32;TLOD=93.49;DP_HQ=328;ALT_AF=10.0;CALLER=GATK', 'GT:AD:AF:ALT_F1R2:ALT_F2R1:FOXOG:PGT:PID:QSS:REF_F1R2:REF_F2R1', '0/1:295,33:0.067:20:13:0.606:0|1:29945394_G_A:9634,1049:143:152']

Regards,
r-keshar

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment