Skip to content

Instantly share code, notes, and snippets.

@etal
Created February 17, 2016 18:42
Show Gist options
  • Select an option

  • Save etal/79ba8448a7cf4412762d to your computer and use it in GitHub Desktop.

Select an option

Save etal/79ba8448a7cf4412762d to your computer and use it in GitHub Desktop.
Flatten a pooled CNVkit reference (hg19)
#!/usr/bin/env python
from __future__ import print_function
import sys
import numpy as np
import pandas as pd
import cnvlib
infname = sys.argv[1]
arr = cnvlib.read(infname)
len_orig = len(arr)
# Exclude HLA region (chr6:28866528-33775446)
d = arr.data
d_6 = d[d['chromosome'] == 'chr6']
d_hla = d_6[(d_6['start'] >= 28866528) & (d_6['end'] <= 33775446)]
if len(d_hla):
d = pd.concat([d.iloc[0:d_hla.index[0]],
d.iloc[d_hla.index[-1]+1:len(d)]])
# Drop low-coverage bins
d = d[d['log2'] > -5]
arr = arr.as_dataframe(d)
# Reset log2 normalized coverages to the flat expectation
arr['log2'] = pd.Series(arr.expect_flat_cvg(is_male_reference=True)
).astype('string')
# Formatting
arr['rmask'] = arr['rmask'].astype('string')
arr['spread'] = arr['spread'].astype('string')
# Output
base, ext = infname.rsplit('.', 1)
outfname = base + '-flat.' + ext
arr.write(outfname)
print("Dropped", len_orig - len(arr), "of", len(arr), "bins")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment