Skip to content

Instantly share code, notes, and snippets.

@steveandroulakis
Created September 22, 2016 01:54
Show Gist options
  • Select an option

  • Save steveandroulakis/b94b0dea225e623f42f9c5f0da9d8bbb to your computer and use it in GitHub Desktop.

Select an option

Save steveandroulakis/b94b0dea225e623f42f9c5f0da9d8bbb to your computer and use it in GitHub Desktop.
really rough bam processing pipeline
#!/usr/bin/python
import sys, getopt
import glob
import os
import ntpath
from time import strftime
# Start-of-run banner: pipeline name, launch timestamp, closing rule and a
# blank spacer, emitted as one write so the header stays together in a log.
print('\n'.join([
    '******bam_pipe******',
    strftime("%Y-%m-%d %H:%M:%S"),
    '********************',
    '\n',
]))
def sub_index_bam(bam_file):
    """Create an index for ``bam_file`` by running ``samtools index``.

    The command's stdout and then its stderr are echoed line by line.  A
    failure to start samtools, or a non-zero exit status, is reported on
    stdout instead of being raised so that a batch run can carry on with the
    next file.

    Args:
        bam_file: Path of the BAM file to index.

    Returns:
        The exit status of ``samtools index``, or 127 (the status a shell
        reports for a missing command) when samtools could not be started.
    """
    import subprocess

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters from being split or interpreted.
    command = ['samtools', 'index', bam_file]
    try:
        process = subprocess.Popen(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   universal_newlines=True)
    except OSError as error:
        print('ERROR could not run samtools: %s' % error)
        return 127

    # communicate() drains both pipes while waiting for the child; calling
    # wait() before reading can deadlock once the child fills a pipe buffer.
    out, err = process.communicate()
    for line in out.splitlines():
        print(line)
    for line in err.splitlines():
        print(line)

    if process.returncode != 0:
        print('ERROR samtools index exited with status %s for %s'
              % (process.returncode, bam_file))
    return process.returncode
#n = 10
#source = '/mnt/ceph/mbp/servers/bioinformatics-platform/home/steve/projects/aspree_bam_pipe/source'
#dest = '/mnt/ceph/mbp/servers/bioinformatics-platform/home/steve/projects/aspree_bam_pipe/dest'
def bam_pipe(source, dest, n):
    """Symlink every ``*.bam`` file in ``source`` into ``dest`` and index it.

    For each BAM file found directly inside ``source`` (in sorted order) a
    symlink of the same name is created in ``dest`` and ``sub_index_bam`` is
    run on that link.  A file whose name is already present in ``dest`` is
    skipped.  Progress and a final summary are printed to stdout.

    Args:
        source: Directory holding the BAM files.  A relative path is
            resolved against the current working directory.
        dest: Directory that receives the links.  A relative path is
            resolved against the current working directory.
        n: Maximum number of source files to examine, as an int or a numeric
            string, or None for no limit.  Skipped files count towards the
            limit.

    Raises:
        ValueError: If ``n`` is neither None nor convertible with ``int()``.
        OSError: If a symlink cannot be created in ``dest``.
    """
    source_path = os.path.abspath(source)
    dest_dir = os.path.abspath(dest)
    # Convert the limit once, so a bad value fails before any file is touched.
    limit = None if n is None else int(n)

    print('source path is: %s' % source)
    print('dest path is: %s' % dest)
    if limit is not None:
        print('files to process in source (up to): %s' % n)
    print('\n')

    skipped = 0
    processed = 0
    count = 0
    # Sorted so that repeated runs with a limit visit files in the same order.
    for name in sorted(glob.glob(os.path.join(source_path, '*.bam'))):
        if limit is not None and count == limit:
            print('ENDING run because file limit count reached: %s' % count)
            break

        basename = os.path.basename(name)
        print('*** bam file: %s ***' % basename)
        source_file = os.path.join(source_path, basename)
        dest_path = os.path.join(dest_dir, basename)

        # lexists() rather than exists(): a dangling link left in dest still
        # occupies the name, and os.symlink() would raise on it.
        if os.path.lexists(dest_path):
            skipped += 1
            print('SKIPPING processing of file because it exists in destination')
        else:
            processed += 1
            print('LINKING source file in destination location for processing')
            os.symlink(source_file, dest_path)
            print('PROCESSING bam file (creating INDEX)')
            sub_index_bam(dest_path)

        count += 1
        print('\n')

    print('\nDone. Files Processed: %s. Files Skipped: %s.' % (processed, skipped))
    return
def main(argv):
    """Parse command-line options and run the BAM pipeline.

    Recognised options:
        -h              print usage and exit
        -i, --ifile     source directory (required)
        -o, --ofile     destination directory (required)
        -n, --number    optional limit on the number of files

    Args:
        argv: Command-line arguments without the program name.

    Raises:
        SystemExit: After ``-h`` (no status), or with status 2 when an
            option is unknown, a required option is missing, or ``-n`` is
            not an integer.
    """
    usage = ('test.py -i <inputfile> -o <outputfile> '
             '(optional: -n <number of files> | tee -a logfile.txt)')
    usage_hint = 'optional argument eg. -n 10 (to limit to 10 files)'

    inputfile = ''
    outputfile = ''
    number = None

    try:
        opts, _ = getopt.getopt(argv, "hi:o:n:",
                                ["ifile=", "ofile=", "number="])
    except getopt.GetoptError:
        print(usage)
        print(usage_hint)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            print(usage_hint)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt in ("-n", "--number"):
            number = arg

    # Report every problem with the arguments before giving up, so the user
    # can fix them all in one go.
    fail = False
    if not inputfile:
        print('need input directory parameter (-i)')
        fail = True
    if not outputfile:
        print('need output directory parameter (-o)')
        fail = True
    if number is not None:
        try:
            int(number)
        except ValueError:
            print('number of files (-n) must be an integer, got: %s' % number)
            fail = True
    if fail:
        # Non-zero status so shells and schedulers can tell the run never started.
        sys.exit(2)

    bam_pipe(inputfile, outputfile, number)
# Run the command-line entry point only when this file is executed as a
# script, handing it the arguments that follow the program name.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment