Created
May 25, 2023 16:22
-
-
Save cory-weller/79407fd24a3a0fb692f4acb6f71d184f to your computer and use it in GitHub Desktop.
Reads through fastq, assigns reads to fasta database, and outputs text files of reads assigned to each fasta sequence.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| echo "Running ampliconsplit.sh $@" | |
| ## convenience functions for arg parsing | |
| usage_error () { echo >&2 "ERROR: $1"; exit 2; } | |
| assert_argument () { test "$1" != "$EOL" || usage_error "$2 requires an argument. Try --help"; } | |
| # Parse Arguments | |
| ## import $@ | |
| if [ "$#" != 0 ]; then | |
| EOL=$(printf '\1\3\3\7') | |
| set -- "$@" "$EOL" | |
| while [ "$1" != "$EOL" ]; do | |
| opt="$1"; shift | |
| case "$opt" in | |
| # Your options go here. | |
| -h|--help) HELP='True';; | |
| --clobber) CLOBBER='True';; | |
| --fa|--fasta) assert_argument "$1" "$opt"; FASTA="$1"; shift;; | |
| --fq|--fastq) assert_argument "$1" "$opt"; FASTQ="$1"; shift;; | |
| --fq2|--fastq2) assert_argument "$1" "$opt"; FASTQ2="$1"; shift;; | |
| # Arguments processing. You may remove any unneeded line after the 1st. | |
| -|''|[!-]*) set -- "$@" "$opt";; # positional argument, rotate to the end | |
| --*=*) set -- "${opt%%=*}" "${opt#*=}" "$@";; # convert '--name=arg' to '--name' 'arg' | |
| -[!-]?*) set -- $(echo "${opt#-}" | sed 's/\(.\)/ -\1/g') "$@";; # convert '-abc' to '-a' '-b' '-c' | |
| --) while [ "$1" != "$EOL" ]; do set -- "$@" "$1"; shift; done;; # process remaining arguments as positional | |
| -*) usage_error "unknown option: '$opt'. Try --help";; # catch misspelled options | |
| *) usage_error "this should NEVER happen ($opt)";; # sanity test for previous patterns | |
| esac | |
| done | |
| shift # $EOL | |
| fi | |
| BADARGS=0 | |
| USAGE="Usage: ampliconsplit.sh --fasta <filename.fasta> --fastq <filename.fastq>" | |
| ## Print help message if requested | |
| if [ "${HELP}" == 'True' ]; then | |
| cat << EndOfHelp | |
| ${USAGE} | |
| --help | -h Print this message and exit. | |
| --fasta | --fa Fasta file input. | |
| --fastq | --fq Single end fastq file (or first read of a pair). | |
| --fastq2 | --fq2 Second fastq file, if paired-end reads. | |
| --clobber Delete all intermediate/final output and regenerate. | |
| EndOfHelp | |
| exit 0 | |
| fi | |
| ## Check for required arguments ###################################### | |
| if [ "${FASTA}" == '' ]; then echo "ERROR: --fasta is required"; let BADARGS=${BADARGS}+1; fi | |
| if [ "${FASTQ}" == '' ]; then echo "ERROR: --fastq is required"; let BADARGS=${BADARGS}+1; fi | |
| if [ "${BADARGS}" -gt 0 ]; then echo "Exiting due to ${BADARGS} errors"; exit 1; fi | |
| ## Check required argument format #################################### | |
| if [ "${FASTA}" == '' ]; then echo "ERROR: --fasta is required"; let BADARGS=${BADARGS}+1; fi | |
| ## Check if module command exists | |
| if command -v module &> /dev/null; then | |
| MODULE='True' | |
| else | |
| MODULE='False' | |
| fi | |
| ## Check if dependancies exist | |
| DEPENDS_ON=( 'samtools' 'bwa' ) | |
| UNAVAIL=() | |
| for dependancy in ${DEPENDS_ON[@]}; do | |
| if ! command -v ${dependancy} &> /dev/null; then | |
| UNAVAIL+=(${dependancy}) | |
| fi | |
| done | |
| ## Check if dependancies can be loaded by module | |
| if [ "${MODULE}" == 'True' ]; then | |
| for dependancy in ${UNAVAIL[@]}; do | |
| if $(module avail "${dependancy}" 2>&1 >/dev/null | grep -q 'No module'); then | |
| echo "ERROR: ${dependancy} cannot be found!" | |
| exit 1 | |
| else | |
| module load ${dependancy} | |
| fi | |
| done | |
| fi | |
| ## RUN ############################################################### | |
| # BWA indexing | |
| if [ ! -f "${FASTA}.bwt" ] || [ "${CLOBBER}" == 'True' ] ; then | |
| bwa index ${FASTA} | |
| else | |
| echo "INFO: ${FASTA} bwa indices already exist, skipping indexing" | |
| echo "INFO: Re-run with --clobber to regenerate indices" | |
| fi | |
| # BWA alignment | |
| if [ ! -f 'ALN.bam' ] || [ "${CLOBBER}" == 'True' ]; then | |
| bwa mem ${FASTA} ${FASTQ} ${FASTQ2} | samtools view -b | \ | |
| samtools sort > ALN.bam | |
| samtools index ALN.bam | |
| else | |
| echo "INFO: alignment ALN.bam already exists, skipping" | |
| echo "INFO: Re-run with --clobber to regenerate alignment" | |
| fi | |
| # Split into different bam per sequence | |
| SEQS=($(grep '^>' ${FASTA} | tr -d '>')) | |
| for SEQ in ${SEQS[@]}; do | |
| samtools view -b ALN.bam ${SEQ} > ${SEQ}.bam | |
| samtools index ${SEQ}.bam | |
| samtools fastq ${SEQ}.bam > ${SEQ}.fastq | |
| awk '(NR-1)%4==1' ${SEQ}.fastq > ${SEQ}.txt | |
| rm ${SEQ}.bam ${SEQ}.bam.bai ${SEQ}.fastq | |
| done | |
| # Do unmapped reads | |
| samtools view -b -f 4 ALN.bam > unmapped.bam | |
| samtools index unmapped.bam | |
| samtools fastq unmapped.bam > unmapped.fastq | |
| awk '(NR-1)%4==1' unmapped.fastq | grep -v 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN' > unmapped.txt | |
| rm unmapped.bam unmapped.bam.bai unmapped.fastq ALN.bam |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment