Skip to content

Instantly share code, notes, and snippets.

@cory-weller
Created May 25, 2023 16:22
Show Gist options
  • Select an option

  • Save cory-weller/79407fd24a3a0fb692f4acb6f71d184f to your computer and use it in GitHub Desktop.

Select an option

Save cory-weller/79407fd24a3a0fb692f4acb6f71d184f to your computer and use it in GitHub Desktop.
Reads through fastq, assigns reads to fasta database, and outputs text files of reads assigned to each fasta sequence.
#!/usr/bin/env bash
echo "Running ampliconsplit.sh $@"
## convenience functions for arg parsing
usage_error () { echo >&2 "ERROR: $1"; exit 2; }
assert_argument () { test "$1" != "$EOL" || usage_error "$2 requires an argument. Try --help"; }
# Parse Arguments
## import $@
if [ "$#" != 0 ]; then
EOL=$(printf '\1\3\3\7')
set -- "$@" "$EOL"
while [ "$1" != "$EOL" ]; do
opt="$1"; shift
case "$opt" in
# Your options go here.
-h|--help) HELP='True';;
--clobber) CLOBBER='True';;
--fa|--fasta) assert_argument "$1" "$opt"; FASTA="$1"; shift;;
--fq|--fastq) assert_argument "$1" "$opt"; FASTQ="$1"; shift;;
--fq2|--fastq2) assert_argument "$1" "$opt"; FASTQ2="$1"; shift;;
# Arguments processing. You may remove any unneeded line after the 1st.
-|''|[!-]*) set -- "$@" "$opt";; # positional argument, rotate to the end
--*=*) set -- "${opt%%=*}" "${opt#*=}" "$@";; # convert '--name=arg' to '--name' 'arg'
-[!-]?*) set -- $(echo "${opt#-}" | sed 's/\(.\)/ -\1/g') "$@";; # convert '-abc' to '-a' '-b' '-c'
--) while [ "$1" != "$EOL" ]; do set -- "$@" "$1"; shift; done;; # process remaining arguments as positional
-*) usage_error "unknown option: '$opt'. Try --help";; # catch misspelled options
*) usage_error "this should NEVER happen ($opt)";; # sanity test for previous patterns
esac
done
shift # $EOL
fi
BADARGS=0
USAGE="Usage: ampliconsplit.sh --fasta <filename.fasta> --fastq <filename.fastq>"
## Print help message if requested
if [ "${HELP}" == 'True' ]; then
cat << EndOfHelp
${USAGE}
--help | -h Print this message and exit.
--fasta | --fa Fasta file input.
--fastq | --fq Single end fastq file (or first read of a pair).
--fastq2 | --fq2 Second fastq file, if paired-end reads.
--clobber Delete all intermediate/final output and regenerate.
EndOfHelp
exit 0
fi
## Check for required arguments ######################################
if [ "${FASTA}" == '' ]; then echo "ERROR: --fasta is required"; let BADARGS=${BADARGS}+1; fi
if [ "${FASTQ}" == '' ]; then echo "ERROR: --fastq is required"; let BADARGS=${BADARGS}+1; fi
if [ "${BADARGS}" -gt 0 ]; then echo "Exiting due to ${BADARGS} errors"; exit 1; fi
## Check required argument format ####################################
if [ "${FASTA}" == '' ]; then echo "ERROR: --fasta is required"; let BADARGS=${BADARGS}+1; fi
## Check if module command exists
if command -v module &> /dev/null; then
MODULE='True'
else
MODULE='False'
fi
## Check if dependancies exist
DEPENDS_ON=( 'samtools' 'bwa' )
UNAVAIL=()
for dependancy in ${DEPENDS_ON[@]}; do
if ! command -v ${dependancy} &> /dev/null; then
UNAVAIL+=(${dependancy})
fi
done
## Check if dependancies can be loaded by module
if [ "${MODULE}" == 'True' ]; then
for dependancy in ${UNAVAIL[@]}; do
if $(module avail "${dependancy}" 2>&1 >/dev/null | grep -q 'No module'); then
echo "ERROR: ${dependancy} cannot be found!"
exit 1
else
module load ${dependancy}
fi
done
fi
## RUN ###############################################################
# BWA indexing
if [ ! -f "${FASTA}.bwt" ] || [ "${CLOBBER}" == 'True' ] ; then
bwa index ${FASTA}
else
echo "INFO: ${FASTA} bwa indices already exist, skipping indexing"
echo "INFO: Re-run with --clobber to regenerate indices"
fi
# BWA alignment
if [ ! -f 'ALN.bam' ] || [ "${CLOBBER}" == 'True' ]; then
bwa mem ${FASTA} ${FASTQ} ${FASTQ2} | samtools view -b | \
samtools sort > ALN.bam
samtools index ALN.bam
else
echo "INFO: alignment ALN.bam already exists, skipping"
echo "INFO: Re-run with --clobber to regenerate alignment"
fi
# Split into different bam per sequence
SEQS=($(grep '^>' ${FASTA} | tr -d '>'))
for SEQ in ${SEQS[@]}; do
samtools view -b ALN.bam ${SEQ} > ${SEQ}.bam
samtools index ${SEQ}.bam
samtools fastq ${SEQ}.bam > ${SEQ}.fastq
awk '(NR-1)%4==1' ${SEQ}.fastq > ${SEQ}.txt
rm ${SEQ}.bam ${SEQ}.bam.bai ${SEQ}.fastq
done
# Do unmapped reads
samtools view -b -f 4 ALN.bam > unmapped.bam
samtools index unmapped.bam
samtools fastq unmapped.bam > unmapped.fastq
awk '(NR-1)%4==1' unmapped.fastq | grep -v 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN' > unmapped.txt
rm unmapped.bam unmapped.bam.bai unmapped.fastq ALN.bam
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment