Skip to content

Instantly share code, notes, and snippets.

@bede
Last active December 10, 2025 12:13
Show Gist options
  • Select an option

  • Save bede/89c90755dd29490f33047e52f97078ad to your computer and use it in GitHub Desktop.

Select an option

Save bede/89c90755dd29490f33047e52f97078ad to your computer and use it in GitHub Desktop.
Concatenate demultiplexed ONT FASTQs by barcode (for one or more runs)
"""
Purpose: Concatenate demultiplexed FASTQs by barcode for one or more ONT runs
Usage: python barcat.py run1/fastq_pass run2/fastq_pass -o output/
Author: Bede Constantinides
"""
import argparse
import gzip
import shutil
import subprocess
import sys
from collections import defaultdict
from pathlib import Path
from typing import List
# Recognised FASTQ file name endings; the compressed ones are kept separately
# because gzip inputs can be appended to the output without recompression.
_GZ_SUFFIXES = (".fastq.gz", ".fq.gz")
_FASTQ_SUFFIXES = (".fastq", ".fq") + _GZ_SUFFIXES


def _collect_fastqs_by_barcode(fastq_pass_dirs):
    """Map each barcode name to the FASTQ files found for it across all runs."""
    barcodes = [f"barcode{i:02d}" for i in range(1, 97)]
    paths_by_barcode = defaultdict(list)
    for barcode in barcodes:
        for run_dir in fastq_pass_dirs:
            barcode_dir = run_dir / barcode
            # is_dir() rather than exists(): a stray regular file with a
            # barcode name must not be handed to iterdir().
            if not barcode_dir.is_dir():
                continue
            # iterdir() order is filesystem-dependent; sorting keeps the read
            # order in the output reproducible between invocations.
            for path in sorted(barcode_dir.iterdir()):
                if path.is_file() and path.name.endswith(_FASTQ_SUFFIXES):
                    paths_by_barcode[barcode].append(path)
    return paths_by_barcode


def _write_barcode_fastq_gz(fastq_paths, output_path):
    """Write all of fastq_paths into one gzip file, removing it again on failure."""
    gz_paths = [p for p in fastq_paths if p.name.endswith(_GZ_SUFFIXES)]
    plain_paths = [p for p in fastq_paths if not p.name.endswith(_GZ_SUFFIXES)]
    finished = False
    try:
        with open(output_path, "wb") as outfile:
            # A gzip file may consist of several concatenated members, so
            # already-compressed inputs are copied through byte for byte.
            for path in gz_paths:
                with open(path, "rb") as infile:
                    shutil.copyfileobj(infile, outfile)
            if plain_paths:
                # All plain inputs go into a single additional gzip member.
                # filename="" and mtime=0 keep the member header free of
                # path and timestamp so identical inputs give identical bytes.
                with gzip.GzipFile(
                    filename="",
                    fileobj=outfile,
                    mode="wb",
                    compresslevel=6,
                    mtime=0,
                ) as gz_out:
                    for path in plain_paths:
                        with open(path, "rb") as infile:
                            shutil.copyfileobj(infile, gz_out)
        finished = True
    finally:
        # Never leave a truncated archive behind that looks like valid output.
        if not finished and output_path.exists():
            output_path.unlink()


def concatenate_fastqs_by_barcode(fastq_pass_dirs: List[Path], output_dir: Path):
    """Concatenate demultiplexed FASTQ files per barcode into gzip files.

    For every barcode ``barcode01`` .. ``barcode96``, the files ending in
    ``.fastq``, ``.fastq.gz``, ``.fq`` or ``.fq.gz`` inside
    ``<dir>/<barcode>/`` of each given directory are combined into
    ``<output_dir>/<barcode>.fastq.gz``. Barcodes without any input file
    produce no output. An existing output file is overwritten.

    Within one output, compressed inputs come first and plain inputs follow;
    inside each group files are ordered by run directory (as given) and then
    by sorted file name. Only the Python standard library is used, so no
    external ``gzip``/``zcat``/``xargs`` binaries are required.

    Args:
        fastq_pass_dirs: Directories that contain per-barcode subdirectories.
        output_dir: Directory for the combined files; created if missing.

    Raises:
        OSError: If an input cannot be read or an output cannot be written.
            The partially written output file is removed in that case.
    """
    paths_by_barcode = _collect_fastqs_by_barcode(fastq_pass_dirs)
    output_dir.mkdir(parents=True, exist_ok=True)
    for barcode, fastq_paths in paths_by_barcode.items():
        output_path = output_dir / f"{barcode}.fastq.gz"
        _write_barcode_fastq_gz(fastq_paths, output_path)
        print(f"Created {output_path} from {len(fastq_paths)} files", file=sys.stderr)
if __name__ == "__main__":
    # Command-line entry point: one or more input directories are required,
    # the output directory is optional and defaults to the current directory.
    cli = argparse.ArgumentParser(
        description='Concatenate demultiplexed FASTQs by barcode for one or more ONT runs. Output files are always compressed (.fastq.gz).'
    )
    cli.add_argument(
        'fastq_pass_dirs',
        nargs='+',
        type=Path,
        help='Directories containing FASTQ files.',
    )
    cli.add_argument(
        '-o',
        '--output-dir',
        type=Path,
        default=Path('.'),
        help='Output directory for concatenated FASTQ files.',
    )
    options = cli.parse_args()
    concatenate_fastqs_by_barcode(options.fastq_pass_dirs, options.output_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment