Skip to content

Instantly share code, notes, and snippets.

@simon-anders
Last active January 23, 2026 00:26
Show Gist options
  • Select an option

  • Save simon-anders/9feee6cf0770bdd4f22b82b9c48347f5 to your computer and use it in GitHub Desktop.

Select an option

Save simon-anders/9feee6cf0770bdd4f22b82b9c48347f5 to your computer and use it in GitHub Desktop.
use std::fs::File;
use std::io::{BufReader,BufRead,self,Write};
use std::error::Error;
use std::collections::{HashMap,VecDeque,HashSet};
use std::env;
#[derive(Debug)]
struct Interval {
chrom: i32,
start: u32,
end: u32
}
#[derive(Debug)]
struct Fragment {
iv: Interval,
cell_id: i32,
count: i32
}
#[derive(Debug)]
struct Peak {
iv: Interval,
cells: HashSet<i32>
}
struct IdMaps {
chrom_ids: HashMap<String,i32>,
cell_ids: HashMap<String,i32>,
next_chrom_id: i32,
next_cell_id: i32,
last_fragm_chrom: i32,
last_peak_chrom: i32
}
fn handle_header_line(line: &str, ids: &mut IdMaps) -> bool {
if let Some(chrom) = line.strip_prefix("# primary_contig=") {
ids.chrom_ids.insert(chrom.to_string(), ids.next_chrom_id);
ids.next_chrom_id += 1;
}
line.starts_with("#")
}
fn parse_fragm_line(line: &str, ids: &mut IdMaps) -> Result<Fragment,Box<dyn Error>> {
let fields: Vec<&str> = line.trim().split('\t').collect();
if fields.len() != 5
{ return Err(format!("Line does not have 5 fields: {}",line).into()); };
let Some(chrom_id) = ids.chrom_ids.get(fields[0])
else { return Err(format!("unknown chromosome name: {}",fields[0]).into()); };
let fragm_iv = Interval { chrom: *chrom_id,
start: fields[1].parse()?, end: fields[2].parse()? };
if fragm_iv.chrom != ids.last_fragm_chrom {
if fragm_iv.chrom < ids.last_fragm_chrom {
eprintln!("{}",ids.last_fragm_chrom);
eprintln!("{:?}",fragm_iv);
return Err( "Fragments file: Chromosomes appear in wrong order".into() )
}
ids.last_fragm_chrom = fragm_iv.chrom;
}
let cell_id = match ids.cell_ids.get(fields[3]) {
Some(id) => *id,
None => {
ids.cell_ids.insert(fields[3].to_string(), ids.next_cell_id);
ids.next_cell_id += 1;
ids.next_cell_id - 1
}
};
Ok(Fragment{iv: fragm_iv, cell_id: cell_id, count: fields[4].parse()?})
}
fn parse_peak_line(line: &str, ids: &mut IdMaps) -> Result<Interval,Box<dyn Error>> {
let fields: Vec<&str> = line.trim().split('\t').collect();
if fields.len() != 3
{ return Err(format!("Line does not have 3 fields: {}",line).into()); };
let Some(chrom_id) = ids.chrom_ids.get(fields[0])
else { return Err(format!("Unknown chromosome name in peaks file: {}",fields[0]).into()); };
if chrom_id != &ids.last_peak_chrom {
if chrom_id < &ids.last_peak_chrom {
return Err( "Peaks file: Chromosomes appear in wrong order".into() )
}
ids.last_peak_chrom = *chrom_id;
}
Ok(Interval{chrom: *chrom_id, start: fields[1].parse()?, end: fields[2].parse()? })
}
const binary: bool = false;
fn write_out(peak: &Peak, ids: &IdMaps) -> Result<(),Box<dyn Error>> {
if binary {
let mut stdout = io::stdout();
stdout.write_all(&peak.iv.chrom.to_ne_bytes())?;
stdout.write_all(&peak.iv.start.to_ne_bytes())?;
stdout.write_all(&peak.iv.end.to_ne_bytes())?;
for cell in &peak.cells {
stdout.write_all(&cell.to_ne_bytes())?;
}
stdout.write_all(&(-1i32).to_ne_bytes())?;
} else {
let Some(chrom_name) = ids.chrom_ids.iter().find_map(|(k, &v)| (v == peak.iv.chrom).then(|| k))
else {return Err("Unknown chromosome ID".into())};
print!("{}({}):{}-{} ", chrom_name, peak.iv.chrom, peak.iv.start, peak.iv.end );
for cell in &peak.cells {
print!("{} ", cell);
}
println!("");
}
Ok(())
}
fn main() -> Result<(),Box<dyn Error>> {
let args: Vec<String> = env::args().collect();
if args.len() != 3 {
return Err( "Needs 2 arguments.".into() )
}
let fragm_file = File::open(&args[1])?;
let peak_file = File::open(&args[2])?;
let mut ids = IdMaps{ chrom_ids: HashMap::new(), cell_ids: HashMap::new(),
next_chrom_id: 1, next_cell_id: 1, last_fragm_chrom: -1,
last_peak_chrom: -1 };
let mut peak_lines = BufReader::new(peak_file).lines();
let mut active_peaks: VecDeque<Peak> = VecDeque::new();
for fragm_line in BufReader::new(fragm_file).lines() {
let fragm_line = fragm_line?;
let is_header_line = handle_header_line(&fragm_line, &mut ids);
if is_header_line {
continue
}
let fragm = parse_fragm_line(&fragm_line, &mut ids)?;
//println!("{:?}", fragm);
// We need to read peaks until we find one right of the fragment
loop {
// Is the last peak to the right? If so, break
if let Some(&ref last_peak) = active_peaks.back() {
if (last_peak.iv.chrom > fragm.iv.chrom) ||
(last_peak.iv.start > fragm.iv.end) {
break;
}
}
// if not, read a new peak
let peak_line = match peak_lines.next() {
Some(line) => line?,
None => break,
};
let new_peak = Peak{ iv: parse_peak_line( &peak_line, &mut ids )?,
cells: HashSet::new() };
//println!("IN: {:?}", new_peak);
active_peaks.push_back( new_peak );
}
// Remove all peaks left of the fragment
loop {
// Is the first peak to the left? If so, remove it
if let Some(&ref first_peak) = active_peaks.front() {
if (first_peak.iv.chrom < fragm.iv.chrom) ||
(first_peak.iv.end < fragm.iv.start) {
//println!("OUT: {:?}", first_peak);
write_out(first_peak, &ids)?;
active_peaks.pop_front();
continue
}
}
break
}
// Now find the fitting peak
let mut peaks_found = 0;
for peak in &mut active_peaks {
if (peak.iv.start <= fragm.iv.end) && (peak.iv.end >= fragm.iv.start) {
peak.cells.insert(fragm.cell_id);
//println!("Cell {:?} in Peak {:?}", fragm.cell_id, peak.iv);
peaks_found += 1;
}
}
/*
if peaks_found > 1 {
return Err( "overlapping peaks".into());
} */
}
Ok(())
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment