Last active
January 23, 2026 00:26
-
-
Save simon-anders/9feee6cf0770bdd4f22b82b9c48347f5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| use std::fs::File; | |
| use std::io::{BufReader,BufRead,self,Write}; | |
| use std::error::Error; | |
| use std::collections::{HashMap,VecDeque,HashSet}; | |
| use std::env; | |
| #[derive(Debug)] | |
| struct Interval { | |
| chrom: i32, | |
| start: u32, | |
| end: u32 | |
| } | |
| #[derive(Debug)] | |
| struct Fragment { | |
| iv: Interval, | |
| cell_id: i32, | |
| count: i32 | |
| } | |
| #[derive(Debug)] | |
| struct Peak { | |
| iv: Interval, | |
| cells: HashSet<i32> | |
| } | |
| struct IdMaps { | |
| chrom_ids: HashMap<String,i32>, | |
| cell_ids: HashMap<String,i32>, | |
| next_chrom_id: i32, | |
| next_cell_id: i32, | |
| last_fragm_chrom: i32, | |
| last_peak_chrom: i32 | |
| } | |
| fn handle_header_line(line: &str, ids: &mut IdMaps) -> bool { | |
| if let Some(chrom) = line.strip_prefix("# primary_contig=") { | |
| ids.chrom_ids.insert(chrom.to_string(), ids.next_chrom_id); | |
| ids.next_chrom_id += 1; | |
| } | |
| line.starts_with("#") | |
| } | |
| fn parse_fragm_line(line: &str, ids: &mut IdMaps) -> Result<Fragment,Box<dyn Error>> { | |
| let fields: Vec<&str> = line.trim().split('\t').collect(); | |
| if fields.len() != 5 | |
| { return Err(format!("Line does not have 5 fields: {}",line).into()); }; | |
| let Some(chrom_id) = ids.chrom_ids.get(fields[0]) | |
| else { return Err(format!("unknown chromosome name: {}",fields[0]).into()); }; | |
| let fragm_iv = Interval { chrom: *chrom_id, | |
| start: fields[1].parse()?, end: fields[2].parse()? }; | |
| if fragm_iv.chrom != ids.last_fragm_chrom { | |
| if fragm_iv.chrom < ids.last_fragm_chrom { | |
| eprintln!("{}",ids.last_fragm_chrom); | |
| eprintln!("{:?}",fragm_iv); | |
| return Err( "Fragments file: Chromosomes appear in wrong order".into() ) | |
| } | |
| ids.last_fragm_chrom = fragm_iv.chrom; | |
| } | |
| let cell_id = match ids.cell_ids.get(fields[3]) { | |
| Some(id) => *id, | |
| None => { | |
| ids.cell_ids.insert(fields[3].to_string(), ids.next_cell_id); | |
| ids.next_cell_id += 1; | |
| ids.next_cell_id - 1 | |
| } | |
| }; | |
| Ok(Fragment{iv: fragm_iv, cell_id: cell_id, count: fields[4].parse()?}) | |
| } | |
| fn parse_peak_line(line: &str, ids: &mut IdMaps) -> Result<Interval,Box<dyn Error>> { | |
| let fields: Vec<&str> = line.trim().split('\t').collect(); | |
| if fields.len() != 3 | |
| { return Err(format!("Line does not have 3 fields: {}",line).into()); }; | |
| let Some(chrom_id) = ids.chrom_ids.get(fields[0]) | |
| else { return Err(format!("Unknown chromosome name in peaks file: {}",fields[0]).into()); }; | |
| if chrom_id != &ids.last_peak_chrom { | |
| if chrom_id < &ids.last_peak_chrom { | |
| return Err( "Peaks file: Chromosomes appear in wrong order".into() ) | |
| } | |
| ids.last_peak_chrom = *chrom_id; | |
| } | |
| Ok(Interval{chrom: *chrom_id, start: fields[1].parse()?, end: fields[2].parse()? }) | |
| } | |
| const binary: bool = false; | |
| fn write_out(peak: &Peak, ids: &IdMaps) -> Result<(),Box<dyn Error>> { | |
| if binary { | |
| let mut stdout = io::stdout(); | |
| stdout.write_all(&peak.iv.chrom.to_ne_bytes())?; | |
| stdout.write_all(&peak.iv.start.to_ne_bytes())?; | |
| stdout.write_all(&peak.iv.end.to_ne_bytes())?; | |
| for cell in &peak.cells { | |
| stdout.write_all(&cell.to_ne_bytes())?; | |
| } | |
| stdout.write_all(&(-1i32).to_ne_bytes())?; | |
| } else { | |
| let Some(chrom_name) = ids.chrom_ids.iter().find_map(|(k, &v)| (v == peak.iv.chrom).then(|| k)) | |
| else {return Err("Unknown chromosome ID".into())}; | |
| print!("{}({}):{}-{} ", chrom_name, peak.iv.chrom, peak.iv.start, peak.iv.end ); | |
| for cell in &peak.cells { | |
| print!("{} ", cell); | |
| } | |
| println!(""); | |
| } | |
| Ok(()) | |
| } | |
| fn main() -> Result<(),Box<dyn Error>> { | |
| let args: Vec<String> = env::args().collect(); | |
| if args.len() != 3 { | |
| return Err( "Needs 2 arguments.".into() ) | |
| } | |
| let fragm_file = File::open(&args[1])?; | |
| let peak_file = File::open(&args[2])?; | |
| let mut ids = IdMaps{ chrom_ids: HashMap::new(), cell_ids: HashMap::new(), | |
| next_chrom_id: 1, next_cell_id: 1, last_fragm_chrom: -1, | |
| last_peak_chrom: -1 }; | |
| let mut peak_lines = BufReader::new(peak_file).lines(); | |
| let mut active_peaks: VecDeque<Peak> = VecDeque::new(); | |
| for fragm_line in BufReader::new(fragm_file).lines() { | |
| let fragm_line = fragm_line?; | |
| let is_header_line = handle_header_line(&fragm_line, &mut ids); | |
| if is_header_line { | |
| continue | |
| } | |
| let fragm = parse_fragm_line(&fragm_line, &mut ids)?; | |
| //println!("{:?}", fragm); | |
| // We need to read peaks until we find one right of the fragment | |
| loop { | |
| // Is the last peak to the right? If so, break | |
| if let Some(&ref last_peak) = active_peaks.back() { | |
| if (last_peak.iv.chrom > fragm.iv.chrom) || | |
| (last_peak.iv.start > fragm.iv.end) { | |
| break; | |
| } | |
| } | |
| // if not, read a new peak | |
| let peak_line = match peak_lines.next() { | |
| Some(line) => line?, | |
| None => break, | |
| }; | |
| let new_peak = Peak{ iv: parse_peak_line( &peak_line, &mut ids )?, | |
| cells: HashSet::new() }; | |
| //println!("IN: {:?}", new_peak); | |
| active_peaks.push_back( new_peak ); | |
| } | |
| // Remove all peaks left of the fragment | |
| loop { | |
| // Is the first peak to the left? If so, remove it | |
| if let Some(&ref first_peak) = active_peaks.front() { | |
| if (first_peak.iv.chrom < fragm.iv.chrom) || | |
| (first_peak.iv.end < fragm.iv.start) { | |
| //println!("OUT: {:?}", first_peak); | |
| write_out(first_peak, &ids)?; | |
| active_peaks.pop_front(); | |
| continue | |
| } | |
| } | |
| break | |
| } | |
| // Now find the fitting peak | |
| let mut peaks_found = 0; | |
| for peak in &mut active_peaks { | |
| if (peak.iv.start <= fragm.iv.end) && (peak.iv.end >= fragm.iv.start) { | |
| peak.cells.insert(fragm.cell_id); | |
| //println!("Cell {:?} in Peak {:?}", fragm.cell_id, peak.iv); | |
| peaks_found += 1; | |
| } | |
| } | |
| /* | |
| if peaks_found > 1 { | |
| return Err( "overlapping peaks".into()); | |
| } */ | |
| } | |
| Ok(()) | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment