Mohamed Abuelanin mr-eyes

## README.md

      
              4 files
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                mr-eyes
                / README.md
            
            
              Last active
              April 2, 2025 22:24
            
              
                CHM13v2.0 Segmental Duplications
              
          
    CLICK HERE TO VISUALIZE

node_metadata.tsv

A table with one row per unique genomic region (node) involved in a structural duplication.


Column
Description


id
Unique ID for the genomic region (chr:start-end)


gene_name
Ensembl or Liftoff-assigned gene name overlapping this region (if a


## code.gs
function addOrUpdateAgendaBarWithProgressV5() {
  const pres   = SlidesApp.getActivePresentation();
  const slides = pres.getSlides();

  // 0) Clear any existing timeline shapes on ALL slides
  slides.forEach(slide => {
    slide.getPageElements().forEach(el => {
      const t = el.getTitle && el.getTitle();
      if (t && t.startsWith('AGENDA_')) {
        el.remove();

## environment.yml
name: kspider
channels:
  - conda-forge
  - bioconda
dependencies:
  - python=3.9
  - pip
  - sourmash
  - pip:
    - kSpider

## sourmash_ani.cpp
#include <Python.h>
#include <iostream>

using namespace std;

class toANI {
public:
    PyObject* moduleMainString, * moduleMain, * func;

    toANI() {

## prefetch_clustering.py
import retworkx as rx
from tqdm import tqdm
import argparse

parser = argparse.ArgumentParser()

parser.add_argument('--csv', type=str, required=True, help="pairwise csv file")
parser.add_argument('--cutoff', type=int, required=True,
                    help="clustering threshold (0:100)")
parser.add_argument('--mode', type=str, required=True, choices=['ani', 'cont'],

## nearest_genomes.py
"""
Uses NCBI's new Datasets API to find the nearest available reference genomes for a given taxon ID or organism name.

Input: TAX_ID or Name

Output:
    1- The nearest organism with available reference genomes
    2- Accessions of the reference genomes

Requirements:

## annotations.txt
parent	metadata
p1	DB_A
p2	DB_B
p3	DB_C
p4	DB_A
p5	DB_A
p6	DB_A
p7	DB_B
p8	DB_C
p9	DB_C

## kProcessor_index_validator.py
# Validate kProcessor 1 index

from itertools import groupby
import os
import kProcessor as kp
import hashlib


class IntegralHasher:

## unitigs_to_connected_components.py
"""
Input: Unitigs FASTA file generated by BCALM.
Output: CSV File with the following format

A. Column(1): Connected Component ID
B. Column(2:): Unitigs ID(s)

Run:
python unitigs_to_connected_components.py <unitigs_path>
"""

## clusters.tsv

          
            1
            2
            2
            1
            3

            
              1
              2
              3
              4
              5
Column	Description
`id`	Unique ID for the genomic region (`chr:start-end`)
`gene_name`	Ensembl or Liftoff-assigned gene name overlapping this region (if a
	function addOrUpdateAgendaBarWithProgressV5() {
	const pres = SlidesApp.getActivePresentation();
	const slides = pres.getSlides();

	// 0) Clear any existing timeline shapes on ALL slides
	slides.forEach(slide => {
	slide.getPageElements().forEach(el => {
	const t = el.getTitle && el.getTitle();
	if (t && t.startsWith('AGENDA_')) {
	el.remove();
	name: kspider
	channels:
	- conda-forge
	- bioconda
	dependencies:
	- python=3.9
	- pip
	- sourmash
	- pip:
	- kSpider
	#include <Python.h>
	#include <iostream>

	using namespace std;

	class toANI {
	public:
	PyObject* moduleMainString, * moduleMain, * func;

	toANI() {
	import retworkx as rx
	from tqdm import tqdm
	import argparse

	parser = argparse.ArgumentParser()

	parser.add_argument('--csv', type=str, required=True, help="pairwise csv file")
	parser.add_argument('--cutoff', type=int, required=True,
	help="clustering threshold (0:100)")
	parser.add_argument('--mode', type=str, required=True, choices=['ani', 'cont'],
	"""
	Uses NCBI's new Datasets API to find the nearest available reference genomes for a given taxon ID or organism name.

	Input: TAX_ID or Name

	Output:
	1- The nearest organism with available reference genomes
	2- Accessions of the reference genomes

	Requirements:
	parent metadata
	p1 DB_A
	p2 DB_B
	p3 DB_C
	p4 DB_A
	p5 DB_A
	p6 DB_A
	p7 DB_B
	p8 DB_C
	p9 DB_C
	# Validate kProcessor 1 index

	from itertools import groupby
	import os
	import kProcessor as kp
	import hashlib



	class IntegralHasher:
	"""
	Input: Unitigs FASTA file generated by BCALM.
	Output: CSV File with the following format

	A. Column(1): Connected Component ID
	B. Column(2:): Unitigs ID(s)

	Run:
	python unitigs_to_connected_components.py <unitigs_path>
	"""