ayaksvals/csv_sort_parquet(DaskVersion) (3).ipynb

## csv_sort_parquet(DaskVersion) (3).ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d30d1c6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import bioframe\n",
    "import pypairix\n",
    "import dask.dataframe as dd\n",
    "import dask.array as da\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import numba\n",
    "\n",
    "import pypairix\n",
    "import pysam\n",
    "from dask.base import tokenize\n",
    "import dask.dataframe as dd\n",
    "import dask.array as da\n",
    "import dask\n",
    "from dask.dataframe.core import new_dd_object\n",
    "from dask.delayed import delayed\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f6356134",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3min 21s, sys: 51.2 s, total: 4min 12s\n",
      "Wall time: 4min 12s\n"
     ]
    }
   ],
   "source": [
    "# Column schema of the pairs table: column name -> pandas dtype.\n",
    "# The key order is also the column order passed to read_csv below.\n",
    "dtype_dict = dict(\n",
    "    read_id='str',\n",
    "    chrom1='str',\n",
    "    pos1='Int64',\n",
    "    chrom2='str',\n",
    "    pos2='Int64',\n",
    "    strand1='str',\n",
    "    strand2='str',\n",
    "    pair_type='str',\n",
    ")\n",
    "# pd.read_csv(...) is called directly, so the file is loaded while this line runs;\n",
    "# delayed() then only wraps the frame that is already in memory.\n",
    "%time dfs=delayed(pd.read_csv('NIPBL_R1.nodups.pairs.gz', sep='\\t',  skiprows=200, on_bad_lines='skip', names=list(dtype_dict), dtype=dtype_dict))\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "71366242",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 12.6 ms, sys: 9.64 ms, total: 22.2 ms\n",
      "Wall time: 23.1 ms\n"
     ]
    }
   ],
   "source": [
    "%time df = dd.from_delayed(dfs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "cbf7c254",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 6.06 ms, sys: 10 ms, total: 16.1 ms\n",
      "Wall time: 14.5 ms\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>read_id</th>\n",
       "      <th>chrom1</th>\n",
       "      <th>pos1</th>\n",
       "      <th>chrom2</th>\n",
       "      <th>pos2</th>\n",
       "      <th>strand1</th>\n",
       "      <th>strand2</th>\n",
       "      <th>pair_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002468</td>\n",
       "      <td>chr1</td>\n",
       "      <td>7419565</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002505</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002639</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002505</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002644</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002505</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002683</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002505</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002699</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65220629</th>\n",
       "      <td>.</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902279</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902428</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65220630</th>\n",
       "      <td>.</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902282</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902487</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65220631</th>\n",
       "      <td>.</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902286</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902430</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65220632</th>\n",
       "      <td>.</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902375</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902555</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65220633</th>\n",
       "      <td>.</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902405</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902560</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>65220634 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         read_id chrom1     pos1 chrom2     pos2 strand1 strand2 pair_type\n",
       "0              .   chr1  3002468   chr1  7419565       +       -        LL\n",
       "1              .   chr1  3002505   chr1  3002639       +       -        LL\n",
       "2              .   chr1  3002505   chr1  3002644       +       -        LL\n",
       "3              .   chr1  3002505   chr1  3002683       +       -        LL\n",
       "4              .   chr1  3002505   chr1  3002699       +       -        LL\n",
       "...          ...    ...      ...    ...      ...     ...     ...       ...\n",
       "65220629       .   chrY  2902279   chrY  2902428       +       -        LL\n",
       "65220630       .   chrY  2902282   chrY  2902487       +       -        LL\n",
       "65220631       .   chrY  2902286   chrY  2902430       +       -        LL\n",
       "65220632       .   chrY  2902375   chrY  2902555       +       -        LL\n",
       "65220633       .   chrY  2902405   chrY  2902560       +       -        LL\n",
       "\n",
       "[65220634 rows x 8 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time df.compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "9e866592",
   "metadata": {},
   "outputs": [],
   "source": [
    "pairs_concat=df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a331f83f",
   "metadata": {},
   "outputs": [],
   "source": [
    "#pairs_concat=pairs_concat.compute()\n",
    "#pairs_concat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f074accb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sort keys for the chromosomes that carry no number. They are placed after the\n",
    "# numbered ones, which assumes fewer than 100 numbered chromosomes.\n",
    "SPECIAL_CHROM_KEYS = {'chrX': 100, 'chrY': 101, 'chrM': 102}\n",
    "\n",
    "\n",
    "def chromosome_sort_key(chrom):\n",
    "    \"\"\"Map a chromosome name to an integer key that sorts in natural order.\n",
    "\n",
    "    Numbered chromosomes map to their number ('chr2' -> 2, 'chr10' -> 10), so\n",
    "    they sort 1, 2, 10 instead of the 1, 10, 2 order of a plain string sort.\n",
    "    'chrX', 'chrY' and 'chrM' map to 100, 101 and 102.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    chrom : str\n",
    "        Chromosome name: 'chr' followed by decimal digits, or one of\n",
    "        'chrX', 'chrY', 'chrM'.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    int\n",
    "        The sort key.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    TypeError\n",
    "        If chrom is not a string (for example a missing value).\n",
    "    ValueError\n",
    "        If chrom is a string but not one of the supported names.\n",
    "    \"\"\"\n",
    "    if not isinstance(chrom, str):\n",
    "        raise TypeError(f'chromosome name must be a string, got {chrom!r}')\n",
    "    if chrom in SPECIAL_CHROM_KEYS:\n",
    "        return SPECIAL_CHROM_KEYS[chrom]\n",
    "    number = chrom[3:]\n",
    "    # Check prefix and digits explicitly: a bare int(chrom[3:]) accepts 'abc7' as 7\n",
    "    # and 'chr1_2' as 12, which yields a wrong key without any error.\n",
    "    if not (chrom.startswith('chr') and number.isascii() and number.isdigit()):\n",
    "        raise ValueError(f'unsupported chromosome name: {chrom!r}')\n",
    "    return int(number)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "4c0f077b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.3 ms, sys: 1.08 ms, total: 4.38 ms\n",
      "Wall time: 3.81 ms\n"
     ]
    }
   ],
   "source": [
    "# ADDED SORT PART\n",
    "\n",
    "# Variant 1: chrom1 and chrom2 1,10,2\n",
    "# Sort Time+Compute=7m 50s; to parquet=21s\n",
    "# Sort: 0s, Compute+to parquet: 14m\n",
    "#sorted_df = pairs_concat.sort_values(by=['chrom1', 'chrom2', 'pos1', 'pos2', 'strand1', 'strand2'])\n",
    "\n",
    "# Variant 2:chrom1 and chrom2 1,2,10\n",
    "def _chrom_keys(names):\n",
    "    \"\"\"Return the chromosome_sort_key of every name in a pandas Series.\"\"\"\n",
    "    # Call chromosome_sort_key once per distinct name, then look the keys up per row.\n",
    "    key_by_name = {name: chromosome_sort_key(name) for name in names.unique()}\n",
    "    return names.map(key_by_name)\n",
    "\n",
    "\n",
    "def sort_pairs_partition(part):\n",
    "    \"\"\"Sort one pandas partition by chromosome key, position and strand.\n",
    "\n",
    "    Temporary chrom1_key/chrom2_key columns give the chromosomes their\n",
    "    1,2,10 order; they are dropped again, so the result has the same\n",
    "    columns as the input.\n",
    "    \"\"\"\n",
    "    keyed = part.assign(\n",
    "        chrom1_key=_chrom_keys(part['chrom1']),\n",
    "        chrom2_key=_chrom_keys(part['chrom2']),\n",
    "    )\n",
    "    by = ['chrom1_key', 'chrom2_key', 'pos1', 'pos2', 'strand1', 'strand2']\n",
    "    return keyed.sort_values(by=by).drop(columns=['chrom1_key', 'chrom2_key'])\n",
    "\n",
    "\n",
    "# meta declares pos1/pos2 as 'Int64' so that it matches the dtypes the file was read with.\n",
    "# NOTE(review): map_partitions sorts each partition on its own, so the result is globally\n",
    "# sorted only if pairs_concat has a single partition - confirm via pairs_concat.npartitions.\n",
    "%time sorted_df = pairs_concat.map_partitions(sort_pairs_partition, meta={'read_id':'str','chrom1': 'str', 'pos1':'Int64', 'chrom2': 'str','pos2':'Int64','strand1': 'str', 'strand2': 'str', 'pair_type':'str'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "cbdc3013",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 5min 25s, sys: 2min 3s, total: 7min 28s\n",
      "Wall time: 7min 28s\n"
     ]
    }
   ],
   "source": [
    "# Each partition is written to a separate file, which is not the output we are looking for.\n",
    "#\n",
    "# Parameter notes for to_parquet (pasted reference text):\n",
    "#\n",
    "# path : string or pathlib.Path\n",
    "#         Destination directory for data.  Prepend with protocol like ``s3://``\n",
    "#         or ``hdfs://`` for remote data.\n",
    "# compression : string or dict, default 'snappy'\n",
    "#         Either a string like ``\"snappy\"`` or a dictionary mapping column names\n",
    "#         to compressors like ``{\"name\": \"gzip\", \"values\": \"snappy\"}``. Defaults\n",
    "#         to ``\"snappy\"``.\n",
    "# compute : bool, default True\n",
    "#         If ``True`` (default) then the result is computed immediately. If\n",
    "#         ``False`` then a ``dask.dataframe.Scalar`` object is returned for\n",
    "#         future computation.\n",
    "# schema : pyarrow.Schema, dict, \"infer\", or None, default \"infer\"\n",
    "#         Global schema to use for the output dataset. Defaults to \"infer\", which\n",
    "#         will infer the schema from the dask dataframe metadata. This is usually\n",
    "#         sufficient for common schemas, but notably will fail for ``object``\n",
    "#         dtype columns that contain things other than strings. These columns\n",
    "#         will require an explicit schema be specified. The schema for a subset\n",
    "#         of columns can be overridden by passing in a dict of column names to\n",
    "#         pyarrow types (for example ``schema={\"field\": pa.string()}``); columns\n",
    "#         not present in this dict will still be automatically inferred.\n",
    "#         Alternatively, a full ``pyarrow.Schema`` may be passed, in which case\n",
    "#         no schema inference will be done. Passing in ``schema=None`` will\n",
    "#         disable the use of a global file schema - each written file may use a\n",
    "#         different schema dependent on the dtypes of the corresponding\n",
    "#         partition.\n",
    "\n",
    "%time out = sorted_df.to_parquet('test.parquet', compression='snappy', compute = True) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "f15833ff",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><strong>Dask DataFrame Structure:</strong></div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>read_id</th>\n",
       "      <th>chrom1</th>\n",
       "      <th>pos1</th>\n",
       "      <th>chrom2</th>\n",
       "      <th>pos2</th>\n",
       "      <th>strand1</th>\n",
       "      <th>strand2</th>\n",
       "      <th>pair_type</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>npartitions=7</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>string</td>\n",
       "      <td>string</td>\n",
       "      <td>Int64</td>\n",
       "      <td>string</td>\n",
       "      <td>Int64</td>\n",
       "      <td>string</td>\n",
       "      <td>string</td>\n",
       "      <td>string</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<div>Dask Name: read_parquet, 1 expression</div>"
      ],
      "text/plain": [
       "Dask DataFrame Structure:\n",
       "              read_id  chrom1   pos1  chrom2   pos2 strand1 strand2 pair_type\n",
       "npartitions=7                                                                \n",
       "               string  string  Int64  string  Int64  string  string    string\n",
       "                  ...     ...    ...     ...    ...     ...     ...       ...\n",
       "...               ...     ...    ...     ...    ...     ...     ...       ...\n",
       "                  ...     ...    ...     ...    ...     ...     ...       ...\n",
       "                  ...     ...    ...     ...    ...     ...     ...       ...\n",
       "Dask Name: read_parquet, 1 expression\n",
       "Expr=ReadParquetFSSpec(f66a268)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time df = dd.read_parquet('test.parquet')\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "a9811098",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 8.29 ms, sys: 2.43 ms, total: 10.7 ms\n",
      "Wall time: 11.7 ms\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>read_id</th>\n",
       "      <th>chrom1</th>\n",
       "      <th>pos1</th>\n",
       "      <th>chrom2</th>\n",
       "      <th>pos2</th>\n",
       "      <th>strand1</th>\n",
       "      <th>strand2</th>\n",
       "      <th>pair_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002468</td>\n",
       "      <td>chr1</td>\n",
       "      <td>7419565</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002505</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002639</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002505</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002644</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002505</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002683</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002505</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3002699</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65220629</th>\n",
       "      <td>.</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902279</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902428</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65220630</th>\n",
       "      <td>.</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902282</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902487</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65220631</th>\n",
       "      <td>.</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902286</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902430</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65220632</th>\n",
       "      <td>.</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902375</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902555</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65220633</th>\n",
       "      <td>.</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902405</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2902560</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>65220634 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         read_id chrom1     pos1 chrom2     pos2 strand1 strand2 pair_type\n",
       "0              .   chr1  3002468   chr1  7419565       +       -        LL\n",
       "1              .   chr1  3002505   chr1  3002639       +       -        LL\n",
       "2              .   chr1  3002505   chr1  3002644       +       -        LL\n",
       "3              .   chr1  3002505   chr1  3002683       +       -        LL\n",
       "4              .   chr1  3002505   chr1  3002699       +       -        LL\n",
       "...          ...    ...      ...    ...      ...     ...     ...       ...\n",
       "65220629       .   chrY  2902279   chrY  2902428       +       -        LL\n",
       "65220630       .   chrY  2902282   chrY  2902487       +       -        LL\n",
       "65220631       .   chrY  2902286   chrY  2902430       +       -        LL\n",
       "65220632       .   chrY  2902375   chrY  2902555       +       -        LL\n",
       "65220633       .   chrY  2902405   chrY  2902560       +       -        LL\n",
       "\n",
       "[65220634 rows x 8 columns]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time df.compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0e6ca625",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   chromosomes  chrom_sort_key\n",
      "0         chr1               1\n",
      "6         chr2               2\n",
      "11        chr3               3\n",
      "12        chr4               4\n",
      "2         chr5               5\n",
      "3         chr6               6\n",
      "7         chr7               7\n",
      "8         chr8               8\n",
      "9         chr9               9\n",
      "10       chr10              10\n",
      "13       chr11              11\n",
      "14       chr12              12\n",
      "15       chr13              13\n",
      "16       chr14              14\n",
      "17       chr15              15\n",
      "18       chr16              16\n",
      "19       chr17              17\n",
      "20       chr18              18\n",
      "4        chr19              19\n",
      "1         chrX             100\n",
      "5         chrM             102\n"
     ]
    }
   ],
   "source": [
    "#TEST BLOCK: check that chromosome_sort_key gives the order 1,2,...,19,X,M on a small frame\n",
    "\n",
    "data = {\n",
    "    'chromosomes': ['chr1', 'chrX', 'chr5', 'chr6', 'chr19', 'chrM', \n",
    "                    'chr2', 'chr7', 'chr8', 'chr9', 'chr10', 'chr3', \n",
    "                    'chr4', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', \n",
    "                    'chr16', 'chr17', 'chr18']\n",
    "}\n",
    "\n",
    "pdf = pd.DataFrame(data)\n",
    "# Dedicated names keep this check from overwriting the pipeline's df and sorted_df.\n",
    "chrom_test_ddf = dd.from_pandas(pdf, npartitions=1)\n",
    "# Sort by the integer key; with one partition the per-partition sort is a full sort.\n",
    "chrom_test_sorted = chrom_test_ddf.map_partitions(\n",
    "    lambda part: part.assign(chrom_sort_key=part['chromosomes'].map(chromosome_sort_key))\n",
    "                     .sort_values('chrom_sort_key'),\n",
    "    meta={'chromosomes': 'object', 'chrom_sort_key': int})\n",
    "\n",
    "# Compute to get the sorted result\n",
    "result = chrom_test_sorted.compute()\n",
    "\n",
    "# Fail loudly if the order is not the natural chromosome order.\n",
    "expected_order = [f'chr{n}' for n in range(1, 20)] + ['chrX', 'chrM']\n",
    "assert result['chromosomes'].tolist() == expected_order\n",
    "\n",
    "print(result)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "main",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "d30d1c6f",
	"metadata": {},
	"outputs": [],
	"source": [
	"import bioframe\n",
	"import pypairix\n",
	"import dask.dataframe as dd\n",
	"import dask.array as da\n",
	"\n",
	"import numpy as np\n",
	"import pandas as pd\n",
	"import numba\n",
	"\n",
	"import pypairix\n",
	"import pysam\n",
	"from dask.base import tokenize\n",
	"import dask.dataframe as dd\n",
	"import dask.array as da\n",
	"import dask\n",
	"from dask.dataframe.core import new_dd_object\n",
	"from dask.delayed import delayed\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "f6356134",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 3min 21s, sys: 51.2 s, total: 4min 12s\n",
	"Wall time: 4min 12s\n"
	]
	}
	],
	"source": [
	"# Column schema of the pairs table: column name -> pandas dtype.\n",
	"# The key order is also the column order passed to read_csv below.\n",
	"dtype_dict = dict(\n",
	"    read_id='str',\n",
	"    chrom1='str',\n",
	"    pos1='Int64',\n",
	"    chrom2='str',\n",
	"    pos2='Int64',\n",
	"    strand1='str',\n",
	"    strand2='str',\n",
	"    pair_type='str',\n",
	")\n",
	"# pd.read_csv(...) is called directly, so the file is loaded while this line runs;\n",
	"# delayed() then only wraps the frame that is already in memory.\n",
	"%time dfs=delayed(pd.read_csv('NIPBL_R1.nodups.pairs.gz', sep='\\t', skiprows=200, on_bad_lines='skip', names=list(dtype_dict), dtype=dtype_dict))\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "71366242",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 12.6 ms, sys: 9.64 ms, total: 22.2 ms\n",
	"Wall time: 23.1 ms\n"
	]
	}
	],
	"source": [
	"%time df = dd.from_delayed(dfs)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "cbf7c254",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 6.06 ms, sys: 10 ms, total: 16.1 ms\n",
	"Wall time: 14.5 ms\n"
	]
	},
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>read_id</th>\n",
	" <th>chrom1</th>\n",
	" <th>pos1</th>\n",
	" <th>chrom2</th>\n",
	" <th>pos2</th>\n",
	" <th>strand1</th>\n",
	" <th>strand2</th>\n",
	" <th>pair_type</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3002468</td>\n",
	" <td>chr1</td>\n",
	" <td>7419565</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3002505</td>\n",
	" <td>chr1</td>\n",
	" <td>3002639</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3002505</td>\n",
	" <td>chr1</td>\n",
	" <td>3002644</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3002505</td>\n",
	" <td>chr1</td>\n",
	" <td>3002683</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3002505</td>\n",
	" <td>chr1</td>\n",
	" <td>3002699</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>...</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>65220629</th>\n",
	" <td>.</td>\n",
	" <td>chrY</td>\n",
	" <td>2902279</td>\n",
	" <td>chrY</td>\n",
	" <td>2902428</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>65220630</th>\n",
	" <td>.</td>\n",
	" <td>chrY</td>\n",
	" <td>2902282</td>\n",
	" <td>chrY</td>\n",
	" <td>2902487</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>65220631</th>\n",
	" <td>.</td>\n",
	" <td>chrY</td>\n",
	" <td>2902286</td>\n",
	" <td>chrY</td>\n",
	" <td>2902430</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>65220632</th>\n",
	" <td>.</td>\n",
	" <td>chrY</td>\n",
	" <td>2902375</td>\n",
	" <td>chrY</td>\n",
	" <td>2902555</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>65220633</th>\n",
	" <td>.</td>\n",
	" <td>chrY</td>\n",
	" <td>2902405</td>\n",
	" <td>chrY</td>\n",
	" <td>2902560</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>65220634 rows × 8 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" read_id chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type\n",
	"0 . chr1 3002468 chr1 7419565 + - LL\n",
	"1 . chr1 3002505 chr1 3002639 + - LL\n",
	"2 . chr1 3002505 chr1 3002644 + - LL\n",
	"3 . chr1 3002505 chr1 3002683 + - LL\n",
	"4 . chr1 3002505 chr1 3002699 + - LL\n",
	"... ... ... ... ... ... ... ... ...\n",
	"65220629 . chrY 2902279 chrY 2902428 + - LL\n",
	"65220630 . chrY 2902282 chrY 2902487 + - LL\n",
	"65220631 . chrY 2902286 chrY 2902430 + - LL\n",
	"65220632 . chrY 2902375 chrY 2902555 + - LL\n",
	"65220633 . chrY 2902405 chrY 2902560 + - LL\n",
	"\n",
	"[65220634 rows x 8 columns]"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%time df.compute()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "9e866592",
	"metadata": {},
	"outputs": [],
	"source": [
	"pairs_concat=df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "a331f83f",
	"metadata": {},
	"outputs": [],
	"source": [
	"#pairs_concat=pairs_concat.compute()\n",
	"#pairs_concat"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "f074accb",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Sort keys for the chromosomes that carry no number. They are placed after the\n",
	"# numbered ones, which assumes fewer than 100 numbered chromosomes.\n",
	"SPECIAL_CHROM_KEYS = {'chrX': 100, 'chrY': 101, 'chrM': 102}\n",
	"\n",
	"\n",
	"def chromosome_sort_key(chrom):\n",
	"    \"\"\"Map a chromosome name to an integer key that sorts in natural order.\n",
	"\n",
	"    Numbered chromosomes map to their number ('chr2' -> 2, 'chr10' -> 10), so\n",
	"    they sort 1, 2, 10 instead of the 1, 10, 2 order of a plain string sort.\n",
	"    'chrX', 'chrY' and 'chrM' map to 100, 101 and 102.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    chrom : str\n",
	"        Chromosome name: 'chr' followed by decimal digits, or one of\n",
	"        'chrX', 'chrY', 'chrM'.\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    int\n",
	"        The sort key.\n",
	"\n",
	"    Raises\n",
	"    ------\n",
	"    TypeError\n",
	"        If chrom is not a string (for example a missing value).\n",
	"    ValueError\n",
	"        If chrom is a string but not one of the supported names.\n",
	"    \"\"\"\n",
	"    if not isinstance(chrom, str):\n",
	"        raise TypeError(f'chromosome name must be a string, got {chrom!r}')\n",
	"    if chrom in SPECIAL_CHROM_KEYS:\n",
	"        return SPECIAL_CHROM_KEYS[chrom]\n",
	"    number = chrom[3:]\n",
	"    # Check prefix and digits explicitly: a bare int(chrom[3:]) accepts 'abc7' as 7\n",
	"    # and 'chr1_2' as 12, which yields a wrong key without any error.\n",
	"    if not (chrom.startswith('chr') and number.isascii() and number.isdigit()):\n",
	"        raise ValueError(f'unsupported chromosome name: {chrom!r}')\n",
	"    return int(number)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"id": "4c0f077b",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 3.3 ms, sys: 1.08 ms, total: 4.38 ms\n",
	"Wall time: 3.81 ms\n"
	]
	}
	],
	"source": [
	"# ADDED SORT PART\n",
	"\n",
	"# Variant 1: chrom1 and chrom2 1,10,2\n",
	"# Sort Time+Compute=7m 50s; to parquet=21s\n",
	"# Sort: 0s, Compute+to parquet: 14m\n",
	"#sorted_df = pairs_concat.sort_values(by=['chrom1', 'chrom2', 'pos1', 'pos2', 'strand1', 'strand2'])\n",
	"\n",
	"# Variant 2:chrom1 and chrom2 1,2,10\n",
	"def _chrom_keys(names):\n",
	"    \"\"\"Return the chromosome_sort_key of every name in a pandas Series.\"\"\"\n",
	"    # Call chromosome_sort_key once per distinct name, then look the keys up per row.\n",
	"    key_by_name = {name: chromosome_sort_key(name) for name in names.unique()}\n",
	"    return names.map(key_by_name)\n",
	"\n",
	"\n",
	"def sort_pairs_partition(part):\n",
	"    \"\"\"Sort one pandas partition by chromosome key, position and strand.\n",
	"\n",
	"    Temporary chrom1_key/chrom2_key columns give the chromosomes their\n",
	"    1,2,10 order; they are dropped again, so the result has the same\n",
	"    columns as the input.\n",
	"    \"\"\"\n",
	"    keyed = part.assign(\n",
	"        chrom1_key=_chrom_keys(part['chrom1']),\n",
	"        chrom2_key=_chrom_keys(part['chrom2']),\n",
	"    )\n",
	"    by = ['chrom1_key', 'chrom2_key', 'pos1', 'pos2', 'strand1', 'strand2']\n",
	"    return keyed.sort_values(by=by).drop(columns=['chrom1_key', 'chrom2_key'])\n",
	"\n",
	"\n",
	"# meta declares pos1/pos2 as 'Int64' so that it matches the dtypes the file was read with.\n",
	"# NOTE(review): map_partitions sorts each partition on its own, so the result is globally\n",
	"# sorted only if pairs_concat has a single partition - confirm via pairs_concat.npartitions.\n",
	"%time sorted_df = pairs_concat.map_partitions(sort_pairs_partition, meta={'read_id':'str','chrom1': 'str', 'pos1':'Int64', 'chrom2': 'str','pos2':'Int64','strand1': 'str', 'strand2': 'str', 'pair_type':'str'})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"id": "cbdc3013",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 5min 25s, sys: 2min 3s, total: 7min 28s\n",
	"Wall time: 7min 28s\n"
	]
	}
	],
	"source": [
	"# to_parquet writes each partition to a separate file inside the destination directory => not the single output file we are looking for.\n",
	"\"\"\" \n",
	"path : string or pathlib.Path\n",
	" Destination directory for data. Prepend with protocol like ``s3://``\n",
	" or ``hdfs://`` for remote data.\n",
	"compression : string or dict, default 'snappy'\n",
	" Either a string like ``\"snappy\"`` or a dictionary mapping column names\n",
	" to compressors like ``{\"name\": \"gzip\", \"values\": \"snappy\"}``. Defaults\n",
	" to ``\"snappy\"``.\n",
	"compute : bool, default True\n",
	" If ``True`` (default) then the result is computed immediately. If\n",
	" ``False`` then a ``dask.dataframe.Scalar`` object is returned for\n",
	" future computation.\n",
	"schema : pyarrow.Schema, dict, \"infer\", or None, default \"infer\"\n",
	" Global schema to use for the output dataset. Defaults to \"infer\", which\n",
	" will infer the schema from the dask dataframe metadata. This is usually\n",
	" sufficient for common schemas, but notably will fail for ``object``\n",
	" dtype columns that contain things other than strings. These columns\n",
	" will require an explicit schema be specified. The schema for a subset\n",
	" of columns can be overridden by passing in a dict of column names to\n",
	" pyarrow types (for example ``schema={\"field\": pa.string()}``); columns\n",
	" not present in this dict will still be automatically inferred.\n",
	" Alternatively, a full ``pyarrow.Schema`` may be passed, in which case\n",
	" no schema inference will be done. Passing in ``schema=None`` will\n",
	" disable the use of a global file schema - each written file may use a\n",
	" different schema dependent on the dtypes of the corresponding\n",
	" partition.\n",
	"\"\"\"\n",
	"\n",
	"%time out = sorted_df.to_parquet('test.parquet', compression='snappy', compute = True) "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"id": "f15833ff",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div><strong>Dask DataFrame Structure:</strong></div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>read_id</th>\n",
	" <th>chrom1</th>\n",
	" <th>pos1</th>\n",
	" <th>chrom2</th>\n",
	" <th>pos2</th>\n",
	" <th>strand1</th>\n",
	" <th>strand2</th>\n",
	" <th>pair_type</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>npartitions=7</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th></th>\n",
	" <td>string</td>\n",
	" <td>string</td>\n",
	" <td>Int64</td>\n",
	" <td>string</td>\n",
	" <td>Int64</td>\n",
	" <td>string</td>\n",
	" <td>string</td>\n",
	" <td>string</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th></th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>...</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th></th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th></th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<div>Dask Name: read_parquet, 1 expression</div>"
	],
	"text/plain": [
	"Dask DataFrame Structure:\n",
	" read_id chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type\n",
	"npartitions=7 \n",
	" string string Int64 string Int64 string string string\n",
	" ... ... ... ... ... ... ... ...\n",
	"... ... ... ... ... ... ... ... ...\n",
	" ... ... ... ... ... ... ... ...\n",
	" ... ... ... ... ... ... ... ...\n",
	"Dask Name: read_parquet, 1 expression\n",
	"Expr=ReadParquetFSSpec(f66a268)"
	]
	},
	"execution_count": 16,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%time df = dd.read_parquet('test.parquet')\n",
	"df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"id": "a9811098",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 8.29 ms, sys: 2.43 ms, total: 10.7 ms\n",
	"Wall time: 11.7 ms\n"
	]
	},
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>read_id</th>\n",
	" <th>chrom1</th>\n",
	" <th>pos1</th>\n",
	" <th>chrom2</th>\n",
	" <th>pos2</th>\n",
	" <th>strand1</th>\n",
	" <th>strand2</th>\n",
	" <th>pair_type</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3002468</td>\n",
	" <td>chr1</td>\n",
	" <td>7419565</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3002505</td>\n",
	" <td>chr1</td>\n",
	" <td>3002639</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3002505</td>\n",
	" <td>chr1</td>\n",
	" <td>3002644</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3002505</td>\n",
	" <td>chr1</td>\n",
	" <td>3002683</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3002505</td>\n",
	" <td>chr1</td>\n",
	" <td>3002699</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>...</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>65220629</th>\n",
	" <td>.</td>\n",
	" <td>chrY</td>\n",
	" <td>2902279</td>\n",
	" <td>chrY</td>\n",
	" <td>2902428</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>65220630</th>\n",
	" <td>.</td>\n",
	" <td>chrY</td>\n",
	" <td>2902282</td>\n",
	" <td>chrY</td>\n",
	" <td>2902487</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>65220631</th>\n",
	" <td>.</td>\n",
	" <td>chrY</td>\n",
	" <td>2902286</td>\n",
	" <td>chrY</td>\n",
	" <td>2902430</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>65220632</th>\n",
	" <td>.</td>\n",
	" <td>chrY</td>\n",
	" <td>2902375</td>\n",
	" <td>chrY</td>\n",
	" <td>2902555</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>65220633</th>\n",
	" <td>.</td>\n",
	" <td>chrY</td>\n",
	" <td>2902405</td>\n",
	" <td>chrY</td>\n",
	" <td>2902560</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>65220634 rows × 8 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" read_id chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type\n",
	"0 . chr1 3002468 chr1 7419565 + - LL\n",
	"1 . chr1 3002505 chr1 3002639 + - LL\n",
	"2 . chr1 3002505 chr1 3002644 + - LL\n",
	"3 . chr1 3002505 chr1 3002683 + - LL\n",
	"4 . chr1 3002505 chr1 3002699 + - LL\n",
	"... ... ... ... ... ... ... ... ...\n",
	"65220629 . chrY 2902279 chrY 2902428 + - LL\n",
	"65220630 . chrY 2902282 chrY 2902487 + - LL\n",
	"65220631 . chrY 2902286 chrY 2902430 + - LL\n",
	"65220632 . chrY 2902375 chrY 2902555 + - LL\n",
	"65220633 . chrY 2902405 chrY 2902560 + - LL\n",
	"\n",
	"[65220634 rows x 8 columns]"
	]
	},
	"execution_count": 22,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%time df.compute()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "0e6ca625",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" chromosomes chrom_sort_key\n",
	"0 chr1 1\n",
	"6 chr2 2\n",
	"11 chr3 3\n",
	"12 chr4 4\n",
	"2 chr5 5\n",
	"3 chr6 6\n",
	"7 chr7 7\n",
	"8 chr8 8\n",
	"9 chr9 9\n",
	"10 chr10 10\n",
	"13 chr11 11\n",
	"14 chr12 12\n",
	"15 chr13 13\n",
	"16 chr14 14\n",
	"17 chr15 15\n",
	"18 chr16 16\n",
	"19 chr17 17\n",
	"20 chr18 18\n",
	"4 chr19 19\n",
	"1 chrX 100\n",
	"5 chrM 102\n"
	]
	}
	],
	"source": [
	"#TEST BLOCK\n",
	"# Checks chromosome_sort_key on a small shuffled list of chromosome names.\n",
	"# Test-local variable names are used so the pipeline's df / sorted_df are not overwritten.\n",
	"\n",
	"data = {\n",
	"    'chromosomes': ['chr1', 'chrX', 'chr5', 'chr6', 'chr19', 'chrM',\n",
	"                    'chr2', 'chr7', 'chr8', 'chr9', 'chr10', 'chr3',\n",
	"                    'chr4', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',\n",
	"                    'chr16', 'chr17', 'chr18']\n",
	"}\n",
	"\n",
	"pdf = pd.DataFrame(data)\n",
	"# One partition on purpose: map_partitions sorts each partition on its own,\n",
	"# so a single partition makes the per-partition sort a full sort.\n",
	"chrom_test_ddf = dd.from_pandas(pdf, npartitions=1)\n",
	"\n",
	"def add_key_and_sort(part):\n",
	"    \"\"\"Attach the chromosome rank as chrom_sort_key and sort the partition by it.\"\"\"\n",
	"    keyed = part.assign(chrom_sort_key=part['chromosomes'].map(chromosome_sort_key))\n",
	"    return keyed.sort_values('chrom_sort_key')\n",
	"\n",
	"# Sort the DataFrame by the 'chromosomes' column\n",
	"chrom_test_sorted = chrom_test_ddf.map_partitions(\n",
	"    add_key_and_sort,\n",
	"    meta={'chromosomes': 'object', 'chrom_sort_key': int},\n",
	")\n",
	"\n",
	"# Compute to get the sorted result\n",
	"result = chrom_test_sorted.compute()\n",
	"\n",
	"# Fail loudly if the key does not produce natural chromosome order.\n",
	"assert result['chrom_sort_key'].is_monotonic_increasing, 'chromosomes are not in natural order'\n",
	"\n",
	"print(result)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "main",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.19"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}
No results found