Last active
August 19, 2024 16:01
-
-
Save ayaksvals/bea901efee88e458ffd5c93fc152004c to your computer and use it in GitHub Desktop.
Read Csv, sort, save to parquet. TO FIX: dtypes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "d30d1c6f", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import bioframe\n", | |
| "import pypairix\n", | |
| "import dask.dataframe as dd\n", | |
| "import dask.array as da\n", | |
| "\n", | |
| "import numpy as np\n", | |
| "import pandas as pd\n", | |
| "import numba\n", | |
| "\n", | |
| "import pypairix\n", | |
| "import pysam\n", | |
| "from dask.base import tokenize\n", | |
| "import dask.dataframe as dd\n", | |
| "import dask.array as da\n", | |
| "import dask\n", | |
| "from dask.dataframe.core import new_dd_object\n", | |
| "from dask.delayed import delayed\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "f6356134", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 3min 21s, sys: 51.2 s, total: 4min 12s\n", | |
| "Wall time: 4min 12s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Column schema of the .pairs body: text columns are read as 'str', positions\n", | |
| "# use pandas' nullable 'Int64' dtype.\n", | |
| "dtype_dict = {\n", | |
| "    'read_id': 'str',\n", | |
| "    'chrom1': 'str',\n", | |
| "    'pos1': 'Int64',\n", | |
| "    'chrom2': 'str',\n", | |
| "    'pos2': 'Int64',\n", | |
| "    'strand1': 'str',\n", | |
| "    'strand2': 'str',\n", | |
| "    'pair_type': 'str'\n", | |
| "}\n", | |
| "\n", | |
| "# Wrap the *function*, not its result: delayed(pd.read_csv)(...) only records the\n", | |
| "# call, whereas delayed(pd.read_csv(...)) parses the whole file right here and\n", | |
| "# embeds the finished DataFrame in the task graph.\n", | |
| "# The file is therefore parsed when the graph is computed; dd.from_delayed will\n", | |
| "# also compute it once to infer the schema unless meta= is supplied there.\n", | |
| "# skiprows=200 skips the first 200 lines of the file (presumably the header block\n", | |
| "# of this particular file -- TODO confirm). Column names come from dtype_dict so\n", | |
| "# the two cannot drift apart.\n", | |
| "%time dfs = delayed(pd.read_csv)('NIPBL_R1.nodups.pairs.gz', sep='\\t', skiprows=200, on_bad_lines='skip', names=list(dtype_dict), dtype=dtype_dict)\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "71366242", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 12.6 ms, sys: 9.64 ms, total: 22.2 ms\n", | |
| "Wall time: 23.1 ms\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%time df = dd.from_delayed(dfs)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "cbf7c254", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 6.06 ms, sys: 10 ms, total: 16.1 ms\n", | |
| "Wall time: 14.5 ms\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>read_id</th>\n", | |
| " <th>chrom1</th>\n", | |
| " <th>pos1</th>\n", | |
| " <th>chrom2</th>\n", | |
| " <th>pos2</th>\n", | |
| " <th>strand1</th>\n", | |
| " <th>strand2</th>\n", | |
| " <th>pair_type</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002468</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>7419565</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002505</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002639</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002505</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002644</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002505</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002683</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002505</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002699</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>65220629</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902279</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902428</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>65220630</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902282</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902487</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>65220631</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902286</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902430</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>65220632</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902375</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902555</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>65220633</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902405</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902560</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>65220634 rows × 8 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " read_id chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type\n", | |
| "0 . chr1 3002468 chr1 7419565 + - LL\n", | |
| "1 . chr1 3002505 chr1 3002639 + - LL\n", | |
| "2 . chr1 3002505 chr1 3002644 + - LL\n", | |
| "3 . chr1 3002505 chr1 3002683 + - LL\n", | |
| "4 . chr1 3002505 chr1 3002699 + - LL\n", | |
| "... ... ... ... ... ... ... ... ...\n", | |
| "65220629 . chrY 2902279 chrY 2902428 + - LL\n", | |
| "65220630 . chrY 2902282 chrY 2902487 + - LL\n", | |
| "65220631 . chrY 2902286 chrY 2902430 + - LL\n", | |
| "65220632 . chrY 2902375 chrY 2902555 + - LL\n", | |
| "65220633 . chrY 2902405 chrY 2902560 + - LL\n", | |
| "\n", | |
| "[65220634 rows x 8 columns]" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%time df.compute()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "9e866592", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pairs_concat=df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "a331f83f", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "#pairs_concat=pairs_concat.compute()\n", | |
| "#pairs_concat" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "f074accb", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Custom sorting function to sort chromosomes\n", | |
| "# Custom sorting function to sort chromosomes\n", | |
| "def chromosome_sort_key(chrom, unplaced_key=103):\n", | |
| "    \"\"\"Return an integer rank that puts chromosome names in natural order.\n", | |
| "\n", | |
| "    Args:\n", | |
| "        chrom: Chromosome name such as 'chr1', 'chr19' or 'chrX'.\n", | |
| "        unplaced_key: Rank returned for any value that is neither special-cased\n", | |
| "            nor three characters followed by an integer (e.g. 'chrUn_GL456239'\n", | |
| "            or a missing value). Defaults to 103, i.e. right after chrM. All\n", | |
| "            such values share this one rank.\n", | |
| "\n", | |
| "    Returns:\n", | |
| "        int: 100, 101 and 102 for chrX, chrY and chrM; the parsed number for\n", | |
| "        names like 'chr7'; otherwise unplaced_key.\n", | |
| "    \"\"\"\n", | |
| "    special_keys = {'chrX': 100, 'chrY': 101, 'chrM': 102}\n", | |
| "    if chrom in special_keys:\n", | |
| "        return special_keys[chrom]\n", | |
| "    try:\n", | |
| "        # Drop the first three characters (the 'chr' prefix) and parse the rest.\n", | |
| "        return int(chrom[3:])\n", | |
| "    except (TypeError, ValueError):\n", | |
| "        # Non-numeric suffix or non-string value: rank it last instead of raising,\n", | |
| "        # so a single odd contig name cannot abort a whole sort.\n", | |
| "        return unplaced_key\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "id": "4c0f077b", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 3.3 ms, sys: 1.08 ms, total: 4.38 ms\n", | |
| "Wall time: 3.81 ms\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# ADDED SORT PART\n", | |
| "\n", | |
| "# Variant 1: chrom1 and chrom2 1,10,2\n", | |
| "# Sort Time+Compute=7m 50s; to parquet=21s\n", | |
| "# Sort: 0s, Compute+to parquet: 14m\n", | |
| "#sorted_df = pairs_concat.sort_values(by=['chrom1', 'chrom2', 'pos1', 'pos2', 'strand1', 'strand2'])\n", | |
| "\n", | |
| "# Variant 2:chrom1 and chrom2 1,2,10\n", | |
| "def sort_pairs_partition(part):\n", | |
| "    \"\"\"Sort one pandas partition of pairs with chromosomes in natural order.\n", | |
| "\n", | |
| "    Temporary integer key columns are derived from chrom1/chrom2 with\n", | |
| "    chromosome_sort_key, used as the leading sort keys, and dropped again, so\n", | |
| "    the returned frame has the same columns as the input.\n", | |
| "    \"\"\"\n", | |
| "    keyed = part.assign(\n", | |
| "        chrom1_key=part['chrom1'].map(chromosome_sort_key),\n", | |
| "        chrom2_key=part['chrom2'].map(chromosome_sort_key),\n", | |
| "    )\n", | |
| "    ordered = keyed.sort_values(by=['chrom1_key', 'chrom2_key', 'pos1', 'pos2', 'strand1', 'strand2'])\n", | |
| "    return ordered.drop(columns=['chrom1_key', 'chrom2_key'])\n", | |
| "\n", | |
| "# Output schema handed to dask. pos1/pos2 are declared as nullable 'Int64' because\n", | |
| "# that is the dtype they are read with; declaring plain int would make the\n", | |
| "# metadata disagree with the actual partitions.\n", | |
| "pairs_meta = {'read_id': 'str', 'chrom1': 'str', 'pos1': 'Int64', 'chrom2': 'str', 'pos2': 'Int64', 'strand1': 'str', 'strand2': 'str', 'pair_type': 'str'}\n", | |
| "\n", | |
| "# NOTE: map_partitions sorts each partition on its own; the result is globally\n", | |
| "# sorted only when pairs_concat has a single partition.\n", | |
| "%time sorted_df = pairs_concat.map_partitions(sort_pairs_partition, meta=pairs_meta)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "id": "cbdc3013", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 5min 25s, sys: 2min 3s, total: 7min 28s\n", | |
| "Wall time: 7min 28s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# NOTE: to_parquet writes one file per partition into the destination directory, which is not the single-file output we are looking for.\n", | |
| "\"\"\" \n", | |
| "path : string or pathlib.Path\n", | |
| " Destination directory for data. Prepend with protocol like ``s3://``\n", | |
| " or ``hdfs://`` for remote data.\n", | |
| "compression : string or dict, default 'snappy'\n", | |
| " Either a string like ``\"snappy\"`` or a dictionary mapping column names\n", | |
| " to compressors like ``{\"name\": \"gzip\", \"values\": \"snappy\"}``. Defaults\n", | |
| " to ``\"snappy\"``.\n", | |
| "compute : bool, default True\n", | |
| " If ``True`` (default) then the result is computed immediately. If\n", | |
| " ``False`` then a ``dask.dataframe.Scalar`` object is returned for\n", | |
| " future computation.\n", | |
| "schema : pyarrow.Schema, dict, \"infer\", or None, default \"infer\"\n", | |
| " Global schema to use for the output dataset. Defaults to \"infer\", which\n", | |
| " will infer the schema from the dask dataframe metadata. This is usually\n", | |
| " sufficient for common schemas, but notably will fail for ``object``\n", | |
| " dtype columns that contain things other than strings. These columns\n", | |
| " will require an explicit schema be specified. The schema for a subset\n", | |
| " of columns can be overridden by passing in a dict of column names to\n", | |
| " pyarrow types (for example ``schema={\"field\": pa.string()}``); columns\n", | |
| " not present in this dict will still be automatically inferred.\n", | |
| " Alternatively, a full ``pyarrow.Schema`` may be passed, in which case\n", | |
| " no schema inference will be done. Passing in ``schema=None`` will\n", | |
| " disable the use of a global file schema - each written file may use a\n", | |
| " different schema dependent on the dtypes of the corresponding\n", | |
| " partition.\n", | |
| "\"\"\"\n", | |
| "\n", | |
| "%time out = sorted_df.to_parquet('test.parquet', compression='snappy', compute = True) " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "id": "f15833ff", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div><strong>Dask DataFrame Structure:</strong></div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>read_id</th>\n", | |
| " <th>chrom1</th>\n", | |
| " <th>pos1</th>\n", | |
| " <th>chrom2</th>\n", | |
| " <th>pos2</th>\n", | |
| " <th>strand1</th>\n", | |
| " <th>strand2</th>\n", | |
| " <th>pair_type</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>npartitions=7</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th></th>\n", | |
| " <td>string</td>\n", | |
| " <td>string</td>\n", | |
| " <td>Int64</td>\n", | |
| " <td>string</td>\n", | |
| " <td>Int64</td>\n", | |
| " <td>string</td>\n", | |
| " <td>string</td>\n", | |
| " <td>string</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th></th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th></th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th></th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<div>Dask Name: read_parquet, 1 expression</div>" | |
| ], | |
| "text/plain": [ | |
| "Dask DataFrame Structure:\n", | |
| " read_id chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type\n", | |
| "npartitions=7 \n", | |
| " string string Int64 string Int64 string string string\n", | |
| " ... ... ... ... ... ... ... ...\n", | |
| "... ... ... ... ... ... ... ... ...\n", | |
| " ... ... ... ... ... ... ... ...\n", | |
| " ... ... ... ... ... ... ... ...\n", | |
| "Dask Name: read_parquet, 1 expression\n", | |
| "Expr=ReadParquetFSSpec(f66a268)" | |
| ] | |
| }, | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%time df = dd.read_parquet('test.parquet')\n", | |
| "df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "id": "a9811098", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 8.29 ms, sys: 2.43 ms, total: 10.7 ms\n", | |
| "Wall time: 11.7 ms\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>read_id</th>\n", | |
| " <th>chrom1</th>\n", | |
| " <th>pos1</th>\n", | |
| " <th>chrom2</th>\n", | |
| " <th>pos2</th>\n", | |
| " <th>strand1</th>\n", | |
| " <th>strand2</th>\n", | |
| " <th>pair_type</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002468</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>7419565</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002505</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002639</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002505</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002644</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002505</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002683</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002505</td>\n", | |
| " <td>chr1</td>\n", | |
| " <td>3002699</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>65220629</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902279</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902428</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>65220630</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902282</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902487</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>65220631</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902286</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902430</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>65220632</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902375</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902555</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>65220633</th>\n", | |
| " <td>.</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902405</td>\n", | |
| " <td>chrY</td>\n", | |
| " <td>2902560</td>\n", | |
| " <td>+</td>\n", | |
| " <td>-</td>\n", | |
| " <td>LL</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>65220634 rows × 8 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " read_id chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type\n", | |
| "0 . chr1 3002468 chr1 7419565 + - LL\n", | |
| "1 . chr1 3002505 chr1 3002639 + - LL\n", | |
| "2 . chr1 3002505 chr1 3002644 + - LL\n", | |
| "3 . chr1 3002505 chr1 3002683 + - LL\n", | |
| "4 . chr1 3002505 chr1 3002699 + - LL\n", | |
| "... ... ... ... ... ... ... ... ...\n", | |
| "65220629 . chrY 2902279 chrY 2902428 + - LL\n", | |
| "65220630 . chrY 2902282 chrY 2902487 + - LL\n", | |
| "65220631 . chrY 2902286 chrY 2902430 + - LL\n", | |
| "65220632 . chrY 2902375 chrY 2902555 + - LL\n", | |
| "65220633 . chrY 2902405 chrY 2902560 + - LL\n", | |
| "\n", | |
| "[65220634 rows x 8 columns]" | |
| ] | |
| }, | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%time df.compute()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "0e6ca625", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " chromosomes chrom_sort_key\n", | |
| "0 chr1 1\n", | |
| "6 chr2 2\n", | |
| "11 chr3 3\n", | |
| "12 chr4 4\n", | |
| "2 chr5 5\n", | |
| "3 chr6 6\n", | |
| "7 chr7 7\n", | |
| "8 chr8 8\n", | |
| "9 chr9 9\n", | |
| "10 chr10 10\n", | |
| "13 chr11 11\n", | |
| "14 chr12 12\n", | |
| "15 chr13 13\n", | |
| "16 chr14 14\n", | |
| "17 chr15 15\n", | |
| "18 chr16 16\n", | |
| "19 chr17 17\n", | |
| "20 chr18 18\n", | |
| "4 chr19 19\n", | |
| "1 chrX 100\n", | |
| "5 chrM 102\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "#TEST BLOCK\n", | |
| "# Small check that chromosome_sort_key orders chromosome names naturally\n", | |
| "# (chr2 before chr10) when used as a sort key inside map_partitions.\n", | |
| "# Uses its own variable names so the notebook-wide df / sorted_df are not clobbered.\n", | |
| "\n", | |
| "data = {\n", | |
| "    'chromosomes': ['chr1', 'chrX', 'chr5', 'chr6', 'chr19', 'chrM',\n", | |
| "                    'chr2', 'chr7', 'chr8', 'chr9', 'chr10', 'chr3',\n", | |
| "                    'chr4', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',\n", | |
| "                    'chr16', 'chr17', 'chr18']\n", | |
| "}\n", | |
| "\n", | |
| "pdf = pd.DataFrame(data)\n", | |
| "# One partition on purpose: map_partitions sorts each partition independently,\n", | |
| "# so the result is globally ordered only when there is a single partition.\n", | |
| "chrom_ddf = dd.from_pandas(pdf, npartitions=1)\n", | |
| "# Sort the DataFrame by the 'chromosomes' column\n", | |
| "sorted_chrom_ddf = chrom_ddf.map_partitions(\n", | |
| "    lambda part: part.assign(chrom_sort_key=part['chromosomes'].map(chromosome_sort_key))\n", | |
| "                     .sort_values('chrom_sort_key'),\n", | |
| "    meta={'chromosomes': 'object', 'chrom_sort_key': int})\n", | |
| "\n", | |
| "# Compute to get the sorted result\n", | |
| "result = sorted_chrom_ddf.compute()\n", | |
| "\n", | |
| "# Fail loudly if the keys are ever not in ascending order.\n", | |
| "assert result['chrom_sort_key'].is_monotonic_increasing\n", | |
| "\n", | |
| "print(result)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "main", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.9.19" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment