Created
September 3, 2024 10:24
-
-
Save ayaksvals/5b013e092b413ad0d2543943a810a7cd to your computer and use it in GitHub Desktop.
read, sort, save .parquet file with DuckDB
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import duckdb\n", | |
| "\n", | |
| "# Connect to DuckDB in-memory database\n", | |
| "con = duckdb.connect()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 3min 59s, sys: 53.1 s, total: 4min 52s\n", | |
| "Wall time: 2min 33s\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "<duckdb.duckdb.DuckDBPyConnection at 0x2aed4b899ef0>" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "\n", | |
| "input_file = 'NIPBL_R1.parquet'\n", | |
| "output_file = 'NIPBL_R1_sorted_duckdb.parquet'\n", | |
| "schema = \"\"\"\n", | |
| " read_id: VARCHAR,\n", | |
| " chrom1: VARCHAR,\n", | |
| " pos1: BIGINT,\n", | |
| " chrom2: VARCHAR,\n", | |
| " pos2: BIGINT,\n", | |
| " strand1: VARCHAR,\n", | |
| " strand2: VARCHAR,\n", | |
| " pairs_type: VARCHAR\n", | |
| "\"\"\"\n", | |
| "con.execute(f\"\"\"\n", | |
| " COPY (\n", | |
| " SELECT CAST(read_id AS VARCHAR), CAST(chrom1 AS VARCHAR), \n", | |
| " CAST(pos1 AS BIGINT), CAST(chrom2 AS VARCHAR), \n", | |
| " CAST(pos2 AS BIGINT), CAST(strand1 AS VARCHAR),\n", | |
| " CAST(strand2 AS VARCHAR), CAST(pairs_type AS VARCHAR)\n", | |
| " FROM read_parquet('{input_file}')\n", | |
| " WHERE read_id NOT LIKE '#%' -- Filter out lines that start with '#samheader'\n", | |
| " ORDER BY chrom1, chrom2, pos1, pos2\n", | |
| " ) TO '{output_file}' (FORMAT PARQUET);\n", | |
| "\"\"\")\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "main", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.9.19" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment