Created
March 11, 2025 12:25
-
-
Save gnzsnz/048f8c2749f0c73716c69138c157adea to your computer and use it in GitHub Desktop.
Xopen lz4 benchmark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "id": "a5965252-2f15-46bd-8829-17a998a6e41b", | |
| "metadata": {}, | |
| "source": [ | |
| "# LZ4 benchmark" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "e54f54ab-e867-47fb-a1ad-8620de04c123", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import os\n", | |
| "import subprocess\n", | |
| "import time\n", | |
| "from pathlib import Path\n", | |
| "\n", | |
| "import lz4.frame\n", | |
| "import numpy as np" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "79317802-27a6-4c40-9bdf-79f62c23e631", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# directories\n", | |
| "OUTPUT_DIR = Path(\"test_files\")\n", | |
| "COMPRESS_DIR = Path(\"compress_files\")\n", | |
| "DECOMPRESS_DIR = Path(\"decompress_files\")\n", | |
| "\n", | |
| "# create\n", | |
| "Path(OUTPUT_DIR).mkdir(exist_ok=True)\n", | |
| "Path(COMPRESS_DIR).mkdir(exist_ok=True)\n", | |
| "Path(DECOMPRESS_DIR).mkdir(exist_ok=True)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "9c63b164-0a08-481b-8101-44ce6aa12ef5", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Configuration\n", | |
| "SIZES = [\n", | |
| " 1024,\n", | |
| " 1024 * 10,\n", | |
| " 1024 * 100,\n", | |
| " 1024 * 1024,\n", | |
| " 1024 * 1024 * 10,\n", | |
| " 1024 * 1024 * 100,\n", | |
| "] # 1KB to 100MB\n", | |
| "RUNS_PER_TEST = 100 # Number of runs for each test\n", | |
| "\n", | |
| "\n", | |
| "def generate_test_file(size: int):\n", | |
| " \"\"\"Generate a test file with random data\"\"\"\n", | |
| " test_file = OUTPUT_DIR / f\"test_{size}.bin\"\n", | |
| "\n", | |
| " with open(test_file, \"wb\") as f:\n", | |
| " f.write(os.urandom(size))\n", | |
| " return test_file\n", | |
| "\n", | |
| "\n", | |
| "def generate_compressed_files(data_dir: Path, output_dir: Path):\n", | |
| " for file in data_dir.iterdir():\n", | |
| " # compress file\n", | |
| " lz4_filename = output_dir / (file.name + \".lz4\")\n", | |
| " # Read data for Python test\n", | |
| " with open(file, \"rb\") as f:\n", | |
| " data = f.read()\n", | |
| " with lz4.frame.open(\n", | |
| " lz4_filename, \"wb\", compression_level=lz4.frame.COMPRESSIONLEVEL_MIN\n", | |
| " ) as cf:\n", | |
| " cf.write(data)\n", | |
| "\n", | |
| "\n", | |
| "def benchmark_python_lz4(filename: Path, runs: int):\n", | |
| " \"\"\"Benchmark Python lz4 compression\"\"\"\n", | |
| " times = []\n", | |
| " lz4_filename: Path = filename.with_suffix(\".lz4\")\n", | |
| " for _ in range(runs):\n", | |
| " start_time = time.perf_counter()\n", | |
| " # Read data for Python test\n", | |
| " with open(filename, \"rb\") as f:\n", | |
| " data = f.read()\n", | |
| " with lz4.frame.open(\n", | |
| " lz4_filename, \"wb\", compression_level=lz4.frame.COMPRESSIONLEVEL_MIN\n", | |
| " ) as cf:\n", | |
| " cf.write(data)\n", | |
| " end_time = time.perf_counter()\n", | |
| " times.append(end_time - start_time)\n", | |
| " # Clean up compressed file\n", | |
| " if lz4_filename.exists():\n", | |
| " os.remove(lz4_filename)\n", | |
| " return times\n", | |
| "\n", | |
| "\n", | |
| "def benchmark_os_lz4(input_file: Path, runs: int):\n", | |
| " \"\"\"Benchmark OS lz4 command-line tool\"\"\"\n", | |
| " output_file = input_file.with_suffix(\".lz4\")\n", | |
| " times = []\n", | |
| "\n", | |
| " for _ in range(runs):\n", | |
| " start_time = time.perf_counter()\n", | |
| " subprocess.run(\n", | |
| " [\"lz4\", \"-f\", \"-1\", input_file, output_file],\n", | |
| " stdout=subprocess.DEVNULL,\n", | |
| " stderr=subprocess.DEVNULL,\n", | |
| " )\n", | |
| " end_time = time.perf_counter()\n", | |
| " times.append(end_time - start_time)\n", | |
| "\n", | |
| " # Clean up compressed file\n", | |
| " if output_file.exists():\n", | |
| " os.remove(output_file)\n", | |
| "\n", | |
| " return times\n", | |
| "\n", | |
| "\n", | |
| "def format_size(size):\n", | |
| " \"\"\"Format size in human-readable format\"\"\"\n", | |
| " for unit in [\"B\", \"KB\", \"MB\"]:\n", | |
| " if size < 1024:\n", | |
| " return f\"{size:.1f}{unit}\"\n", | |
| " size /= 1024" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "e7c35b25-2f2e-442e-bb2b-aaec002ad2d4", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def main_compress():\n", | |
| " print(\"Starting LZ4 compression benchmark...\")\n", | |
| " print(f\"Testing sizes: {[format_size(s) for s in SIZES]}\")\n", | |
| " print(f\"Runs per test: {RUNS_PER_TEST}\\n\")\n", | |
| "\n", | |
| " results = {\"python_lz4\": {}, \"os_lz4\": {}}\n", | |
| "\n", | |
| " # Run benchmarks\n", | |
| " for size in SIZES:\n", | |
| " # Generate test file\n", | |
| " filename = generate_test_file(size)\n", | |
| "\n", | |
| " # Benchmark Python lz4\n", | |
| " results[\"python_lz4\"][size] = benchmark_python_lz4(filename, RUNS_PER_TEST)\n", | |
| "\n", | |
| " # Benchmark OS lz4\n", | |
| " os_times = benchmark_os_lz4(filename, RUNS_PER_TEST)\n", | |
| " results[\"os_lz4\"][size] = os_times\n", | |
| "\n", | |
| " # Clean up\n", | |
| " # os.remove(filename)\n", | |
| "\n", | |
| " # Print results\n", | |
| " print(\"Results Compression (times in seconds):\")\n", | |
| " print(\"-\" * 80)\n", | |
| " print(f\"{'Size':>8} | {'Python LZ4 (avg ± std)':>25} | {'OS LZ4 (avg ± std)':>25}\")\n", | |
| " print(\"-\" * 80)\n", | |
| "\n", | |
| " for size in SIZES:\n", | |
| " py_avg = np.mean(results[\"python_lz4\"][size])\n", | |
| " py_std = np.std(results[\"python_lz4\"][size])\n", | |
| " os_avg = np.mean(results[\"os_lz4\"][size])\n", | |
| " os_std = np.std(results[\"os_lz4\"][size])\n", | |
| "\n", | |
| " print(\n", | |
| " f\"{format_size(size):>8} | {py_avg:.6f} ± {py_std:.6f} | {os_avg:.6f} ± {os_std:.6f}\"\n", | |
| " )\n", | |
| "\n", | |
| " print(\"\\nNote: Lower times are better. Results show mean ± standard deviation.\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "e875019d-5816-43d6-9ecb-c9501682ac02", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Starting LZ4 compression benchmark...\n", | |
| "Testing sizes: ['1.0KB', '10.0KB', '100.0KB', '1.0MB', '10.0MB', '100.0MB']\n", | |
| "Runs per test: 100\n", | |
| "\n", | |
| "Results Compression (times in seconds):\n", | |
| "--------------------------------------------------------------------------------\n", | |
| " Size | Python LZ4 (avg ± std) | OS LZ4 (avg ± std)\n", | |
| "--------------------------------------------------------------------------------\n", | |
| " 1.0KB | 0.000222 ± 0.000135 | 0.002831 ± 0.000895\n", | |
| " 10.0KB | 0.000153 ± 0.000044 | 0.002343 ± 0.000167\n", | |
| " 100.0KB | 0.000140 ± 0.000072 | 0.002443 ± 0.000308\n", | |
| " 1.0MB | 0.000696 ± 0.000833 | 0.003184 ± 0.000342\n", | |
| " 10.0MB | 0.006929 ± 0.002354 | 0.008113 ± 0.000971\n", | |
| " 100.0MB | 0.084589 ± 0.057809 | 0.056928 ± 0.062384\n", | |
| "\n", | |
| "Note: Lower times are better. Results show mean ± standard deviation.\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "main_compress()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "081453b8-03f5-4fb2-ba1d-70f96f513100", | |
| "metadata": {}, | |
| "source": [ | |
| "## Decompress benchmark" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "d4d5f832-4a1a-4384-84ef-386844940e4f", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# generate compressed files to test decompression\n", | |
| "generate_compressed_files(OUTPUT_DIR, COMPRESS_DIR)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "8b814769-9f99-4df7-9b59-46e96c1c04df", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "\n", | |
| "def benchmark_python_lz4_decompress(filename: Path, runs: int):\n", | |
| " \"\"\"Benchmark Python lz4 decompression\"\"\"\n", | |
| " times = []\n", | |
| " uc_filename: Path = DECOMPRESS_DIR / (filename.with_suffix(\".bin\").name)\n", | |
| " for _ in range(runs):\n", | |
| " start_time = time.perf_counter()\n", | |
| " # Decompress data for Python test\n", | |
| " with lz4.frame.open(\n", | |
| " filename, \"rb\", compression_level=lz4.frame.COMPRESSIONLEVEL_MIN\n", | |
| " ) as cf:\n", | |
| " data = cf.read()\n", | |
| " with open(uc_filename, \"wb\") as f:\n", | |
| " f.write(data)\n", | |
| " end_time = time.perf_counter()\n", | |
| " times.append(end_time - start_time)\n", | |
| " # Clean up decompressed file\n", | |
| " if uc_filename.exists():\n", | |
| " os.remove(uc_filename)\n", | |
| " return times\n", | |
| "\n", | |
| "\n", | |
| "def benchmark_os_lz4_decompress(input_file: Path, runs: int):\n", | |
| " \"\"\"Benchmark OS lz4 command-line tool decompression\"\"\"\n", | |
| " uc_filename: Path = DECOMPRESS_DIR / (input_file.with_suffix(\".bin\").name)\n", | |
| " times = []\n", | |
| " for _ in range(runs):\n", | |
| " start_time = time.perf_counter()\n", | |
| " subprocess.run(\n", | |
| " [\"lz4\", \"-f\", \"-1\", \"-d\", input_file, uc_filename],\n", | |
| " stdout=subprocess.DEVNULL,\n", | |
| " stderr=subprocess.DEVNULL,\n", | |
| " )\n", | |
| " end_time = time.perf_counter()\n", | |
| " times.append(end_time - start_time)\n", | |
| "\n", | |
| " # Clean up decompressed file\n", | |
| " if uc_filename.exists():\n", | |
| " os.remove(uc_filename)\n", | |
| "\n", | |
| " return times\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "8c59b895-7076-4535-a159-69b5cdb588b6", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def main_decompress():\n", | |
| " print(\"Starting LZ4 compression benchmark...\")\n", | |
| " print(f\"Testing sizes: {[format_size(s) for s in SIZES]}\")\n", | |
| " print(f\"Runs per test: {RUNS_PER_TEST}\\n\")\n", | |
| "\n", | |
| " results = {\"python_lz4\": {}, \"os_lz4\": {}}\n", | |
| "\n", | |
| " # Run benchmarks (NOTE: iterdir() order is arbitrary, so files may not match SIZES)\n", | |
| " for size,filename in zip(SIZES,COMPRESS_DIR.iterdir()):\n", | |
| "\n", | |
| " # Benchmark Python lz4\n", | |
| " results[\"python_lz4\"][size] = benchmark_python_lz4_decompress(\n", | |
| " filename, RUNS_PER_TEST\n", | |
| " )\n", | |
| "\n", | |
| " # Benchmark OS lz4\n", | |
| " results[\"os_lz4\"][size] = benchmark_os_lz4_decompress(filename, RUNS_PER_TEST)\n", | |
| "\n", | |
| " # Print results\n", | |
| " print(\"Results Decompression (times in seconds):\")\n", | |
| " print(\"-\" * 80)\n", | |
| " print(f\"{'Size':>8} | {'Python LZ4 (avg ± std)':>25} | {'OS LZ4 (avg ± std)':>25}\")\n", | |
| " print(\"-\" * 80)\n", | |
| "\n", | |
| " for size in SIZES:\n", | |
| " py_avg = np.mean(results[\"python_lz4\"][size])\n", | |
| " py_std = np.std(results[\"python_lz4\"][size])\n", | |
| " os_avg = np.mean(results[\"os_lz4\"][size])\n", | |
| " os_std = np.std(results[\"os_lz4\"][size])\n", | |
| "\n", | |
| " print(\n", | |
| " f\"{format_size(size):>8} | {py_avg:.6f} ± {py_std:.6f} | {os_avg:.6f} ± {os_std:.6f}\"\n", | |
| " )\n", | |
| "\n", | |
| " print(\"\\nNote: Lower times are better. Results show mean ± standard deviation.\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "id": "8ded2ab1-c81d-4166-8cfc-0849ca5a8086", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Starting LZ4 compression benchmark...\n", | |
| "Testing sizes: ['1.0KB', '10.0KB', '100.0KB', '1.0MB', '10.0MB', '100.0MB']\n", | |
| "Runs per test: 100\n", | |
| "\n", | |
| "Results Decompression (times in seconds):\n", | |
| "--------------------------------------------------------------------------------\n", | |
| " Size | Python LZ4 (avg ± std) | OS LZ4 (avg ± std)\n", | |
| "--------------------------------------------------------------------------------\n", | |
| " 1.0KB | 0.103372 ± 0.002786 | 0.051380 ± 0.045575\n", | |
| " 10.0KB | 0.000817 ± 0.000073 | 0.005209 ± 0.016507\n", | |
| " 100.0KB | 0.000167 ± 0.000027 | 0.002811 ± 0.000130\n", | |
| " 1.0MB | 0.000096 ± 0.000019 | 0.002655 ± 0.000044\n", | |
| " 10.0MB | 0.010892 ± 0.000622 | 0.008884 ± 0.003092\n", | |
| " 100.0MB | 0.000113 ± 0.000028 | 0.002750 ± 0.000164\n", | |
| "\n", | |
| "Note: Lower times are better. Results show mean ± standard deviation.\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "main_decompress()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "9fcd8022-9cf3-4dbd-a96f-1d9d36ce9205", | |
| "metadata": {}, | |
| "source": [ | |
| "## Xopen" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "id": "63cd7b25-db29-4310-8d84-fa4d3876adb3", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import xopen" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "id": "aaccaf23-4dd3-4fce-8a15-782e61785914", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "\n", | |
| "def benchmark_python_lz4_xopen(filename: Path, runs: int):\n", | |
| " \"\"\"Benchmark lz4 compression through xopen.xopen\"\"\"\n", | |
| " times = []\n", | |
| " lz4_filename: Path = filename.with_suffix(\".lz4\")\n", | |
| " for _ in range(runs):\n", | |
| " start_time = time.perf_counter()\n", | |
| " # Read data for Python test\n", | |
| " with open(filename, \"rb\") as f:\n", | |
| " data = f.read()\n", | |
| " with xopen.xopen(\n", | |
| " lz4_filename, \"wb\", compresslevel=0,threads=0\n", | |
| " ) as cf:\n", | |
| " cf.write(data)\n", | |
| " end_time = time.perf_counter()\n", | |
| " times.append(end_time - start_time)\n", | |
| " # Clean up compressed file\n", | |
| " if lz4_filename.exists():\n", | |
| " os.remove(lz4_filename)\n", | |
| " return times\n", | |
| "\n", | |
| "\n", | |
| "def benchmark_python_lz4_xopen_nb(filename: Path, runs: int):\n", | |
| " \"\"\"Benchmark lz4 compression through xopen.xopen_nb\"\"\"\n", | |
| " times = []\n", | |
| " lz4_filename: Path = filename.with_suffix(\".lz4\")\n", | |
| " for _ in range(runs):\n", | |
| " start_time = time.perf_counter()\n", | |
| " # Read data for Python test\n", | |
| " with open(filename, \"rb\") as f:\n", | |
| " data = f.read()\n", | |
| " with xopen.xopen_nb(\n", | |
| " lz4_filename, \"wb\", compresslevel=0,threads=0\n", | |
| " ) as cf:\n", | |
| " cf.write(data)\n", | |
| " end_time = time.perf_counter()\n", | |
| " times.append(end_time - start_time)\n", | |
| " # Clean up compressed file\n", | |
| " if lz4_filename.exists():\n", | |
| " os.remove(lz4_filename)\n", | |
| " return times\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "id": "79856da8-a75f-489e-a99d-7d098ca00b4d", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def main_xopen():\n", | |
| " print(\"Starting LZ4 compression benchmark...\")\n", | |
| " print(f\"Testing sizes: {[format_size(s) for s in SIZES]}\")\n", | |
| " print(f\"Runs per test: {RUNS_PER_TEST}\\n\")\n", | |
| "\n", | |
| " results = {\"xopen\": {}, \"xopen_nb\": {}}\n", | |
| "\n", | |
| " # Run benchmarks\n", | |
| " for size in SIZES:\n", | |
| " # Generate test file\n", | |
| " filename = generate_test_file(size)\n", | |
| "\n", | |
| " # Benchmark xopen\n", | |
| " results[\"xopen\"][size] = benchmark_python_lz4_xopen(filename, RUNS_PER_TEST)\n", | |
| "\n", | |
| " # Benchmark xopen_nb\n", | |
| " os_times = benchmark_python_lz4_xopen_nb(filename, RUNS_PER_TEST)\n", | |
| " results[\"xopen_nb\"][size] = os_times\n", | |
| "\n", | |
| " # Clean up\n", | |
| " # os.remove(filename)\n", | |
| "\n", | |
| " # Print results\n", | |
| " print(\"Results Compression (times in seconds):\")\n", | |
| " print(\"-\" * 80)\n", | |
| " print(f\"{'Size':>8} | {'xopen (avg ± std)':>25} | {'xopn no buffer (avg ± std)':>25}\")\n", | |
| " print(\"-\" * 80)\n", | |
| "\n", | |
| " for size in SIZES:\n", | |
| " py_avg = np.mean(results[\"xopen\"][size])\n", | |
| " py_std = np.std(results[\"xopen\"][size])\n", | |
| " os_avg = np.mean(results[\"xopen_nb\"][size])\n", | |
| " os_std = np.std(results[\"xopen_nb\"][size])\n", | |
| "\n", | |
| " print(\n", | |
| " f\"{format_size(size):>8} | {py_avg:.6f} ± {py_std:.6f} | {os_avg:.6f} ± {os_std:.6f}\"\n", | |
| " )\n", | |
| "\n", | |
| " print(\"\\nNote: Lower times are better. Results show mean ± standard deviation.\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "id": "508a936d-aef3-4850-bdf0-8d49c228fb15", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Starting LZ4 compression benchmark...\n", | |
| "Testing sizes: ['1.0KB', '10.0KB', '100.0KB', '1.0MB', '10.0MB', '100.0MB']\n", | |
| "Runs per test: 100\n", | |
| "\n", | |
| "Results Compression (times in seconds):\n", | |
| "--------------------------------------------------------------------------------\n", | |
| " Size | xopen (avg ± std) | xopn no buffer (avg ± std)\n", | |
| "--------------------------------------------------------------------------------\n", | |
| " 1.0KB | 0.000191 ± 0.000079 | 0.000105 ± 0.000020\n", | |
| " 10.0KB | 0.000164 ± 0.000049 | 0.000168 ± 0.000130\n", | |
| " 100.0KB | 0.000216 ± 0.000092 | 0.000157 ± 0.000015\n", | |
| " 1.0MB | 0.000719 ± 0.000501 | 0.000596 ± 0.000468\n", | |
| " 10.0MB | 0.006572 ± 0.002621 | 0.005854 ± 0.001389\n", | |
| " 100.0MB | 0.074405 ± 0.018944 | 0.074006 ± 0.020324\n", | |
| "\n", | |
| "Note: Lower times are better. Results show mean ± standard deviation.\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "main_xopen()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "7c20e1ec-1b23-4eb6-9086-62c8d91f4af5", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.9" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment