Skip to content

Instantly share code, notes, and snippets.

@gnzsnz
Created March 11, 2025 12:25
Show Gist options
  • Select an option

  • Save gnzsnz/048f8c2749f0c73716c69138c157adea to your computer and use it in GitHub Desktop.

Select an option

Save gnzsnz/048f8c2749f0c73716c69138c157adea to your computer and use it in GitHub Desktop.
Xopen lz4 benchmark
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "a5965252-2f15-46bd-8829-17a998a6e41b",
"metadata": {},
"source": [
"# LZ4 benchmark"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e54f54ab-e867-47fb-a1ad-8620de04c123",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import subprocess\n",
"import time\n",
"from pathlib import Path\n",
"\n",
"import lz4.frame\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "79317802-27a6-4c40-9bdf-79f62c23e631",
"metadata": {},
"outputs": [],
"source": [
"# directories\n",
"OUTPUT_DIR = Path(\"test_files\")\n",
"COMPRESS_DIR = Path(\"compress_files\")\n",
"DECOMPRESS_DIR = Path(\"decompress_files\")\n",
"\n",
"# create\n",
"Path(OUTPUT_DIR).mkdir(exist_ok=True)\n",
"Path(COMPRESS_DIR).mkdir(exist_ok=True)\n",
"Path(DECOMPRESS_DIR).mkdir(exist_ok=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9c63b164-0a08-481b-8101-44ce6aa12ef5",
"metadata": {},
"outputs": [],
"source": [
"# Configuration\n",
"SIZES = [\n",
" 1024,\n",
" 1024 * 10,\n",
" 1024 * 100,\n",
" 1024 * 1024,\n",
" 1024 * 1024 * 10,\n",
" 1024 * 1024 * 100,\n",
"] # 1KB to 10MB\n",
"RUNS_PER_TEST = 100 # Number of runs for each test\n",
"\n",
"\n",
"def generate_test_file(size: int):\n",
" \"\"\"Generate a test file with random data\"\"\"\n",
" test_file = OUTPUT_DIR / f\"test_{size}.bin\"\n",
"\n",
" with open(test_file, \"wb\") as f:\n",
" f.write(os.urandom(size))\n",
" return test_file\n",
"\n",
"\n",
"def generate_compressed_files(data_dir: Path, output_dir: Path):\n",
" for file in data_dir.iterdir():\n",
" # compress file\n",
" lz4_filename = output_dir / (file.name + \".lz4\")\n",
" # Read data for Python test\n",
" with open(file, \"rb\") as f:\n",
" data = f.read()\n",
" with lz4.frame.open(\n",
" lz4_filename, \"wb\", compression_level=lz4.frame.COMPRESSIONLEVEL_MIN\n",
" ) as cf:\n",
" cf.write(data)\n",
"\n",
"\n",
"def benchmark_python_lz4(filename: Path, runs: int):\n",
" \"\"\"Benchmark Python lz4 compression\"\"\"\n",
" times = []\n",
" lz4_filename: Path = filename.with_suffix(\".lz4\")\n",
" for _ in range(runs):\n",
" start_time = time.perf_counter()\n",
" # Read data for Python test\n",
" with open(filename, \"rb\") as f:\n",
" data = f.read()\n",
" with lz4.frame.open(\n",
" lz4_filename, \"wb\", compression_level=lz4.frame.COMPRESSIONLEVEL_MIN\n",
" ) as cf:\n",
" cf.write(data)\n",
" end_time = time.perf_counter()\n",
" times.append(end_time - start_time)\n",
" # Clean up compressed file\n",
" if lz4_filename.exists():\n",
" os.remove(lz4_filename)\n",
" return times\n",
"\n",
"\n",
"def benchmark_os_lz4(input_file: Path, runs: int):\n",
" \"\"\"Benchmark OS lz4 command-line tool\"\"\"\n",
" output_file = input_file.with_suffix(\".lz4\")\n",
" times = []\n",
"\n",
" for _ in range(runs):\n",
" start_time = time.perf_counter()\n",
" subprocess.run(\n",
" [\"lz4\", \"-f\", \"-1\", input_file, output_file],\n",
" stdout=subprocess.DEVNULL,\n",
" stderr=subprocess.DEVNULL,\n",
" )\n",
" end_time = time.perf_counter()\n",
" times.append(end_time - start_time)\n",
"\n",
" # Clean up compressed file\n",
" if output_file.exists():\n",
" os.remove(output_file)\n",
"\n",
" return times\n",
"\n",
"\n",
"def format_size(size):\n",
" \"\"\"Format size in human-readable format\"\"\"\n",
" for unit in [\"B\", \"KB\", \"MB\"]:\n",
" if size < 1024:\n",
" return f\"{size:.1f}{unit}\"\n",
" size /= 1024"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e7c35b25-2f2e-442e-bb2b-aaec002ad2d4",
"metadata": {},
"outputs": [],
"source": [
"def main_compress():\n",
" print(\"Starting LZ4 compression benchmark...\")\n",
" print(f\"Testing sizes: {[format_size(s) for s in SIZES]}\")\n",
" print(f\"Runs per test: {RUNS_PER_TEST}\\n\")\n",
"\n",
" results = {\"python_lz4\": {}, \"os_lz4\": {}}\n",
"\n",
" # Run benchmarks\n",
" for size in SIZES:\n",
" # Generate test file\n",
" filename = generate_test_file(size)\n",
"\n",
" # Benchmark Python lz4\n",
" results[\"python_lz4\"][size] = benchmark_python_lz4(filename, RUNS_PER_TEST)\n",
"\n",
" # Benchmark OS lz4\n",
" os_times = benchmark_os_lz4(filename, RUNS_PER_TEST)\n",
" results[\"os_lz4\"][size] = os_times\n",
"\n",
" # Clean up\n",
" # os.remove(filename)\n",
"\n",
" # Print results\n",
" print(\"Results Compression (times in seconds):\")\n",
" print(\"-\" * 80)\n",
" print(f\"{'Size':>8} | {'Python LZ4 (avg ± std)':>25} | {'OS LZ4 (avg ± std)':>25}\")\n",
" print(\"-\" * 80)\n",
"\n",
" for size in SIZES:\n",
" py_avg = np.mean(results[\"python_lz4\"][size])\n",
" py_std = np.std(results[\"python_lz4\"][size])\n",
" os_avg = np.mean(results[\"os_lz4\"][size])\n",
" os_std = np.std(results[\"os_lz4\"][size])\n",
"\n",
" print(\n",
" f\"{format_size(size):>8} | {py_avg:.6f} ± {py_std:.6f} | {os_avg:.6f} ± {os_std:.6f}\"\n",
" )\n",
"\n",
" print(\"\\nNote: Lower times are better. Results show mean ± standard deviation.\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e875019d-5816-43d6-9ecb-c9501682ac02",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting LZ4 compression benchmark...\n",
"Testing sizes: ['1.0KB', '10.0KB', '100.0KB', '1.0MB', '10.0MB', '100.0MB']\n",
"Runs per test: 100\n",
"\n",
"Results Compression (times in seconds):\n",
"--------------------------------------------------------------------------------\n",
" Size | Python LZ4 (avg ± std) | OS LZ4 (avg ± std)\n",
"--------------------------------------------------------------------------------\n",
" 1.0KB | 0.000222 ± 0.000135 | 0.002831 ± 0.000895\n",
" 10.0KB | 0.000153 ± 0.000044 | 0.002343 ± 0.000167\n",
" 100.0KB | 0.000140 ± 0.000072 | 0.002443 ± 0.000308\n",
" 1.0MB | 0.000696 ± 0.000833 | 0.003184 ± 0.000342\n",
" 10.0MB | 0.006929 ± 0.002354 | 0.008113 ± 0.000971\n",
" 100.0MB | 0.084589 ± 0.057809 | 0.056928 ± 0.062384\n",
"\n",
"Note: Lower times are better. Results show mean ± standard deviation.\n"
]
}
],
"source": [
"main_compress()"
]
},
{
"cell_type": "markdown",
"id": "081453b8-03f5-4fb2-ba1d-70f96f513100",
"metadata": {},
"source": [
"## Decompress benchmark"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d4d5f832-4a1a-4384-84ef-386844940e4f",
"metadata": {},
"outputs": [],
"source": [
"# generate compressed files to test decompression\n",
"generate_compressed_files(OUTPUT_DIR, COMPRESS_DIR)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "8b814769-9f99-4df7-9b59-46e96c1c04df",
"metadata": {},
"outputs": [],
"source": [
"\n",
"def benchmark_python_lz4_decompress(filename: Path, runs: int):\n",
" \"\"\"Benchmark Python lz4 compression\"\"\"\n",
" times = []\n",
" uc_filename: Path = DECOMPRESS_DIR / (filename.with_suffix(\".bin\").name)\n",
" for _ in range(runs):\n",
" start_time = time.perf_counter()\n",
" # Read data for Python test\n",
" with lz4.frame.open(\n",
" filename, \"rb\", compression_level=lz4.frame.COMPRESSIONLEVEL_MIN\n",
" ) as cf:\n",
" data = cf.read()\n",
" with open(uc_filename, \"wb\") as f:\n",
" f.write(data)\n",
" end_time = time.perf_counter()\n",
" times.append(end_time - start_time)\n",
" # Clean up compressed file\n",
" if uc_filename.exists():\n",
" os.remove(uc_filename)\n",
" return times\n",
"\n",
"\n",
"def benchmark_os_lz4_decompress(input_file: Path, runs: int):\n",
" \"\"\"Benchmark OS lz4 command-line tool\"\"\"\n",
" uc_filename: Path = DECOMPRESS_DIR / (input_file.with_suffix(\".bin\").name)\n",
" times = []\n",
" for _ in range(runs):\n",
" start_time = time.perf_counter()\n",
" subprocess.run(\n",
" [\"lz4\", \"-f\", \"-1\", \"-d\", input_file, uc_filename],\n",
" stdout=subprocess.DEVNULL,\n",
" stderr=subprocess.DEVNULL,\n",
" )\n",
" end_time = time.perf_counter()\n",
" times.append(end_time - start_time)\n",
"\n",
" # Clean up compressed file\n",
" if uc_filename.exists():\n",
" os.remove(uc_filename)\n",
"\n",
" return times\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "8c59b895-7076-4535-a159-69b5cdb588b6",
"metadata": {},
"outputs": [],
"source": [
"def main_decompress():\n",
" print(\"Starting LZ4 compression benchmark...\")\n",
" print(f\"Testing sizes: {[format_size(s) for s in SIZES]}\")\n",
" print(f\"Runs per test: {RUNS_PER_TEST}\\n\")\n",
"\n",
" results = {\"python_lz4\": {}, \"os_lz4\": {}}\n",
"\n",
" # Run benchmarks\n",
" for size,filename in zip(SIZES,COMPRESS_DIR.iterdir()):\n",
"\n",
" # Benchmark Python lz4\n",
" results[\"python_lz4\"][size] = benchmark_python_lz4_decompress(\n",
" filename, RUNS_PER_TEST\n",
" )\n",
"\n",
" # Benchmark OS lz4\n",
" results[\"os_lz4\"][size] = benchmark_os_lz4_decompress(filename, RUNS_PER_TEST)\n",
"\n",
" # Print results\n",
" print(\"Results Decompression (times in seconds):\")\n",
" print(\"-\" * 80)\n",
" print(f\"{'Size':>8} | {'Python LZ4 (avg ± std)':>25} | {'OS LZ4 (avg ± std)':>25}\")\n",
" print(\"-\" * 80)\n",
"\n",
" for size in SIZES:\n",
" py_avg = np.mean(results[\"python_lz4\"][size])\n",
" py_std = np.std(results[\"python_lz4\"][size])\n",
" os_avg = np.mean(results[\"os_lz4\"][size])\n",
" os_std = np.std(results[\"os_lz4\"][size])\n",
"\n",
" print(\n",
" f\"{format_size(size):>8} | {py_avg:.6f} ± {py_std:.6f} | {os_avg:.6f} ± {os_std:.6f}\"\n",
" )\n",
"\n",
" print(\"\\nNote: Lower times are better. Results show mean ± standard deviation.\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "8ded2ab1-c81d-4166-8cfc-0849ca5a8086",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting LZ4 compression benchmark...\n",
"Testing sizes: ['1.0KB', '10.0KB', '100.0KB', '1.0MB', '10.0MB', '100.0MB']\n",
"Runs per test: 100\n",
"\n",
"Results Decompression (times in seconds):\n",
"--------------------------------------------------------------------------------\n",
" Size | Python LZ4 (avg ± std) | OS LZ4 (avg ± std)\n",
"--------------------------------------------------------------------------------\n",
" 1.0KB | 0.103372 ± 0.002786 | 0.051380 ± 0.045575\n",
" 10.0KB | 0.000817 ± 0.000073 | 0.005209 ± 0.016507\n",
" 100.0KB | 0.000167 ± 0.000027 | 0.002811 ± 0.000130\n",
" 1.0MB | 0.000096 ± 0.000019 | 0.002655 ± 0.000044\n",
" 10.0MB | 0.010892 ± 0.000622 | 0.008884 ± 0.003092\n",
" 100.0MB | 0.000113 ± 0.000028 | 0.002750 ± 0.000164\n",
"\n",
"Note: Lower times are better. Results show mean ± standard deviation.\n"
]
}
],
"source": [
"main_decompress()"
]
},
{
"cell_type": "markdown",
"id": "9fcd8022-9cf3-4dbd-a96f-1d9d36ce9205",
"metadata": {},
"source": [
"## Xopen"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "63cd7b25-db29-4310-8d84-fa4d3876adb3",
"metadata": {},
"outputs": [],
"source": [
"import xopen"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "aaccaf23-4dd3-4fce-8a15-782e61785914",
"metadata": {},
"outputs": [],
"source": [
"\n",
"def benchmark_python_lz4_xopen(filename: Path, runs: int):\n",
" \"\"\"Benchmark Python lz4 compression\"\"\"\n",
" times = []\n",
" lz4_filename: Path = filename.with_suffix(\".lz4\")\n",
" for _ in range(runs):\n",
" start_time = time.perf_counter()\n",
" # Read data for Python test\n",
" with open(filename, \"rb\") as f:\n",
" data = f.read()\n",
" with xopen.xopen(\n",
" lz4_filename, \"wb\", compresslevel=0,threads=0\n",
" ) as cf:\n",
" cf.write(data)\n",
" end_time = time.perf_counter()\n",
" times.append(end_time - start_time)\n",
" # Clean up compressed file\n",
" if lz4_filename.exists():\n",
" os.remove(lz4_filename)\n",
" return times\n",
"\n",
"\n",
"def benchmark_python_lz4_xopen_nb(filename: Path, runs: int):\n",
" \"\"\"Benchmark Python lz4 compression\"\"\"\n",
" times = []\n",
" lz4_filename: Path = filename.with_suffix(\".lz4\")\n",
" for _ in range(runs):\n",
" start_time = time.perf_counter()\n",
" # Read data for Python test\n",
" with open(filename, \"rb\") as f:\n",
" data = f.read()\n",
" with xopen.xopen_nb(\n",
" lz4_filename, \"wb\", compresslevel=0,threads=0\n",
" ) as cf:\n",
" cf.write(data)\n",
" end_time = time.perf_counter()\n",
" times.append(end_time - start_time)\n",
" # Clean up compressed file\n",
" if lz4_filename.exists():\n",
" os.remove(lz4_filename)\n",
" return times\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "79856da8-a75f-489e-a99d-7d098ca00b4d",
"metadata": {},
"outputs": [],
"source": [
"def main_xopen():\n",
" print(\"Starting LZ4 compression benchmark...\")\n",
" print(f\"Testing sizes: {[format_size(s) for s in SIZES]}\")\n",
" print(f\"Runs per test: {RUNS_PER_TEST}\\n\")\n",
"\n",
" results = {\"xopen\": {}, \"xopen_nb\": {}}\n",
"\n",
" # Run benchmarks\n",
" for size in SIZES:\n",
" # Generate test file\n",
" filename = generate_test_file(size)\n",
"\n",
" # Benchmark Python lz4\n",
" results[\"xopen\"][size] = benchmark_python_lz4_xopen(filename, RUNS_PER_TEST)\n",
"\n",
" # Benchmark OS lz4\n",
" os_times = benchmark_python_lz4_xopen_nb(filename, RUNS_PER_TEST)\n",
" results[\"xopen_nb\"][size] = os_times\n",
"\n",
" # Clean up\n",
" # os.remove(filename)\n",
"\n",
" # Print results\n",
" print(\"Results Compression (times in seconds):\")\n",
" print(\"-\" * 80)\n",
" print(f\"{'Size':>8} | {'xopen (avg ± std)':>25} | {'xopn no buffer (avg ± std)':>25}\")\n",
" print(\"-\" * 80)\n",
"\n",
" for size in SIZES:\n",
" py_avg = np.mean(results[\"xopen\"][size])\n",
" py_std = np.std(results[\"xopen\"][size])\n",
" os_avg = np.mean(results[\"xopen_nb\"][size])\n",
" os_std = np.std(results[\"xopen_nb\"][size])\n",
"\n",
" print(\n",
" f\"{format_size(size):>8} | {py_avg:.6f} ± {py_std:.6f} | {os_avg:.6f} ± {os_std:.6f}\"\n",
" )\n",
"\n",
" print(\"\\nNote: Lower times are better. Results show mean ± standard deviation.\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "508a936d-aef3-4850-bdf0-8d49c228fb15",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting LZ4 compression benchmark...\n",
"Testing sizes: ['1.0KB', '10.0KB', '100.0KB', '1.0MB', '10.0MB', '100.0MB']\n",
"Runs per test: 100\n",
"\n",
"Results Compression (times in seconds):\n",
"--------------------------------------------------------------------------------\n",
" Size | xopen (avg ± std) | xopn no buffer (avg ± std)\n",
"--------------------------------------------------------------------------------\n",
" 1.0KB | 0.000191 ± 0.000079 | 0.000105 ± 0.000020\n",
" 10.0KB | 0.000164 ± 0.000049 | 0.000168 ± 0.000130\n",
" 100.0KB | 0.000216 ± 0.000092 | 0.000157 ± 0.000015\n",
" 1.0MB | 0.000719 ± 0.000501 | 0.000596 ± 0.000468\n",
" 10.0MB | 0.006572 ± 0.002621 | 0.005854 ± 0.001389\n",
" 100.0MB | 0.074405 ± 0.018944 | 0.074006 ± 0.020324\n",
"\n",
"Note: Lower times are better. Results show mean ± standard deviation.\n"
]
}
],
"source": [
"main_xopen()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c20e1ec-1b23-4eb6-9086-62c8d91f4af5",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment