gnzsnz/lz4_bench.ipynb

## lz4_bench.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a5965252-2f15-46bd-8829-17a998a6e41b",
   "metadata": {},
   "source": [
    "# LZ4 benchmark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e54f54ab-e867-47fb-a1ad-8620de04c123",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import subprocess\n",
    "import time\n",
    "from pathlib import Path\n",
    "\n",
    "import lz4.frame\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "79317802-27a6-4c40-9bdf-79f62c23e631",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Working directories, relative to the notebook's current directory\n",
    "OUTPUT_DIR = Path(\"test_files\")  # uncompressed test inputs\n",
    "COMPRESS_DIR = Path(\"compress_files\")  # .lz4 inputs for the decompression benchmark\n",
    "DECOMPRESS_DIR = Path(\"decompress_files\")  # scratch output of the decompression benchmark\n",
    "\n",
    "# Create them up front; the names above are already Path objects\n",
    "OUTPUT_DIR.mkdir(exist_ok=True)\n",
    "COMPRESS_DIR.mkdir(exist_ok=True)\n",
    "DECOMPRESS_DIR.mkdir(exist_ok=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "9c63b164-0a08-481b-8101-44ce6aa12ef5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration\n",
    "KIB = 1024\n",
    "MIB = 1024 * KIB\n",
    "\n",
    "# Input file sizes in bytes, smallest to largest: 1 KiB up to 100 MiB\n",
    "SIZES = [\n",
    "    1 * KIB,\n",
    "    10 * KIB,\n",
    "    100 * KIB,\n",
    "    1 * MIB,\n",
    "    10 * MIB,\n",
    "    100 * MIB,\n",
    "]\n",
    "# Number of timed runs per size and per tool; lower it for a quicker pass\n",
    "RUNS_PER_TEST = 100\n",
    "\n",
    "\n",
    "def generate_test_file(size: int):\n",
    "    \"\"\"Write ``size`` random bytes to ``OUTPUT_DIR/test_<size>.bin`` and return the path.\n",
    "\n",
    "    The payload comes from ``os.urandom``; an existing file with the same\n",
    "    name is overwritten.\n",
    "    \"\"\"\n",
    "    target = OUTPUT_DIR / f\"test_{size}.bin\"\n",
    "    target.write_bytes(os.urandom(size))\n",
    "    return target\n",
    "\n",
    "\n",
    "def generate_compressed_files(data_dir: Path, output_dir: Path):\n",
    "    \"\"\"Compress every regular file in ``data_dir`` into ``output_dir``.\n",
    "\n",
    "    Each input ``<name>`` is written as ``<output_dir>/<name>.lz4`` through\n",
    "    ``lz4.frame.open`` at ``COMPRESSIONLEVEL_MIN``. Existing outputs are\n",
    "    overwritten.\n",
    "\n",
    "    Args:\n",
    "        data_dir: directory holding the uncompressed inputs.\n",
    "        output_dir: directory that receives the ``.lz4`` files; must exist.\n",
    "    \"\"\"\n",
    "    # sorted() makes the processing order independent of the directory listing order\n",
    "    for file in sorted(data_dir.iterdir()):\n",
    "        # Skip sub-directories and other non-file entries: open() would raise on them\n",
    "        if not file.is_file():\n",
    "            continue\n",
    "        lz4_filename = output_dir / (file.name + \".lz4\")\n",
    "        with open(file, \"rb\") as f:\n",
    "            data = f.read()\n",
    "        with lz4.frame.open(\n",
    "            lz4_filename, \"wb\", compression_level=lz4.frame.COMPRESSIONLEVEL_MIN\n",
    "        ) as cf:\n",
    "            cf.write(data)\n",
    "\n",
    "\n",
    "def benchmark_python_lz4(filename: Path, runs: int):\n",
    "    \"\"\"Time ``runs`` compressions of ``filename`` with the ``lz4.frame`` Python bindings.\n",
    "\n",
    "    Each timed iteration reads the whole input file and writes it through\n",
    "    ``lz4.frame.open`` at ``COMPRESSIONLEVEL_MIN``. The output file\n",
    "    (``filename`` with its suffix replaced by ``.lz4``) is deleted after every\n",
    "    run, outside the timed span.\n",
    "\n",
    "    Returns:\n",
    "        list[float]: elapsed seconds of each run.\n",
    "    \"\"\"\n",
    "    compressed_path: Path = filename.with_suffix(\".lz4\")\n",
    "    durations = []\n",
    "    for _ in range(runs):\n",
    "        started = time.perf_counter()\n",
    "        payload = filename.read_bytes()\n",
    "        with lz4.frame.open(\n",
    "            compressed_path, \"wb\", compression_level=lz4.frame.COMPRESSIONLEVEL_MIN\n",
    "        ) as sink:\n",
    "            sink.write(payload)\n",
    "        durations.append(time.perf_counter() - started)\n",
    "        # Remove the output so every run starts from the same on-disk state\n",
    "        compressed_path.unlink(missing_ok=True)\n",
    "    return durations\n",
    "\n",
    "\n",
    "def benchmark_os_lz4(input_file: Path, runs: int):\n",
    "    \"\"\"Benchmark OS lz4 command-line tool (compression).\n",
    "\n",
    "    Runs ``lz4 -f -1 <input_file> <output_file>`` ``runs`` times and times each\n",
    "    invocation, process start-up included. The output file (``input_file``\n",
    "    with its suffix replaced by ``.lz4``) is removed after every run.\n",
    "\n",
    "    Args:\n",
    "        input_file: file to compress.\n",
    "        runs: number of timed invocations.\n",
    "\n",
    "    Returns:\n",
    "        list[float]: elapsed seconds of each run.\n",
    "\n",
    "    Raises:\n",
    "        subprocess.CalledProcessError: if ``lz4`` exits with a non-zero status.\n",
    "        FileNotFoundError: if the ``lz4`` executable cannot be found.\n",
    "    \"\"\"\n",
    "    output_file = input_file.with_suffix(\".lz4\")\n",
    "    times = []\n",
    "\n",
    "    for _ in range(runs):\n",
    "        start_time = time.perf_counter()\n",
    "        # check=True: stdout/stderr are silenced, so without it a failing lz4\n",
    "        # call would be timed as if it had succeeded\n",
    "        subprocess.run(\n",
    "            [\"lz4\", \"-f\", \"-1\", input_file, output_file],\n",
    "            stdout=subprocess.DEVNULL,\n",
    "            stderr=subprocess.DEVNULL,\n",
    "            check=True,\n",
    "        )\n",
    "        end_time = time.perf_counter()\n",
    "        times.append(end_time - start_time)\n",
    "\n",
    "        # Clean up compressed file\n",
    "        output_file.unlink(missing_ok=True)\n",
    "\n",
    "    return times\n",
    "\n",
    "\n",
    "def format_size(size):\n",
    "    \"\"\"Format a byte count as a short human-readable string, e.g. ``10.0KB``.\n",
    "\n",
    "    The value is divided by 1024 until it drops below 1024 or the largest\n",
    "    unit (TB) is reached, so every input yields a string instead of falling\n",
    "    through and returning ``None`` for sizes of 1 GiB and above.\n",
    "\n",
    "    Args:\n",
    "        size: number of bytes (int or float).\n",
    "\n",
    "    Returns:\n",
    "        str: the size with one decimal place followed by its unit.\n",
    "    \"\"\"\n",
    "    units = [\"B\", \"KB\", \"MB\", \"GB\", \"TB\"]\n",
    "    for unit in units[:-1]:\n",
    "        if size < 1024:\n",
    "            return f\"{size:.1f}{unit}\"\n",
    "        size /= 1024\n",
    "    # Anything this large is reported in the last unit rather than dropped\n",
    "    return f\"{size:.1f}{units[-1]}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e7c35b25-2f2e-442e-bb2b-aaec002ad2d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def main_compress():\n",
    "    \"\"\"Benchmark compression for every entry of SIZES and print a summary table.\n",
    "\n",
    "    For each size a test file is generated, then compressed RUNS_PER_TEST\n",
    "    times with the Python bindings and RUNS_PER_TEST times with the ``lz4``\n",
    "    command-line tool. The table shows mean and standard deviation of the\n",
    "    measured times per size.\n",
    "    \"\"\"\n",
    "    print(\"Starting LZ4 compression benchmark...\")\n",
    "    print(f\"Testing sizes: {[format_size(s) for s in SIZES]}\")\n",
    "    print(f\"Runs per test: {RUNS_PER_TEST}\\n\")\n",
    "\n",
    "    results = {\"python_lz4\": {}, \"os_lz4\": {}}\n",
    "\n",
    "    for size in SIZES:\n",
    "        # The generated input file is left on disk; it is not removed here\n",
    "        filename = generate_test_file(size)\n",
    "        results[\"python_lz4\"][size] = benchmark_python_lz4(filename, RUNS_PER_TEST)\n",
    "        results[\"os_lz4\"][size] = benchmark_os_lz4(filename, RUNS_PER_TEST)\n",
    "\n",
    "    separator = \"-\" * 80\n",
    "    print(\"Results Compression (times in seconds):\")\n",
    "    print(separator)\n",
    "    print(f\"{'Size':>8} | {'Python LZ4 (avg ± std)':>25} | {'OS LZ4 (avg ± std)':>25}\")\n",
    "    print(separator)\n",
    "\n",
    "    for size in SIZES:\n",
    "        python_runs = results[\"python_lz4\"][size]\n",
    "        cli_runs = results[\"os_lz4\"][size]\n",
    "        py_avg, py_std = np.mean(python_runs), np.std(python_runs)\n",
    "        os_avg, os_std = np.mean(cli_runs), np.std(cli_runs)\n",
    "        print(\n",
    "            f\"{format_size(size):>8} | {py_avg:.6f} ± {py_std:.6f} | {os_avg:.6f} ± {os_std:.6f}\"\n",
    "        )\n",
    "\n",
    "    print(\"\\nNote: Lower times are better. Results show mean ± standard deviation.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "e875019d-5816-43d6-9ecb-c9501682ac02",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting LZ4 compression benchmark...\n",
      "Testing sizes: ['1.0KB', '10.0KB', '100.0KB', '1.0MB', '10.0MB', '100.0MB']\n",
      "Runs per test: 100\n",
      "\n",
      "Results Compression (times in seconds):\n",
      "--------------------------------------------------------------------------------\n",
      "    Size |    Python LZ4 (avg ± std) |        OS LZ4 (avg ± std)\n",
      "--------------------------------------------------------------------------------\n",
      "   1.0KB | 0.000222 ± 0.000135 | 0.002831 ± 0.000895\n",
      "  10.0KB | 0.000153 ± 0.000044 | 0.002343 ± 0.000167\n",
      " 100.0KB | 0.000140 ± 0.000072 | 0.002443 ± 0.000308\n",
      "   1.0MB | 0.000696 ± 0.000833 | 0.003184 ± 0.000342\n",
      "  10.0MB | 0.006929 ± 0.002354 | 0.008113 ± 0.000971\n",
      " 100.0MB | 0.084589 ± 0.057809 | 0.056928 ± 0.062384\n",
      "\n",
      "Note: Lower times are better. Results show mean ± standard deviation.\n"
     ]
    }
   ],
   "source": [
    "# Run the compression benchmark; this also (re)writes the test files in OUTPUT_DIR\n",
    "main_compress()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "081453b8-03f5-4fb2-ba1d-70f96f513100",
   "metadata": {},
   "source": [
    "## Decompress benchmark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d4d5f832-4a1a-4384-84ef-386844940e4f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate compressed files to test decompression.\n",
    "# This compresses whatever is in OUTPUT_DIR, so the test files must already\n",
    "# exist there (main_compress() writes them via generate_test_file).\n",
    "generate_compressed_files(OUTPUT_DIR, COMPRESS_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8b814769-9f99-4df7-9b59-46e96c1c04df",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def benchmark_python_lz4_decompress(filename: Path, runs: int):\n",
    "    \"\"\"Benchmark Python lz4 decompression.\n",
    "\n",
    "    Each timed iteration reads ``filename`` through ``lz4.frame.open`` and\n",
    "    writes the decompressed bytes to a scratch file in ``DECOMPRESS_DIR``\n",
    "    (named after the input with its last suffix replaced by ``.bin``). The\n",
    "    scratch file is deleted after every run, outside the timed span.\n",
    "\n",
    "    Args:\n",
    "        filename: path of the ``.lz4`` file to decompress.\n",
    "        runs: number of timed iterations.\n",
    "\n",
    "    Returns:\n",
    "        list[float]: elapsed seconds of each run.\n",
    "    \"\"\"\n",
    "    times = []\n",
    "    uc_filename: Path = DECOMPRESS_DIR / (filename.with_suffix(\".bin\").name)\n",
    "    for _ in range(runs):\n",
    "        start_time = time.perf_counter()\n",
    "        # Read and decompress the whole frame; no compression_level is passed\n",
    "        # because that setting only concerns writing\n",
    "        with lz4.frame.open(filename, \"rb\") as cf:\n",
    "            data = cf.read()\n",
    "        with open(uc_filename, \"wb\") as f:\n",
    "            f.write(data)\n",
    "        end_time = time.perf_counter()\n",
    "        times.append(end_time - start_time)\n",
    "        # Clean up decompressed file\n",
    "        uc_filename.unlink(missing_ok=True)\n",
    "    return times\n",
    "\n",
    "\n",
    "def benchmark_os_lz4_decompress(input_file: Path, runs: int):\n",
    "    \"\"\"Benchmark OS lz4 command-line tool (decompression).\n",
    "\n",
    "    Runs ``lz4 -f -d <input_file> <scratch_file>`` ``runs`` times and times\n",
    "    each invocation, process start-up included. The scratch file lives in\n",
    "    ``DECOMPRESS_DIR`` (named after the input with its last suffix replaced by\n",
    "    ``.bin``) and is removed after every run.\n",
    "\n",
    "    Args:\n",
    "        input_file: path of the ``.lz4`` file to decompress.\n",
    "        runs: number of timed invocations.\n",
    "\n",
    "    Returns:\n",
    "        list[float]: elapsed seconds of each run.\n",
    "\n",
    "    Raises:\n",
    "        subprocess.CalledProcessError: if ``lz4`` exits with a non-zero status.\n",
    "        FileNotFoundError: if the ``lz4`` executable cannot be found.\n",
    "    \"\"\"\n",
    "    uc_filename: Path = DECOMPRESS_DIR / (input_file.with_suffix(\".bin\").name)\n",
    "    times = []\n",
    "    for _ in range(runs):\n",
    "        start_time = time.perf_counter()\n",
    "        # No compression level flag: it has no meaning together with -d.\n",
    "        # check=True: output is silenced, so a failing call must not be timed\n",
    "        # as if it had succeeded.\n",
    "        subprocess.run(\n",
    "            [\"lz4\", \"-f\", \"-d\", input_file, uc_filename],\n",
    "            stdout=subprocess.DEVNULL,\n",
    "            stderr=subprocess.DEVNULL,\n",
    "            check=True,\n",
    "        )\n",
    "        end_time = time.perf_counter()\n",
    "        times.append(end_time - start_time)\n",
    "\n",
    "        # Clean up decompressed file\n",
    "        uc_filename.unlink(missing_ok=True)\n",
    "\n",
    "    return times\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "8c59b895-7076-4535-a159-69b5cdb588b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def main_decompress():\n",
    "    \"\"\"Benchmark decompression for every entry of SIZES and print a summary table.\n",
    "\n",
    "    For each size the matching file ``COMPRESS_DIR/test_<size>.bin.lz4`` is\n",
    "    decompressed RUNS_PER_TEST times with the Python bindings and\n",
    "    RUNS_PER_TEST times with the ``lz4`` command-line tool. The file is looked\n",
    "    up by name rather than by directory order, because ``iterdir()`` yields\n",
    "    entries in arbitrary order and would attach timings to the wrong size.\n",
    "\n",
    "    Raises:\n",
    "        FileNotFoundError: if the compressed file for a size is missing.\n",
    "    \"\"\"\n",
    "    print(\"Starting LZ4 decompression benchmark...\")\n",
    "    print(f\"Testing sizes: {[format_size(s) for s in SIZES]}\")\n",
    "    print(f\"Runs per test: {RUNS_PER_TEST}\\n\")\n",
    "\n",
    "    results = {\"python_lz4\": {}, \"os_lz4\": {}}\n",
    "\n",
    "    # Run benchmarks\n",
    "    for size in SIZES:\n",
    "        # Same naming as generate_test_file (test_<size>.bin) plus the\n",
    "        # \".lz4\" appended by generate_compressed_files\n",
    "        filename = COMPRESS_DIR / f\"test_{size}.bin.lz4\"\n",
    "        if not filename.is_file():\n",
    "            raise FileNotFoundError(\n",
    "                f\"{filename} not found; run generate_compressed_files(OUTPUT_DIR, COMPRESS_DIR) first\"\n",
    "            )\n",
    "\n",
    "        # Benchmark Python lz4\n",
    "        results[\"python_lz4\"][size] = benchmark_python_lz4_decompress(\n",
    "            filename, RUNS_PER_TEST\n",
    "        )\n",
    "\n",
    "        # Benchmark OS lz4\n",
    "        results[\"os_lz4\"][size] = benchmark_os_lz4_decompress(filename, RUNS_PER_TEST)\n",
    "\n",
    "    # Print results\n",
    "    print(\"Results Decompression (times in seconds):\")\n",
    "    print(\"-\" * 80)\n",
    "    print(f\"{'Size':>8} | {'Python LZ4 (avg ± std)':>25} | {'OS LZ4 (avg ± std)':>25}\")\n",
    "    print(\"-\" * 80)\n",
    "\n",
    "    for size in SIZES:\n",
    "        py_avg = np.mean(results[\"python_lz4\"][size])\n",
    "        py_std = np.std(results[\"python_lz4\"][size])\n",
    "        os_avg = np.mean(results[\"os_lz4\"][size])\n",
    "        os_std = np.std(results[\"os_lz4\"][size])\n",
    "\n",
    "        print(\n",
    "            f\"{format_size(size):>8} | {py_avg:.6f} ± {py_std:.6f} | {os_avg:.6f} ± {os_std:.6f}\"\n",
    "        )\n",
    "\n",
    "    print(\"\\nNote: Lower times are better. Results show mean ± standard deviation.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "8ded2ab1-c81d-4166-8cfc-0849ca5a8086",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting LZ4 compression benchmark...\n",
      "Testing sizes: ['1.0KB', '10.0KB', '100.0KB', '1.0MB', '10.0MB', '100.0MB']\n",
      "Runs per test: 100\n",
      "\n",
      "Results Decompression (times in seconds):\n",
      "--------------------------------------------------------------------------------\n",
      "    Size |    Python LZ4 (avg ± std) |        OS LZ4 (avg ± std)\n",
      "--------------------------------------------------------------------------------\n",
      "   1.0KB | 0.103372 ± 0.002786 | 0.051380 ± 0.045575\n",
      "  10.0KB | 0.000817 ± 0.000073 | 0.005209 ± 0.016507\n",
      " 100.0KB | 0.000167 ± 0.000027 | 0.002811 ± 0.000130\n",
      "   1.0MB | 0.000096 ± 0.000019 | 0.002655 ± 0.000044\n",
      "  10.0MB | 0.010892 ± 0.000622 | 0.008884 ± 0.003092\n",
      " 100.0MB | 0.000113 ± 0.000028 | 0.002750 ± 0.000164\n",
      "\n",
      "Note: Lower times are better. Results show mean ± standard deviation.\n"
     ]
    }
   ],
   "source": [
    "# Run the decompression benchmark on the files in COMPRESS_DIR\n",
    "main_decompress()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9fcd8022-9cf3-4dbd-a96f-1d9d36ce9205",
   "metadata": {},
   "source": [
    "## Xopen"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "63cd7b25-db29-4310-8d84-fa4d3876adb3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import xopen"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "aaccaf23-4dd3-4fce-8a15-782e61785914",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def benchmark_python_lz4_xopen(filename: Path, runs: int):\n",
    "    \"\"\"Time ``runs`` writes of ``filename`` through ``xopen.xopen``.\n",
    "\n",
    "    Each timed iteration reads the whole input file and writes it to\n",
    "    ``filename`` with its suffix replaced by ``.lz4``, opened via\n",
    "    ``xopen.xopen(..., compresslevel=0, threads=0)``. The output file is\n",
    "    deleted after every run, outside the timed span.\n",
    "\n",
    "    Returns:\n",
    "        list[float]: elapsed seconds of each run.\n",
    "    \"\"\"\n",
    "    # NOTE(review): presumably xopen picks the lz4 codec from the \".lz4\" suffix - confirm\n",
    "    compressed_path: Path = filename.with_suffix(\".lz4\")\n",
    "    durations = []\n",
    "    for _ in range(runs):\n",
    "        started = time.perf_counter()\n",
    "        payload = filename.read_bytes()\n",
    "        with xopen.xopen(\n",
    "            compressed_path, \"wb\", compresslevel=0,threads=0\n",
    "        ) as sink:\n",
    "            sink.write(payload)\n",
    "        durations.append(time.perf_counter() - started)\n",
    "        # Remove the output so every run starts from the same on-disk state\n",
    "        compressed_path.unlink(missing_ok=True)\n",
    "    return durations\n",
    "\n",
    "\n",
    "def benchmark_python_lz4_xopen_nb(filename: Path, runs: int):\n",
    "    \"\"\"Time ``runs`` writes of ``filename`` through ``xopen.xopen_nb``.\n",
    "\n",
    "    Identical procedure to the ``xopen.xopen`` benchmark, but the output is\n",
    "    opened with ``xopen.xopen_nb(..., compresslevel=0, threads=0)``. The\n",
    "    output file (``filename`` with its suffix replaced by ``.lz4``) is deleted\n",
    "    after every run, outside the timed span.\n",
    "\n",
    "    Returns:\n",
    "        list[float]: elapsed seconds of each run.\n",
    "    \"\"\"\n",
    "    # NOTE(review): xopen_nb looks like a \"no buffer\" variant of xopen - confirm against the installed package\n",
    "    compressed_path: Path = filename.with_suffix(\".lz4\")\n",
    "    durations = []\n",
    "    for _ in range(runs):\n",
    "        started = time.perf_counter()\n",
    "        payload = filename.read_bytes()\n",
    "        with xopen.xopen_nb(\n",
    "            compressed_path, \"wb\", compresslevel=0,threads=0\n",
    "        ) as sink:\n",
    "            sink.write(payload)\n",
    "        durations.append(time.perf_counter() - started)\n",
    "        # Remove the output so every run starts from the same on-disk state\n",
    "        compressed_path.unlink(missing_ok=True)\n",
    "    return durations\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "79856da8-a75f-489e-a99d-7d098ca00b4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def main_xopen():\n",
    "    \"\"\"Compare ``xopen.xopen`` against ``xopen.xopen_nb`` for every entry of SIZES.\n",
    "\n",
    "    For each size a test file is generated and written RUNS_PER_TEST times\n",
    "    through each of the two xopen entry points. The table shows mean and\n",
    "    standard deviation of the measured times per size.\n",
    "    \"\"\"\n",
    "    print(\"Starting LZ4 compression benchmark...\")\n",
    "    print(f\"Testing sizes: {[format_size(s) for s in SIZES]}\")\n",
    "    print(f\"Runs per test: {RUNS_PER_TEST}\\n\")\n",
    "\n",
    "    results = {\"xopen\": {}, \"xopen_nb\": {}}\n",
    "\n",
    "    # Run benchmarks\n",
    "    for size in SIZES:\n",
    "        # Generate test file (left on disk afterwards)\n",
    "        filename = generate_test_file(size)\n",
    "\n",
    "        # Benchmark xopen.xopen\n",
    "        results[\"xopen\"][size] = benchmark_python_lz4_xopen(filename, RUNS_PER_TEST)\n",
    "\n",
    "        # Benchmark xopen.xopen_nb\n",
    "        results[\"xopen_nb\"][size] = benchmark_python_lz4_xopen_nb(\n",
    "            filename, RUNS_PER_TEST\n",
    "        )\n",
    "\n",
    "    # Print results\n",
    "    print(\"Results Compression (times in seconds):\")\n",
    "    print(\"-\" * 80)\n",
    "    print(f\"{'Size':>8} | {'xopen (avg ± std)':>25} | {'xopen no buffer (avg ± std)':>25}\")\n",
    "    print(\"-\" * 80)\n",
    "\n",
    "    for size in SIZES:\n",
    "        xopen_avg = np.mean(results[\"xopen\"][size])\n",
    "        xopen_std = np.std(results[\"xopen\"][size])\n",
    "        nb_avg = np.mean(results[\"xopen_nb\"][size])\n",
    "        nb_std = np.std(results[\"xopen_nb\"][size])\n",
    "\n",
    "        print(\n",
    "            f\"{format_size(size):>8} | {xopen_avg:.6f} ± {xopen_std:.6f} | {nb_avg:.6f} ± {nb_std:.6f}\"\n",
    "        )\n",
    "\n",
    "    print(\"\\nNote: Lower times are better. Results show mean ± standard deviation.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "508a936d-aef3-4850-bdf0-8d49c228fb15",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting LZ4 compression benchmark...\n",
      "Testing sizes: ['1.0KB', '10.0KB', '100.0KB', '1.0MB', '10.0MB', '100.0MB']\n",
      "Runs per test: 100\n",
      "\n",
      "Results Compression (times in seconds):\n",
      "--------------------------------------------------------------------------------\n",
      "    Size |         xopen (avg ± std) | xopn no buffer (avg ± std)\n",
      "--------------------------------------------------------------------------------\n",
      "   1.0KB | 0.000191 ± 0.000079 | 0.000105 ± 0.000020\n",
      "  10.0KB | 0.000164 ± 0.000049 | 0.000168 ± 0.000130\n",
      " 100.0KB | 0.000216 ± 0.000092 | 0.000157 ± 0.000015\n",
      "   1.0MB | 0.000719 ± 0.000501 | 0.000596 ± 0.000468\n",
      "  10.0MB | 0.006572 ± 0.002621 | 0.005854 ± 0.001389\n",
      " 100.0MB | 0.074405 ± 0.018944 | 0.074006 ± 0.020324\n",
      "\n",
      "Note: Lower times are better. Results show mean ± standard deviation.\n"
     ]
    }
   ],
   "source": [
    "# Run the xopen vs xopen_nb comparison; this also (re)writes the test files in OUTPUT_DIR\n",
    "main_xopen()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c20e1ec-1b23-4eb6-9086-62c8d91f4af5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "a5965252-2f15-46bd-8829-17a998a6e41b",
	"metadata": {},
	"source": [
	"# LZ4 benchmark"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "e54f54ab-e867-47fb-a1ad-8620de04c123",
	"metadata": {},
	"outputs": [],
	"source": [
	"import os\n",
	"import subprocess\n",
	"import time\n",
	"from pathlib import Path\n",
	"\n",
	"import lz4.frame\n",
	"import numpy as np"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "79317802-27a6-4c40-9bdf-79f62c23e631",
	"metadata": {},
	"outputs": [],
	"source": [
	"# directories\n",
	"OUTPUT_DIR = Path(\"test_files\")\n",
	"COMPRESS_DIR = Path(\"compress_files\")\n",
	"DECOMPRESS_DIR = Path(\"decompress_files\")\n",
	"\n",
	"# create\n",
	"Path(OUTPUT_DIR).mkdir(exist_ok=True)\n",
	"Path(COMPRESS_DIR).mkdir(exist_ok=True)\n",
	"Path(DECOMPRESS_DIR).mkdir(exist_ok=True)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "9c63b164-0a08-481b-8101-44ce6aa12ef5",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Configuration\n",
	"SIZES = [\n",
	" 1024,\n",
	" 1024 * 10,\n",
	" 1024 * 100,\n",
	" 1024 * 1024,\n",
	" 1024 * 1024 * 10,\n",
	" 1024 * 1024 * 100,\n",
	"] # 1KB to 10MB\n",
	"RUNS_PER_TEST = 100 # Number of runs for each test\n",
	"\n",
	"\n",
	"def generate_test_file(size: int):\n",
	" \"\"\"Generate a test file with random data\"\"\"\n",
	" test_file = OUTPUT_DIR / f\"test_{size}.bin\"\n",
	"\n",
	" with open(test_file, \"wb\") as f:\n",
	" f.write(os.urandom(size))\n",
	" return test_file\n",
	"\n",
	"\n",
	"def generate_compressed_files(data_dir: Path, output_dir: Path):\n",
	" for file in data_dir.iterdir():\n",
	" # compress file\n",
	" lz4_filename = output_dir / (file.name + \".lz4\")\n",
	" # Read data for Python test\n",
	" with open(file, \"rb\") as f:\n",
	" data = f.read()\n",
	" with lz4.frame.open(\n",
	" lz4_filename, \"wb\", compression_level=lz4.frame.COMPRESSIONLEVEL_MIN\n",
	" ) as cf:\n",
	" cf.write(data)\n",
	"\n",
	"\n",
	"def benchmark_python_lz4(filename: Path, runs: int):\n",
	" \"\"\"Benchmark Python lz4 compression\"\"\"\n",
	" times = []\n",
	" lz4_filename: Path = filename.with_suffix(\".lz4\")\n",
	" for _ in range(runs):\n",
	" start_time = time.perf_counter()\n",
	" # Read data for Python test\n",
	" with open(filename, \"rb\") as f:\n",
	" data = f.read()\n",
	" with lz4.frame.open(\n",
	" lz4_filename, \"wb\", compression_level=lz4.frame.COMPRESSIONLEVEL_MIN\n",
	" ) as cf:\n",
	" cf.write(data)\n",
	" end_time = time.perf_counter()\n",
	" times.append(end_time - start_time)\n",
	" # Clean up compressed file\n",
	" if lz4_filename.exists():\n",
	" os.remove(lz4_filename)\n",
	" return times\n",
	"\n",
	"\n",
	"def benchmark_os_lz4(input_file: Path, runs: int):\n",
	" \"\"\"Benchmark OS lz4 command-line tool\"\"\"\n",
	" output_file = input_file.with_suffix(\".lz4\")\n",
	" times = []\n",
	"\n",
	" for _ in range(runs):\n",
	" start_time = time.perf_counter()\n",
	" subprocess.run(\n",
	" [\"lz4\", \"-f\", \"-1\", input_file, output_file],\n",
	" stdout=subprocess.DEVNULL,\n",
	" stderr=subprocess.DEVNULL,\n",
	" )\n",
	" end_time = time.perf_counter()\n",
	" times.append(end_time - start_time)\n",
	"\n",
	" # Clean up compressed file\n",
	" if output_file.exists():\n",
	" os.remove(output_file)\n",
	"\n",
	" return times\n",
	"\n",
	"\n",
	"def format_size(size):\n",
	" \"\"\"Format size in human-readable format\"\"\"\n",
	" for unit in [\"B\", \"KB\", \"MB\"]:\n",
	" if size < 1024:\n",
	" return f\"{size:.1f}{unit}\"\n",
	" size /= 1024"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "e7c35b25-2f2e-442e-bb2b-aaec002ad2d4",
	"metadata": {},
	"outputs": [],
	"source": [
	"def main_compress():\n",
	" print(\"Starting LZ4 compression benchmark...\")\n",
	" print(f\"Testing sizes: {[format_size(s) for s in SIZES]}\")\n",
	" print(f\"Runs per test: {RUNS_PER_TEST}\\n\")\n",
	"\n",
	" results = {\"python_lz4\": {}, \"os_lz4\": {}}\n",
	"\n",
	" # Run benchmarks\n",
	" for size in SIZES:\n",
	" # Generate test file\n",
	" filename = generate_test_file(size)\n",
	"\n",
	" # Benchmark Python lz4\n",
	" results[\"python_lz4\"][size] = benchmark_python_lz4(filename, RUNS_PER_TEST)\n",
	"\n",
	" # Benchmark OS lz4\n",
	" os_times = benchmark_os_lz4(filename, RUNS_PER_TEST)\n",
	" results[\"os_lz4\"][size] = os_times\n",
	"\n",
	" # Clean up\n",
	" # os.remove(filename)\n",
	"\n",
	" # Print results\n",
	" print(\"Results Compression (times in seconds):\")\n",
	" print(\"-\" * 80)\n",
	" print(f\"{'Size':>8} \| {'Python LZ4 (avg ± std)':>25} \| {'OS LZ4 (avg ± std)':>25}\")\n",
	" print(\"-\" * 80)\n",
	"\n",
	" for size in SIZES:\n",
	" py_avg = np.mean(results[\"python_lz4\"][size])\n",
	" py_std = np.std(results[\"python_lz4\"][size])\n",
	" os_avg = np.mean(results[\"os_lz4\"][size])\n",
	" os_std = np.std(results[\"os_lz4\"][size])\n",
	"\n",
	" print(\n",
	" f\"{format_size(size):>8} \| {py_avg:.6f} ± {py_std:.6f} \| {os_avg:.6f} ± {os_std:.6f}\"\n",
	" )\n",
	"\n",
	" print(\"\\nNote: Lower times are better. Results show mean ± standard deviation.\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "e875019d-5816-43d6-9ecb-c9501682ac02",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Starting LZ4 compression benchmark...\n",
	"Testing sizes: ['1.0KB', '10.0KB', '100.0KB', '1.0MB', '10.0MB', '100.0MB']\n",
	"Runs per test: 100\n",
	"\n",
	"Results Compression (times in seconds):\n",
	"--------------------------------------------------------------------------------\n",
	" Size \| Python LZ4 (avg ± std) \| OS LZ4 (avg ± std)\n",
	"--------------------------------------------------------------------------------\n",
	" 1.0KB \| 0.000222 ± 0.000135 \| 0.002831 ± 0.000895\n",
	" 10.0KB \| 0.000153 ± 0.000044 \| 0.002343 ± 0.000167\n",
	" 100.0KB \| 0.000140 ± 0.000072 \| 0.002443 ± 0.000308\n",
	" 1.0MB \| 0.000696 ± 0.000833 \| 0.003184 ± 0.000342\n",
	" 10.0MB \| 0.006929 ± 0.002354 \| 0.008113 ± 0.000971\n",
	" 100.0MB \| 0.084589 ± 0.057809 \| 0.056928 ± 0.062384\n",
	"\n",
	"Note: Lower times are better. Results show mean ± standard deviation.\n"
	]
	}
	],
	"source": [
	"main_compress()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "081453b8-03f5-4fb2-ba1d-70f96f513100",
	"metadata": {},
	"source": [
	"## Decompress benchmark"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "d4d5f832-4a1a-4384-84ef-386844940e4f",
	"metadata": {},
	"outputs": [],
	"source": [
	"# generate compressed files to test decompression\n",
	"generate_compressed_files(OUTPUT_DIR, COMPRESS_DIR)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "8b814769-9f99-4df7-9b59-46e96c1c04df",
	"metadata": {},
	"outputs": [],
	"source": [
	"\n",
	"def benchmark_python_lz4_decompress(filename: Path, runs: int):\n",
	" \"\"\"Benchmark Python lz4 compression\"\"\"\n",
	" times = []\n",
	" uc_filename: Path = DECOMPRESS_DIR / (filename.with_suffix(\".bin\").name)\n",
	" for _ in range(runs):\n",
	" start_time = time.perf_counter()\n",
	" # Read data for Python test\n",
	" with lz4.frame.open(\n",
	" filename, \"rb\", compression_level=lz4.frame.COMPRESSIONLEVEL_MIN\n",
	" ) as cf:\n",
	" data = cf.read()\n",
	" with open(uc_filename, \"wb\") as f:\n",
	" f.write(data)\n",
	" end_time = time.perf_counter()\n",
	" times.append(end_time - start_time)\n",
	" # Clean up compressed file\n",
	" if uc_filename.exists():\n",
	" os.remove(uc_filename)\n",
	" return times\n",
	"\n",
	"\n",
	"def benchmark_os_lz4_decompress(input_file: Path, runs: int):\n",
	" \"\"\"Benchmark OS lz4 command-line tool\"\"\"\n",
	" uc_filename: Path = DECOMPRESS_DIR / (input_file.with_suffix(\".bin\").name)\n",
	" times = []\n",
	" for _ in range(runs):\n",
	" start_time = time.perf_counter()\n",
	" subprocess.run(\n",
	" [\"lz4\", \"-f\", \"-1\", \"-d\", input_file, uc_filename],\n",
	" stdout=subprocess.DEVNULL,\n",
	" stderr=subprocess.DEVNULL,\n",
	" )\n",
	" end_time = time.perf_counter()\n",
	" times.append(end_time - start_time)\n",
	"\n",
	" # Clean up compressed file\n",
	" if uc_filename.exists():\n",
	" os.remove(uc_filename)\n",
	"\n",
	" return times\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"id": "8c59b895-7076-4535-a159-69b5cdb588b6",
	"metadata": {},
	"outputs": [],
	"source": [
	"def main_decompress():\n",
	" print(\"Starting LZ4 compression benchmark...\")\n",
	" print(f\"Testing sizes: {[format_size(s) for s in SIZES]}\")\n",
	" print(f\"Runs per test: {RUNS_PER_TEST}\\n\")\n",
	"\n",
	" results = {\"python_lz4\": {}, \"os_lz4\": {}}\n",
	"\n",
	" # Run benchmarks\n",
	" for size,filename in zip(SIZES,COMPRESS_DIR.iterdir()):\n",
	"\n",
	" # Benchmark Python lz4\n",
	" results[\"python_lz4\"][size] = benchmark_python_lz4_decompress(\n",
	" filename, RUNS_PER_TEST\n",
	" )\n",
	"\n",
	" # Benchmark OS lz4\n",
	" results[\"os_lz4\"][size] = benchmark_os_lz4_decompress(filename, RUNS_PER_TEST)\n",
	"\n",
	" # Print results\n",
	" print(\"Results Decompression (times in seconds):\")\n",
	" print(\"-\" * 80)\n",
	" print(f\"{'Size':>8} \| {'Python LZ4 (avg ± std)':>25} \| {'OS LZ4 (avg ± std)':>25}\")\n",
	" print(\"-\" * 80)\n",
	"\n",
	" for size in SIZES:\n",
	" py_avg = np.mean(results[\"python_lz4\"][size])\n",
	" py_std = np.std(results[\"python_lz4\"][size])\n",
	" os_avg = np.mean(results[\"os_lz4\"][size])\n",
	" os_std = np.std(results[\"os_lz4\"][size])\n",
	"\n",
	" print(\n",
	" f\"{format_size(size):>8} \| {py_avg:.6f} ± {py_std:.6f} \| {os_avg:.6f} ± {os_std:.6f}\"\n",
	" )\n",
	"\n",
	" print(\"\\nNote: Lower times are better. Results show mean ± standard deviation.\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"id": "8ded2ab1-c81d-4166-8cfc-0849ca5a8086",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Starting LZ4 compression benchmark...\n",
	"Testing sizes: ['1.0KB', '10.0KB', '100.0KB', '1.0MB', '10.0MB', '100.0MB']\n",
	"Runs per test: 100\n",
	"\n",
	"Results Decompression (times in seconds):\n",
	"--------------------------------------------------------------------------------\n",
	" Size \| Python LZ4 (avg ± std) \| OS LZ4 (avg ± std)\n",
	"--------------------------------------------------------------------------------\n",
	" 1.0KB \| 0.103372 ± 0.002786 \| 0.051380 ± 0.045575\n",
	" 10.0KB \| 0.000817 ± 0.000073 \| 0.005209 ± 0.016507\n",
	" 100.0KB \| 0.000167 ± 0.000027 \| 0.002811 ± 0.000130\n",
	" 1.0MB \| 0.000096 ± 0.000019 \| 0.002655 ± 0.000044\n",
	" 10.0MB \| 0.010892 ± 0.000622 \| 0.008884 ± 0.003092\n",
	" 100.0MB \| 0.000113 ± 0.000028 \| 0.002750 ± 0.000164\n",
	"\n",
	"Note: Lower times are better. Results show mean ± standard deviation.\n"
	]
	}
	],
	"source": [
	"main_decompress()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "9fcd8022-9cf3-4dbd-a96f-1d9d36ce9205",
	"metadata": {},
	"source": [
	"## Xopen"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"id": "63cd7b25-db29-4310-8d84-fa4d3876adb3",
	"metadata": {},
	"outputs": [],
	"source": [
	"import xopen"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"id": "aaccaf23-4dd3-4fce-8a15-782e61785914",
	"metadata": {},
	"outputs": [],
	"source": [
	"\n",
	"def benchmark_python_lz4_xopen(filename: Path, runs: int):\n",
	" \"\"\"Benchmark Python lz4 compression\"\"\"\n",
	" times = []\n",
	" lz4_filename: Path = filename.with_suffix(\".lz4\")\n",
	" for _ in range(runs):\n",
	" start_time = time.perf_counter()\n",
	" # Read data for Python test\n",
	" with open(filename, \"rb\") as f:\n",
	" data = f.read()\n",
	" with xopen.xopen(\n",
	" lz4_filename, \"wb\", compresslevel=0,threads=0\n",
	" ) as cf:\n",
	" cf.write(data)\n",
	" end_time = time.perf_counter()\n",
	" times.append(end_time - start_time)\n",
	" # Clean up compressed file\n",
	" if lz4_filename.exists():\n",
	" os.remove(lz4_filename)\n",
	" return times\n",
	"\n",
	"\n",
	"def benchmark_python_lz4_xopen_nb(filename: Path, runs: int):\n",
	" \"\"\"Benchmark Python lz4 compression\"\"\"\n",
	" times = []\n",
	" lz4_filename: Path = filename.with_suffix(\".lz4\")\n",
	" for _ in range(runs):\n",
	" start_time = time.perf_counter()\n",
	" # Read data for Python test\n",
	" with open(filename, \"rb\") as f:\n",
	" data = f.read()\n",
	" with xopen.xopen_nb(\n",
	" lz4_filename, \"wb\", compresslevel=0,threads=0\n",
	" ) as cf:\n",
	" cf.write(data)\n",
	" end_time = time.perf_counter()\n",
	" times.append(end_time - start_time)\n",
	" # Clean up compressed file\n",
	" if lz4_filename.exists():\n",
	" os.remove(lz4_filename)\n",
	" return times\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"id": "79856da8-a75f-489e-a99d-7d098ca00b4d",
	"metadata": {},
	"outputs": [],
	"source": [
	"def main_xopen():\n",
	" print(\"Starting LZ4 compression benchmark...\")\n",
	" print(f\"Testing sizes: {[format_size(s) for s in SIZES]}\")\n",
	" print(f\"Runs per test: {RUNS_PER_TEST}\\n\")\n",
	"\n",
	" results = {\"xopen\": {}, \"xopen_nb\": {}}\n",
	"\n",
	" # Run benchmarks\n",
	" for size in SIZES:\n",
	" # Generate test file\n",
	" filename = generate_test_file(size)\n",
	"\n",
	" # Benchmark Python lz4\n",
	" results[\"xopen\"][size] = benchmark_python_lz4_xopen(filename, RUNS_PER_TEST)\n",
	"\n",
	" # Benchmark OS lz4\n",
	" os_times = benchmark_python_lz4_xopen_nb(filename, RUNS_PER_TEST)\n",
	" results[\"xopen_nb\"][size] = os_times\n",
	"\n",
	" # Clean up\n",
	" # os.remove(filename)\n",
	"\n",
	" # Print results\n",
	" print(\"Results Compression (times in seconds):\")\n",
	" print(\"-\" * 80)\n",
	" print(f\"{'Size':>8} \| {'xopen (avg ± std)':>25} \| {'xopn no buffer (avg ± std)':>25}\")\n",
	" print(\"-\" * 80)\n",
	"\n",
	" for size in SIZES:\n",
	" py_avg = np.mean(results[\"xopen\"][size])\n",
	" py_std = np.std(results[\"xopen\"][size])\n",
	" os_avg = np.mean(results[\"xopen_nb\"][size])\n",
	" os_std = np.std(results[\"xopen_nb\"][size])\n",
	"\n",
	" print(\n",
	" f\"{format_size(size):>8} \| {py_avg:.6f} ± {py_std:.6f} \| {os_avg:.6f} ± {os_std:.6f}\"\n",
	" )\n",
	"\n",
	" print(\"\\nNote: Lower times are better. Results show mean ± standard deviation.\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"id": "508a936d-aef3-4850-bdf0-8d49c228fb15",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Starting LZ4 compression benchmark...\n",
	"Testing sizes: ['1.0KB', '10.0KB', '100.0KB', '1.0MB', '10.0MB', '100.0MB']\n",
	"Runs per test: 100\n",
	"\n",
	"Results Compression (times in seconds):\n",
	"--------------------------------------------------------------------------------\n",
	" Size \| xopen (avg ± std) \| xopn no buffer (avg ± std)\n",
	"--------------------------------------------------------------------------------\n",
	" 1.0KB \| 0.000191 ± 0.000079 \| 0.000105 ± 0.000020\n",
	" 10.0KB \| 0.000164 ± 0.000049 \| 0.000168 ± 0.000130\n",
	" 100.0KB \| 0.000216 ± 0.000092 \| 0.000157 ± 0.000015\n",
	" 1.0MB \| 0.000719 ± 0.000501 \| 0.000596 ± 0.000468\n",
	" 10.0MB \| 0.006572 ± 0.002621 \| 0.005854 ± 0.001389\n",
	" 100.0MB \| 0.074405 ± 0.018944 \| 0.074006 ± 0.020324\n",
	"\n",
	"Note: Lower times are better. Results show mean ± standard deviation.\n"
	]
	}
	],
	"source": [
	"main_xopen()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "7c20e1ec-1b23-4eb6-9086-62c8d91f4af5",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.12.9"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}
No results found