Created
January 12, 2026 09:24
-
-
Save cabecada/ea2ad57765eda06c74f188cee73865d7 to your computer and use it in GitHub Desktop.
zero block detection using mmap and posix_madvise
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Zero-Copy Architecture (The mmap Advantage) | |
| In a standard program, reading a file with read() costs a system call and an extra memory copy for every chunk: | |
| The kernel reads the data from the disk into its page cache (kernel space). | |
| The kernel then copies that data into your program's user-space buffer. | |
| Only then does your program get to look at the data. | |
| With mmap() (Memory Mapping), the file is mapped directly into your process's address space. | |
| The program treats the file like a giant array in RAM. | |
| The OS "pages in" the data directly where the CPU can see it, skipping the extra copy step. | |
| For 24TB, skipping that copy saves a massive amount of CPU cycles. | |
| In addition: | |
| posix_madvise(..., POSIX_MADV_SEQUENTIAL) tells the kernel: "I'm going to read this mapping from start to finish." | |
| The kernel will then pre-fetch data from the RAID into your 384GB RAM ahead of the program, keeping your 1GB/s pipe full. | |
| now the code: | |
| postgres@ubuntu:/tmp$ cat ~/find_zeros.c | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <fcntl.h> | |
| #include <unistd.h> | |
| #include <sys/mman.h> | |
| #include <sys/stat.h> | |
| #include <string.h> | |
/* Size in bytes of one PostgreSQL page as laid out in a relation file. */
#define PG_BLOCK_SIZE 8192

/*
 * Read-only reference page made entirely of zero bytes. Each block of the
 * scanned file is compared against it.
 */
static const unsigned char ZERO_BLOCK[PG_BLOCK_SIZE] = {0};

/*
 * Scan `filename` in PG_BLOCK_SIZE steps and print one line to stdout for
 * every block whose bytes are all zero:
 *
 *     ZERO_BLOCK_FOUND|<filename>|Offset:<byte offset>|Block:<block number>
 *
 * A trailing partial block (file size not a multiple of PG_BLOCK_SIZE) is
 * checked over the bytes that exist and reported the same way.
 *
 * Empty files are skipped silently. Any failure to open, stat or map the
 * file is reported on stderr so that an unreadable file is never mistaken
 * for a clean one; the function then returns without printing findings.
 *
 * NOTE: the file is accessed through mmap(), so an I/O error or a concurrent
 * truncation while pages are being touched is delivered as a signal
 * (SIGBUS), not as an error return.
 */
void scan_file(const char *filename) {
    int fd = open(filename, O_RDONLY);
    if (fd < 0) {
        perror(filename);
        return;
    }

    struct stat st;
    if (fstat(fd, &st) < 0) {
        perror(filename);
        goto out_close;
    }
    if (st.st_size <= 0) {
        /* Nothing to scan; mmap() would reject a zero length anyway. */
        goto out_close;
    }

    /* Convert once to size_t and make sure nothing was lost (32-bit hosts). */
    size_t file_size = (size_t)st.st_size;
    if ((off_t)file_size != st.st_size) {
        fprintf(stderr, "%s: file is too large to map into memory\n", filename);
        goto out_close;
    }

    /* Map the whole file read-only into the address space. */
    void *base = mmap(NULL, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (base == MAP_FAILED) {
        perror(filename);
        goto out_close;
    }
    const unsigned char *bytes = base;

#ifdef POSIX_MADV_SEQUENTIAL
    /*
     * Advise the kernel that the mapping is read front to back so it can
     * read ahead. This is purely a hint: a failure does not affect
     * correctness, so the result is deliberately ignored.
     */
    (void)posix_madvise(base, file_size, POSIX_MADV_SEQUENTIAL);
#endif

    for (size_t offset = 0; offset < file_size; offset += PG_BLOCK_SIZE) {
        /* Clamp the last comparison so we never read past the mapping. */
        size_t remaining = file_size - offset;
        size_t check_size = remaining < PG_BLOCK_SIZE ? remaining : PG_BLOCK_SIZE;

        if (memcmp(bytes + offset, ZERO_BLOCK, check_size) == 0) {
            printf("ZERO_BLOCK_FOUND|%s|Offset:%zu|Block:%zu\n",
                   filename, offset, offset / PG_BLOCK_SIZE);
        }
    }

    if (munmap(base, file_size) < 0) {
        perror(filename);
    }

out_close:
    close(fd);
}
/*
 * Entry point: scan each file named on the command line with scan_file().
 *
 * Returns 1 (after printing a usage line to stderr) when no path is given,
 * 0 otherwise. Accepting several paths lets the tool be driven by a plain
 * `xargs ./find_zeros` instead of spawning one process per file.
 */
int main(int argc, char *argv[]) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s FILE...\n", argc > 0 ? argv[0] : "find_zeros");
        return 1;
    }

    for (int i = 1; i < argc; i++) {
        scan_file(argv[i]);
    }
    return 0;
}
| ------------------------------------------------- | |
| gcc -O3 find_zeros.c -o find_zeros | |
| -------------------------------------------------- | |
| postgres@ubuntu:/tmp$ pg_ctl -D db1 -l logfile start | |
| waiting for server to start.... done | |
| server started | |
| postgres@ubuntu:/tmp$ psql | |
| psql (17.5) | |
| Type "help" for help. | |
| postgres=# \di | |
| List of relations | |
| Schema | Name | Type | Owner | Table | |
| --------+--------+-------+----------+------- | |
| public | t_pkey | index | postgres | t | |
| (1 row) | |
| postgres=# select pg_relation_filepath('t_pkey'); | |
| pg_relation_filepath | |
| ---------------------- | |
| base/5/16387 | |
| (1 row) | |
| postgres=# \q | |
| # To mock the scenario, I overwrite a single block (block 10) of the index file with zeros | |
| postgres@ubuntu:/tmp$ dd if=/dev/zero of=db1/base/5/16387 seek=10 bs=8k count=1 oflag=direct conv=noerror,notrunc | |
| 1+0 records in | |
| 1+0 records out | |
| 8192 bytes (8.2 kB, 8.0 KiB) copied, 0.000306195 s, 26.8 MB/s | |
| postgres@ubuntu:/tmp$ pg_ctl -D db1 -l logfile stop | |
| waiting for server to shut down.... done | |
| server stopped | |
| postgres@ubuntu:/tmp$ find db1/base -type f | egrep -v 'vm|fsm' | xargs -I% ./find_zeros % | |
| ZERO_BLOCK_FOUND|db1/base/5/16387|Offset:81920|Block:10 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment