cabecada/gist:ea2ad57765eda06c74f188cee73865d7

## gistfile1.txt

Zero-Copy Architecture (The mmap Advantage)
In a standard program, reading a file with read() costs a system call per read plus an extra copy of the data:

The Kernel reads the data from the disk into Kernel Space.

The Kernel then copies that data into your program's User Space buffer.

Your CPU finally looks at the data.

With mmap() (Memory Mapping), the file is mapped directly into your process's address space.
The program treats the file like a giant array in RAM.
The OS "pages in" the data directly where the CPU can see it, skipping the extra copy step.
For 24TB, skipping that copy saves a massive amount of CPU cycles.

Also, posix_madvise() with POSIX_MADV_SEQUENTIAL tells the Linux kernel: "I'm going to read this file start-to-finish."
The kernel can then read ahead from the RAID into your 384GB of RAM before the program asks for the data, keeping your 1GB/s pipe full.

now the code:

postgres@ubuntu:/tmp$ cat ~/find_zeros.c
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <string.h>

// Size in bytes of one block as scanned by this tool.
#define PG_BLOCK_SIZE 8192

// Read-only, all-zero reference block that file blocks are compared against.
// It is never written, so it is const; it is only used in this file, so it
// has internal linkage.
static const unsigned char ZERO_BLOCK[PG_BLOCK_SIZE] = {0};

/*
 * Scan one file for blocks that consist entirely of zero bytes.
 *
 * The file is mapped read-only and compared against ZERO_BLOCK, one
 * PG_BLOCK_SIZE-sized block at a time. For every all-zero block a line is
 * written to stdout:
 *
 *     ZERO_BLOCK_FOUND|<filename>|Offset:<byte offset>|Block:<block number>
 *
 * If the file size is not a multiple of PG_BLOCK_SIZE, the trailing partial
 * block is compared over the bytes that exist and reported the same way.
 *
 * filename: path of the file to scan.
 *
 * Returns nothing. Empty files are skipped silently. A failure to open, stat
 * or map the file is reported on stderr, so that an unreadable file is not
 * mistaken for a file without zero blocks.
 */
void scan_file(const char *filename) {
    int fd = open(filename, O_RDONLY);
    if (fd < 0) {
        perror(filename);
        return;
    }

    struct stat st;
    if (fstat(fd, &st) < 0) {
        perror(filename);
        close(fd);
        return;
    }
    if (st.st_size <= 0) {
        // Nothing to scan, and mmap() rejects a zero length.
        close(fd);
        return;
    }

    // Work in size_t from here on so the loop below never compares a signed
    // off_t with an unsigned offset, and refuse sizes that do not fit.
    size_t file_size = (size_t)st.st_size;
    if ((off_t)file_size != st.st_size) {
        fprintf(stderr, "%s: file too large to map in this address space\n", filename);
        close(fd);
        return;
    }

    // Map the file into memory
    unsigned char *map = mmap(NULL, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (map == MAP_FAILED) {
        perror(filename);
        close(fd);
        return;
    }

    // Advise the kernel that we read sequentially, to encourage readahead.
    // This is only a hint; the scan is still correct if it is rejected.
    (void)posix_madvise(map, file_size, POSIX_MADV_SEQUENTIAL);

    for (size_t offset = 0; offset < file_size; offset += PG_BLOCK_SIZE) {
        // Clamp the last comparison so it never reads past the mapping.
        size_t remaining = file_size - offset;
        size_t check_size = (remaining < PG_BLOCK_SIZE) ? remaining : PG_BLOCK_SIZE;

        if (memcmp(map + offset, ZERO_BLOCK, check_size) == 0) {
            printf("ZERO_BLOCK_FOUND|%s|Offset:%zu|Block:%zu\n",
                    filename, offset, offset / PG_BLOCK_SIZE);
        }
    }

    munmap(map, file_size);
    close(fd);
}

/*
 * Entry point: scan every file named on the command line.
 *
 * Accepting several paths lets callers such as xargs pass many files to one
 * process instead of starting one process per file. Passing a single path
 * behaves as before.
 *
 * Returns 1 (after printing a usage line on stderr) when no path is given,
 * 0 otherwise. The result of scan_file() is not consulted here.
 */
int main(int argc, char *argv[]) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s FILE...\n", argc > 0 ? argv[0] : "find_zeros");
        return 1;
    }

    for (int i = 1; i < argc; i++) {
        scan_file(argv[i]);
    }
    return 0;
}

-------------------------------------------------
gcc -O3 find_zeros.c -o find_zeros
--------------------------------------------------


postgres@ubuntu:/tmp$ pg_ctl -D db1 -l logfile start
waiting for server to start.... done
server started
postgres@ubuntu:/tmp$ psql
psql (17.5)
Type "help" for help.

postgres=# \di
             List of relations
 Schema |  Name  | Type  |  Owner   | Table
--------+--------+-------+----------+-------
 public | t_pkey | index | postgres | t
(1 row)

postgres=# select pg_relation_filepath('t_pkey');
 pg_relation_filepath
----------------------
 base/5/16387
(1 row)

postgres=# \q

# To mock the scenario, I zero out a single block (block 10) of the index file
postgres@ubuntu:/tmp$ dd if=/dev/zero of=db1/base/5/16387 seek=10 bs=8k count=1 oflag=direct conv=noerror,notrunc
1+0 records in
1+0 records out
8192 bytes (8.2 kB, 8.0 KiB) copied, 0.000306195 s, 26.8 MB/s

postgres@ubuntu:/tmp$ pg_ctl -D db1 -l logfile stop
waiting for server to shut down.... done
server stopped

postgres@ubuntu:/tmp$ find db1/base -type f | egrep -v 'vm|fsm' | xargs -I% ./find_zeros %
ZERO_BLOCK_FOUND|db1/base/5/16387|Offset:81920|Block:10

	Zero-Copy Architecture (The mmap Advantage)
	In a standard program, reading a file with read() costs a system call per read plus an extra copy of the data:

	The Kernel reads the data from the disk into Kernel Space.

	The Kernel then copies that data into your program's User Space buffer.

	Your CPU finally looks at the data.

	With mmap() (Memory Mapping), the file is mapped directly into your process's address space.
	The program treats the file like a giant array in RAM.
	The OS "pages in" the data directly where the CPU can see it, skipping the extra copy step.
	For 24TB, skipping that copy saves a massive amount of CPU cycles.

	Also, posix_madvise() with POSIX_MADV_SEQUENTIAL tells the Linux kernel: "I'm going to read this file start-to-finish."
	The kernel can then read ahead from the RAID into your 384GB of RAM before the program asks for the data, keeping your 1GB/s pipe full.

	now the code:

	postgres@ubuntu:/tmp$ cat ~/find_zeros.c
	#include <stdio.h>
	#include <stdlib.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/mman.h>
	#include <sys/stat.h>
	#include <string.h>

	// Size in bytes of one block as scanned by this tool.
	#define PG_BLOCK_SIZE 8192

	// Read-only, all-zero reference block that file blocks are compared against.
	// It is never written, so it is const; it is only used in this file, so it
	// has internal linkage.
	static const unsigned char ZERO_BLOCK[PG_BLOCK_SIZE] = {0};

	/*
	 * Scan one file for blocks that consist entirely of zero bytes.
	 *
	 * The file is mapped read-only and compared against ZERO_BLOCK, one
	 * PG_BLOCK_SIZE-sized block at a time. For every all-zero block a line is
	 * written to stdout:
	 *
	 *     ZERO_BLOCK_FOUND|<filename>|Offset:<byte offset>|Block:<block number>
	 *
	 * If the file size is not a multiple of PG_BLOCK_SIZE, the trailing partial
	 * block is compared over the bytes that exist and reported the same way.
	 *
	 * filename: path of the file to scan.
	 *
	 * Returns nothing. Empty files are skipped silently. A failure to open, stat
	 * or map the file is reported on stderr, so that an unreadable file is not
	 * mistaken for a file without zero blocks.
	 */
	void scan_file(const char *filename) {
	    int fd = open(filename, O_RDONLY);
	    if (fd < 0) {
	        perror(filename);
	        return;
	    }

	    struct stat st;
	    if (fstat(fd, &st) < 0) {
	        perror(filename);
	        close(fd);
	        return;
	    }
	    if (st.st_size <= 0) {
	        // Nothing to scan, and mmap() rejects a zero length.
	        close(fd);
	        return;
	    }

	    // Work in size_t from here on so the loop below never compares a signed
	    // off_t with an unsigned offset, and refuse sizes that do not fit.
	    size_t file_size = (size_t)st.st_size;
	    if ((off_t)file_size != st.st_size) {
	        fprintf(stderr, "%s: file too large to map in this address space\n", filename);
	        close(fd);
	        return;
	    }

	    // Map the file into memory
	    unsigned char *map = mmap(NULL, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
	    if (map == MAP_FAILED) {
	        perror(filename);
	        close(fd);
	        return;
	    }

	    // Advise the kernel that we read sequentially, to encourage readahead.
	    // This is only a hint; the scan is still correct if it is rejected.
	    (void)posix_madvise(map, file_size, POSIX_MADV_SEQUENTIAL);

	    for (size_t offset = 0; offset < file_size; offset += PG_BLOCK_SIZE) {
	        // Clamp the last comparison so it never reads past the mapping.
	        size_t remaining = file_size - offset;
	        size_t check_size = (remaining < PG_BLOCK_SIZE) ? remaining : PG_BLOCK_SIZE;

	        if (memcmp(map + offset, ZERO_BLOCK, check_size) == 0) {
	            printf("ZERO_BLOCK_FOUND|%s|Offset:%zu|Block:%zu\n",
	                    filename, offset, offset / PG_BLOCK_SIZE);
	        }
	    }

	    munmap(map, file_size);
	    close(fd);
	}

	/*
	 * Entry point: scan every file named on the command line.
	 *
	 * Accepting several paths lets callers such as xargs pass many files to one
	 * process instead of starting one process per file. Passing a single path
	 * behaves as before.
	 *
	 * Returns 1 (after printing a usage line on stderr) when no path is given,
	 * 0 otherwise. The result of scan_file() is not consulted here.
	 */
	int main(int argc, char *argv[]) {
	    if (argc < 2) {
	        fprintf(stderr, "usage: %s FILE...\n", argc > 0 ? argv[0] : "find_zeros");
	        return 1;
	    }

	    for (int i = 1; i < argc; i++) {
	        scan_file(argv[i]);
	    }
	    return 0;
	}

	-------------------------------------------------
	gcc -O3 find_zeros.c -o find_zeros
	--------------------------------------------------


	postgres@ubuntu:/tmp$ pg_ctl -D db1 -l logfile start
	waiting for server to start.... done
	server started
	postgres@ubuntu:/tmp$ psql
	psql (17.5)
	Type "help" for help.

	postgres=# \di
	             List of relations
	 Schema |  Name  | Type  |  Owner   | Table
	--------+--------+-------+----------+-------
	 public | t_pkey | index | postgres | t
	(1 row)

	postgres=# select pg_relation_filepath('t_pkey');
	pg_relation_filepath
	----------------------
	base/5/16387
	(1 row)

	postgres=# \q

	# To mock the scenario, I zero out a single block (block 10) of the index file
	postgres@ubuntu:/tmp$ dd if=/dev/zero of=db1/base/5/16387 seek=10 bs=8k count=1 oflag=direct conv=noerror,notrunc
	1+0 records in
	1+0 records out
	8192 bytes (8.2 kB, 8.0 KiB) copied, 0.000306195 s, 26.8 MB/s

	postgres@ubuntu:/tmp$ pg_ctl -D db1 -l logfile stop
	waiting for server to shut down.... done
	server stopped

	postgres@ubuntu:/tmp$ find db1/base -type f | egrep -v 'vm|fsm' | xargs -I% ./find_zeros %
	ZERO_BLOCK_FOUND|db1/base/5/16387|Offset:81920|Block:10
No results found