Last active
January 7, 2026 00:22
-
-
Save Hermann-SW/94241e719dd3518a3276d804b627319d to your computer and use it in GitHub Desktop.
Determine pi_{k^2+(k+1)^2}(10^n) (for all k<=10^n)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // NOLINT(legal/copyright) | |
| // g++ par.cc -O3 -Wall -pedantic -Wextra -o par -fopenmp | |
| // (cpplinted and cppchecked) | |
| // | |
| #include <string.h> | |
| #include <iostream> | |
| #include <cassert> | |
| #include <tuple> | |
| #include <queue> | |
| typedef __int128_t num; | |
| const int lev = 16; // results in ceil((fib(19)-1)/2) = 2090 work items | |
| num c; | |
| char *V; | |
| std::queue<std::tuple<num, num, bool, bool>> Q[2090]; | |
| int nxt = 0; | |
| const bool lft = false; | |
| const bool rgt = true; | |
| num other(num v, num p) { | |
| num x = 2*v*(v+1)+1; | |
| assert(x%p == 0); | |
| return x/p; | |
| } | |
| num next(num v, num p) { | |
| num r = v%p; | |
| return v + (2*r < p ? p-2*r-1 : 2*(p-r)-1); | |
| } | |
| void right(num v, num p, int l, bool b); | |
| void left(num v, num p, int l, bool b) { | |
| if (l == 0) { Q[nxt++].push({v, p, lft, b}); return; } | |
| V[v] = 1; | |
| num q = other(v, p); | |
| if (q != p) left(next(v, q), q, l-1, false); | |
| right(next(v, p), p, l-1, b); | |
| } | |
| void right(num v, num p, int l, bool b) { | |
| if (l == 0) { Q[nxt++].push({v, p, rgt, b}); return; } | |
| V[v] = 1; | |
| num q = other(v, p); | |
| left(next(v, p), p, l-1, false); | |
| if (b) right(next(v, q), q, l-1, true); | |
| } | |
| int main(int argc, char *argv[]) { | |
| assert(argc == 2); | |
| assert(atoi(argv[1]) >= 4); | |
| num mx = 1; | |
| for (int e = 1; e <= atoi(argv[1]); ++e) { | |
| mx *= 10; | |
| } | |
| V = new char[1 + mx]; | |
| assert(V); | |
| explicit_bzero(V, 1+mx); | |
| left(3, 5, lev, true); | |
| _Pragma("omp parallel for") | |
| for (int i = 0; i < 2090; ++i) { | |
| while (!Q[i].empty()) { | |
| num v = std::get<0>(Q[i].front()); | |
| num p = std::get<1>(Q[i].front()); | |
| bool l = std::get<2>(Q[i].front()); | |
| bool b = std::get<3>(Q[i].front()); | |
| Q[i].pop(); | |
| if (v > mx) continue; | |
| V[v] = 1; | |
| num q = other(v, p); | |
| if (l == lft) { | |
| if (q != p) Q[i].push({next(v, q), q, lft, false}); | |
| Q[i].push({next(v, p), p, rgt, b}); | |
| } else { | |
| Q[i].push({next(v, p), p, lft, false}); | |
| if (b) Q[i].push({next(v, q), q, rgt, true}); | |
| } | |
| } | |
| } // _Pragma("omp parallel for") | |
| num c = mx, i; | |
| for (i = 1; i <= mx; ++i) c-=V[i]; | |
| std::cout << static_cast<__int64_t>(c) << "\n"; | |
| return 0; | |
| } |
Author
Author
From previous comment, the first parallel version did take only 22min to determine a(10):
$ time numactl -C 0-15 ./par 10
614983774
real 22m40.991s
user 88m26.700s
sys 0m5.347s
$
Previous (sequential) computation of a(10) with PARI/GP took 23h on AMD 7950X CPU:
hermann@7950x:~$ gp -q
? a(n)=sum(k=1, 10^n, isprime(k^2+(k+1)^2));
? #
timer = 1 (on)
? a(10)
cpu time = 33h, 53min, 56,970 ms, real time = 23h, 3min, 55,987 ms.
614983774
?
Today I used parsum() instead of sum() on 192C/384T 8-socket server with slower Intel Xeon 8890v4 CPUS.
Reduction of runtime from 23h to 19 minutes ... wow:
hermann@x3950-X6:~$ numactl -C 0-191 gp -q
? a(n)=parsum(k=1, 10^n, isprime(k^2+(k+1)^2));
? #
timer = 1 (on)
? a(10)
cpu time = 59h, 57min, 42,370 ms, real time = 18min, 46,398 ms.
614983774
?
For numbers represented as 64bit long isprime() calls ispseudoprime() which is much faster than isprime() and is correct up to 2^64. Computation for 10^10 ks without ispseudoprime speedup takes 1h:
? parsum(k=10^10+1,2*10^10,isprime(k^2+(k+1)^2));
cpu time = 191h, 45min, 10,976 ms, real time = 1h, 3,472 ms.
?
Author
Followup work is happening in gist parallel_dfs_bfs_evenly.cc.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Started with 19200 %CPU, then quickly dropped.
For many hours did run with 300 %CPU, then more hours with 200 %CPU.
Finally completed after many more hours with 100 %CPU.
14:28h wall-clock time in total (71:40h user time):