Last active
January 7, 2026 00:22
-
-
Save Hermann-SW/94241e719dd3518a3276d804b627319d to your computer and use it in GitHub Desktop.
Determine pi_{k^2+(k+1)^2}(10^n) (for all k<=10^n)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // NOLINT(legal/copyright) | |
| // g++ par.cc -O3 -Wall -pedantic -Wextra -o par -fopenmp | |
| // (cpplinted and cppchecked) | |
| // | |
| #include <string.h> | |
| #include <iostream> | |
| #include <cassert> | |
| #include <tuple> | |
| #include <queue> | |
| typedef __int128_t num; | |
| const int lev = 16; // results in ceil((fib(19)-1)/2) = 2090 work items | |
| num c; | |
| char *V; | |
| std::queue<std::tuple<num, num, bool, bool>> Q[2090]; | |
| int nxt = 0; | |
| const bool lft = false; | |
| const bool rgt = true; | |
| num other(num v, num p) { | |
| num x = 2*v*(v+1)+1; | |
| assert(x%p == 0); | |
| return x/p; | |
| } | |
| num next(num v, num p) { | |
| num r = v%p; | |
| return v + (2*r < p ? p-2*r-1 : 2*(p-r)-1); | |
| } | |
| void right(num v, num p, int l, bool b); | |
| void left(num v, num p, int l, bool b) { | |
| if (l == 0) { Q[nxt++].push({v, p, lft, b}); return; } | |
| V[v] = 1; | |
| num q = other(v, p); | |
| if (q != p) left(next(v, q), q, l-1, false); | |
| right(next(v, p), p, l-1, b); | |
| } | |
| void right(num v, num p, int l, bool b) { | |
| if (l == 0) { Q[nxt++].push({v, p, rgt, b}); return; } | |
| V[v] = 1; | |
| num q = other(v, p); | |
| left(next(v, p), p, l-1, false); | |
| if (b) right(next(v, q), q, l-1, true); | |
| } | |
| int main(int argc, char *argv[]) { | |
| assert(argc == 2); | |
| assert(atoi(argv[1]) >= 4); | |
| num mx = 1; | |
| for (int e = 1; e <= atoi(argv[1]); ++e) { | |
| mx *= 10; | |
| } | |
| V = new char[1 + mx]; | |
| assert(V); | |
| explicit_bzero(V, 1+mx); | |
| left(3, 5, lev, true); | |
| _Pragma("omp parallel for") | |
| for (int i = 0; i < 2090; ++i) { | |
| while (!Q[i].empty()) { | |
| num v = std::get<0>(Q[i].front()); | |
| num p = std::get<1>(Q[i].front()); | |
| bool l = std::get<2>(Q[i].front()); | |
| bool b = std::get<3>(Q[i].front()); | |
| Q[i].pop(); | |
| if (v > mx) continue; | |
| V[v] = 1; | |
| num q = other(v, p); | |
| if (l == lft) { | |
| if (q != p) Q[i].push({next(v, q), q, lft, false}); | |
| Q[i].push({next(v, p), p, rgt, b}); | |
| } else { | |
| Q[i].push({next(v, p), p, lft, false}); | |
| if (b) Q[i].push({next(v, q), q, rgt, true}); | |
| } | |
| } | |
| } // _Pragma("omp parallel for") | |
| num c = mx, i; | |
| for (i = 1; i <= mx; ++i) c-=V[i]; | |
| std::cout << static_cast<__int64_t>(c) << "\n"; | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
From previous comment, the first parallel version did take only 22min to determine a(10):
Previous (sequential) computation of a(10) with PARI/GP took 23h on AMD 7950X CPU:
Today I used parsum() instead of sum() on 192C/384T 8-socket server with slower Intel Xeon 8890v4 CPUS.
Reduction of runtime from 23h to 19 minutes ... wow:
For numbers represented as 64bit long isprime() calls ispseudoprime() which is much faster than isprime() and is correct up to 2^64. Computation for 10^10 ks without ispseudoprime speedup takes 1h: