/* NOLINT(legal/copyright)
f=parallel_dfs_bfs_evenly
g++ $f.cc -O3 -Wall -pedantic -Wextra -fopenmp -o $f
cpplint $f.cc
cppcheck --enable=all --suppress=missingIncludeSystem $f.cc --check-config
*/
#include <string.h>
#include <stdlib.h>  // atoi()
#include <omp.h>
#include <iostream>
#include <cassert>
#include <limits>
#include <tuple>
#include <queue>
typedef __int64_t num;
typedef __int128_t num2;
num c;
char *V;
// per-thread queues of work items (vertex v, prime p, side lft/rgt, flag)
std::queue<std::tuple<num, num, bool, bool>> *Q;
const bool lft = false;
const bool rgt = true;
/* Longest directed acyclic graph path is that for p=5 (3,6,8,11,...).
Path has 40% of 10^n vertices, so DAG height is 40 billion for n=11.
0 5 10 15 20 25 30
+ - - - - + - - - - + - - - - + - - - - + - - - - + - - - - +...
0+ _____3___ ________________________________20_..
! ^ \___ _____8__ _.. (*)
! ! ^ 6 \___ ____13__ _..
! ! ! \______ 11 \___ ____18__
! {3,5,lft,true} ! \ \.. 16 \___ _..
5+ ! 10_.. (@) \.. 21
! {6,5,rgt,true}--+ (#) \____ \..
! ^ \___
! {15,13,rgt,true}---------------+ \ _.. (*)
! 15 {20,29,lft,false}
10+ (#) 10²+11² == 14²+5² => 221 composite \..
. (@) 16²+17² == 23²+4² => 545 composite
\
(left half of) great-grandparent DAG #
https://oeis.org/A000071 3 (=5²) / \
6 / \
10 8 / \
15 23 11 20 (=29²) # #
21 23 27 41 13 37 / \ / \
28 52 28 61 40 88 64 16 59 49 # #
36......96..........106......92..........119 (=169²) / \ / \
#
/ \ */
// (v*v+(v+1)*(v+1))/p == (2*v*(v+1)+1)/p, clamped to num's max on overflow
num other(num v, num p) {
  num2 x = v;
  x = 2*x*(x+1)+1;
  assert(x%p == 0);
  x /= p;
  if (x > std::numeric_limits<num>::max()) {
    return std::numeric_limits<num>::max();
  }
  return x;
}
// next vertex after v on prime p's path, i.e. smallest v' > v with p | 2*v'*(v'+1)+1
num next(num v, num p) {
  num r = v%p;
  return v + (2*r < p ? p-2*r-1 : 2*(p-r)-1);
}
int main(int argc, char *argv[]) {
  assert(argc == 2);
  assert(atoi(argv[1]) >= 4);
  num mx = 1;
  for (int e = 1; e <= atoi(argv[1]); ++e) {
    mx *= 10;
  }
  V = new char[1 + mx];
  assert(V);
  explicit_bzero(V, 1+mx);
  int nproc = omp_get_max_threads();
  omp_set_num_threads(nproc);
  Q = new std::queue<std::tuple<num, num, bool, bool>>[nproc];
  assert(Q);
  __int64_t five = 1;  // {3, 5, lft, true}
  _Pragma("omp parallel for")
  for (int i = 0; i < nproc; ++i) {
    bool done = false;
    for (;;) {
      while (!Q[i].empty()) {
        num v = std::get<0>(Q[i].front());
        num p = std::get<1>(Q[i].front());
        bool l = std::get<2>(Q[i].front());
        bool b = std::get<3>(Q[i].front());
        Q[i].pop();
        if (v > mx) continue;
        V[v] = 1;
        num q = other(v, p);
        if (l == lft) {
          if (q != p) Q[i].push({next(v, q), q, lft, false});
          Q[i].push({next(v, p), p, rgt, b});
        } else {
          Q[i].push({next(v, p), p, lft, false});
          if (b) Q[i].push({next(v, q), q, rgt, true});
        }
      }
      __int64_t job;
      #pragma omp atomic capture
      job = five++;
      done = (5*(job >> 1) >= mx);
      if (!done) {
        num v = 5*(job >> 1) + 1 + 2*(job%2);
        num p = 5;
        bool l = (job%2 == 1);
        bool b = (job < 3);
        if (v > mx) {
          done = true;
        } else {
          V[v] = 1;
          num q = other(v, p);
          if (l == lft) {
            if (q != p) Q[i].push({next(v, q), q, lft, false});
          } else {
            if (b) Q[i].push({next(v, q), q, rgt, true});
          }
        }
      }
      if (done) break;
    }  // for(;;)
    // if (done) { std::cout << i << " done " << five << std::endl; continue; }
  }  // _Pragma("omp parallel for")
  num c = mx, i;
  for (i = 1; i <= mx; ++i) c -= V[i];
  std::cout << c << "\n";
  return 0;
}
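For orientation, here is a tiny standalone sketch (not part of the gist, the file name is made up) that walks the start of the p=5 path with the same other()/next() arithmetic; the printed vertices 3, 6, 8, 11, 13, 16 and cofactors 5, 17, 29, 53, 73, 109 match the DAG drawn in the comment block above.

/* walk_p5_path.cc -- illustrative only, duplicates other()/next() so it compiles standalone */
#include <cstdio>
typedef __int64_t num;
typedef __int128_t num2;
num other(num v, num p) {               // (v*v+(v+1)*(v+1))/p
  num2 x = v;
  x = 2*x*(x+1)+1;
  return static_cast<num>(x/p);
}
num next(num v, num p) {                // next vertex on prime p's path
  num r = v%p;
  return v + (2*r < p ? p-2*r-1 : 2*(p-r)-1);
}
int main() {
  num v = 3;                            // first vertex on the p=5 path
  for (int k = 0; k < 6; ++k) {
    std::printf("v=%lld  v^2+(v+1)^2=%lld  cofactor=%lld\n",
                static_cast<long long>(v),
                static_cast<long long>(2*v*(v+1)+1),
                static_cast<long long>(other(v, 5)));
    v = next(v, 5);                     // 3 -> 6 -> 8 -> 11 -> 13 -> 16
  }
  return 0;
}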
After 14:37 h of wall-clock time (9 days 10 h of user time) I stopped the computation,
because it had already taken longer than par.cc (14:28 h) from the other gist.
Resident memory was 351.8 GB, virtual memory 366.3 GB, and the process was still running at 200 %CPU.
Why did it take longer, even though the 192 cores were actively working for much longer than with par.cc?
par.cc first did a sequential DFS of height 16 and created 2090 (smaller) units of work.
parallel_dfs_bfs_evenly.cc instead split off all work from the p=5 path in the DAG.
One core got work item {6,17,rgt,true} and split {10,17,lft,false} from it,
and then split {15,13,rgt,true} from that.
So that core handles the p=13 and p=17 paths alone, with 2/13*10^11 and 2/17*10^11 vertices respectively.
These two work items alone account for 27,149,321,267 vertices to be processed by that single core :-(
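A quick standalone check of that number (my arithmetic, not part of the gist): each prime p covers about 2/p of all v ≤ 10^11, because 2*v*(v+1)+1 ≡ 0 (mod p) has two residue classes of solutions, which is also why the p=5 path covers 40% of all vertices.

// check_imbalance.cc -- reproduces the 27,149,321,267 vertex count quoted above
#include <cstdio>
int main() {
  const double n = 1e11;          // 10^11 vertices for n=11
  double p13 = 2.0/13*n;          // vertices on the p=13 path
  double p17 = 2.0/17*n;          // vertices on the p=17 path
  std::printf("%.0f + %.0f = %.0f\n", p13, p17, p13+p17);
  // prints: 15384615385 + 11764705882 = 27149321267
  return 0;
}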
The solution will be to efficiently determine whether other cores have run out of work.
This can be checked cheaply and without an atomic operation via (5*(five >> 1) >= mx),
which is true once no more work can be generated along the 40-billion-vertex p=5 path.
A core detecting this will pass work, in a safe way, to a core that has run out of work,
and then continue its own computation. That way all 192 cores should keep working the whole time.
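One possible shape for that hand-off, sketched below as a standalone toy (my construction, not the author's plan: Mailbox, the round-robin victim choice and the fixed item count are assumptions): each thread keeps a lock-protected mailbox; a thread with surplus local work pushes single items into other threads' mailboxes, and a thread whose own queue runs dry refills it from its mailbox. In the real code the cheap (5*(five >> 1) >= mx) test would gate the donation, and the donated items would be the (v, p, lft/rgt, flag) tuples.

// donate_sketch.cc -- toy model of handing work items between OpenMP threads
#include <omp.h>
#include <cstdio>
#include <queue>
#include <vector>
struct Mailbox {                     // one lock-protected inbox per thread
  omp_lock_t lock;
  std::queue<long long> items;       // stand-in for the real work-item tuples
};
int main() {
  const long long total = 1000000;   // fixed amount of toy work
  int nproc = omp_get_max_threads();
  std::vector<Mailbox> box(nproc);
  for (int i = 0; i < nproc; ++i) omp_init_lock(&box[i].lock);
  long long remaining = total;       // shared countdown, updated atomically
  long long processed = 0;
  #pragma omp parallel reduction(+:processed)
  {
    int me = omp_get_thread_num();
    std::queue<long long> Q;         // local queue, like Q[i] in the gist
    if (me == 0)                     // thread 0 starts with all the work
      for (long long v = 1; v <= total; ++v) Q.push(v);
    int victim = (me + 1) % nproc;   // round-robin donation target
    for (;;) {
      while (!Q.empty()) {
        long long v = Q.front();
        Q.pop();
        ++processed;                 // "process" the item
        #pragma omp atomic
        remaining--;
        // donate a surplus item now and then; in the real code the cheap
        // (5*(five >> 1) >= mx) test would decide when donating makes sense
        if (Q.size() > 1 && (v % 2 == 0)) {
          omp_set_lock(&box[victim].lock);
          box[victim].items.push(Q.front());
          omp_unset_lock(&box[victim].lock);
          Q.pop();
          victim = (victim + 1) % nproc;
          if (victim == me) victim = (victim + 1) % nproc;
        }
      }
      omp_set_lock(&box[me].lock);   // local queue empty: refill from own mailbox
      while (!box[me].items.empty()) {
        Q.push(box[me].items.front());
        box[me].items.pop();
      }
      omp_unset_lock(&box[me].lock);
      if (!Q.empty()) continue;
      long long left;
      #pragma omp atomic read
      left = remaining;
      if (left == 0) break;          // every item was processed somewhere
    }
  }
  for (int i = 0; i < nproc; ++i) omp_destroy_lock(&box[i].lock);
  std::printf("processed %lld of %lld items\n", processed, total);
  return 0;
}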
The array of size 1+10^n is needed for sieving, but the work items were stored as two __int128_t plus two bools.
__int128_t is only needed inside num other(num, num) to avoid overflow in its computation for n ≥ 10,
not outside, so the 351.8 GB of memory should shrink to only about 225 GB while doing the same computation.
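A rough size check of that estimate (my numbers, assuming the usual gcc/x86-64 layout): a queued tuple of two __int128_t plus two bools takes 48 bytes, the 64-bit variant only 24, so the roughly 250 GB of queued work items on top of the ~100 GB sieve array shrink to ~125 GB, giving about the 225 GB mentioned above.

// workitem_size.cc -- compares the two work-item layouts (sizes assume x86-64 gcc)
#include <cstdio>
#include <tuple>
int main() {
  std::printf("128-bit work item: %zu bytes\n",
              sizeof(std::tuple<__int128_t, __int128_t, bool, bool>));  // 48
  std::printf(" 64-bit work item: %zu bytes\n",
              sizeof(std::tuple<__int64_t, __int64_t, bool, bool>));    // 24
  return 0;
}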
Revision 4 turns the num type into a 64-bit integer; the new num2 type is a 128-bit integer used only inside num other(num, num).
Revision 3 adds documentation on the great-grandparent directed acyclic graph that gets traversed.
For n=11 the height of that DAG is 40 billion ...