/* NOLINT(legal/copyright)
f=parallel_dfs_bfs_evenly
g++ $f.cc -O3 -Wall -pedantic -Wextra -fopenmp -o $f
cpplint $f.cc
cppcheck --enable=all --suppress=missingIncludeSystem $f.cc --check-config
*/
#include <string.h>
#include <omp.h>
#include <cstdlib>
#include <iostream>
#include <cassert>
#include <limits>
#include <tuple>
#include <queue>
typedef __int64_t num;    // 64-bit vertex/modulus type (revision 4)
typedef __int128_t num2;  // 128-bit type, only used inside other()
num c;
char *V;  // sieve array over vertices 1..mx
std::queue<std::tuple<num, num, bool, bool>> *Q;  // one work queue per thread
const bool lft = false;
const bool rgt = true;
/* Longest directed acyclic graph path is that for p=5 (3,6,8,11,...).
Path has 40% of 10^n vertices, so DAG height is 40 billion for n=11.
0 5 10 15 20 25 30
+ - - - - + - - - - + - - - - + - - - - + - - - - + - - - - +...
0+ _____3___ ________________________________20_..
! ^ \___ _____8__ _.. (*)
! ! ^ 6 \___ ____13__ _..
! ! ! \______ 11 \___ ____18__
! {3,5,lft,true} ! \ \.. 16 \___ _..
5+ ! 10_.. (@) \.. 21
! {6,5,rgt,true}--+ (#) \____ \..
! ^ \___
! {15,13,rgt,true}---------------+ \ _.. (*)
! 15 {20,29,lft,false}
10+ (#) 10²+11² == 14²+5² => 221 composite \..
. (@) 16²+17² == 23²+4² => 545 composite
\
(left half of) great-grandparent DAG #
https://oeis.org/A000071 3 (=5²) / \
6 / \
10 8 / \
15 23 11 20 (=29²) # #
21 23 27 41 13 37 / \ / \
28 52 28 61 40 88 64 16 59 49 # #
36......96..........106......92..........119 (=169²) / \ / \
#
/ \ */

// Returns the cofactor q with p*q == v*v + (v+1)*(v+1) == 2*v*(v+1)+1,
// computed in 128 bits to avoid overflow for n >= 10; clamped to num's max.
num other(num v, num p) {
  num2 x = v;
  x = 2*x*(x+1)+1;
  assert(x%p == 0);
  x /= p;
  if (x > std::numeric_limits<num>::max()) {
    return std::numeric_limits<num>::max();
  }
  return x;
}

// Returns the next vertex v' > v in the mirror residue class (v' == p-1-v
// mod p); if p divides v*v+(v+1)*(v+1), it also divides v'*v'+(v'+1)*(v'+1).
num next(num v, num p) {
  num r = v%p;
  return v + (2*r < p ? p-2*r-1 : 2*(p-r)-1);
}

int main(int argc, char *argv[]) {
  assert(argc == 2);
  assert(atoi(argv[1]) >= 4);
  num mx = 1;
  for (int e = 1; e <= atoi(argv[1]); ++e) {
    mx *= 10;  // mx = 10^n
  }
  V = new char[1 + mx];
  assert(V);
  explicit_bzero(V, 1+mx);
  int nproc = omp_get_max_threads();
  omp_set_num_threads(nproc);
  Q = new std::queue<std::tuple<num, num, bool, bool>>[nproc];
  assert(Q);
  __int64_t five = 1;  // shared job counter; {3, 5, lft, true}
  _Pragma("omp parallel for")
  for (int i = 0; i < nproc; ++i) {
    bool done = false;
    for (;;) {
      // First drain this thread's own work queue.
      while (!Q[i].empty()) {
        num v = std::get<0>(Q[i].front());
        num p = std::get<1>(Q[i].front());
        bool l = std::get<2>(Q[i].front());
        bool b = std::get<3>(Q[i].front());
        Q[i].pop();
        if (v > mx) continue;
        V[v] = 1;
        num q = other(v, p);
        if (l == lft) {
          if (q != p) Q[i].push({next(v, q), q, lft, false});
          Q[i].push({next(v, p), p, rgt, b});
        } else {
          Q[i].push({next(v, p), p, lft, false});
          if (b) Q[i].push({next(v, q), q, rgt, true});
        }
      }
      // Then atomically fetch the next start vertex on the p=5 path.
      __int64_t job;
      #pragma omp atomic capture
      job = five++;
      done = (5*(job >> 1) >= mx);
      if (!done) {
        num v = 5*(job >> 1) + 1 + 2*(job%2);  // p=5 path: 3, 6, 8, 11, 13, ...
        num p = 5;
        bool l = (job%2 == 1);
        bool b = (job < 3);
        if (v > mx) {
          done = true;
        } else {
          V[v] = 1;
          num q = other(v, p);
          if (l == lft) {
            if (q != p) Q[i].push({next(v, q), q, lft, false});
          } else {
            if (b) Q[i].push({next(v, q), q, rgt, true});
          }
        }
      }
      if (done) break;
    }  // for(;;)
    // if (done) { std::cout << i << " done " << five << std::endl; continue; }
  }  // _Pragma("omp parallel for")
  num c = mx, i;
  for (i = 1; i <= mx; ++i) c -= V[i];
  std::cout << c << "\n";  // count of unmarked v in [1, mx]
  return 0;
}
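To make the two helper functions concrete, here is a standalone sketch (not part of the gist file; it uses simplified copies of other() and next(), dropping the overflow clamp) that re-derives two edges of the diagram in the comment above:

// Sketch only: re-derive DAG edges 10->15 (p=13) and 10->23 (p=17).
// v*v + (v+1)*(v+1) == 2*v*(v+1) + 1, so vertex v=10 gives 221 == 13*17.
#include <cassert>
#include <cstdint>
#include <iostream>

typedef int64_t num;       // the gist uses __int64_t
typedef __int128_t num2;   // GCC/Clang builtin 128-bit integer

num other(num v, num p) {  // cofactor q with p*q == v*v + (v+1)*(v+1)
  num2 x = v;
  x = 2*x*(x+1)+1;
  assert(x % p == 0);
  return static_cast<num>(x / p);  // overflow clamp from the gist omitted
}

num next(num v, num p) {   // next v' > v with v' == p-1-v (mod p)
  num r = v % p;
  return v + (2*r < p ? p-2*r-1 : 2*(p-r)-1);
}

int main() {
  assert(other(10, 13) == 17);  // 10^2+11^2 == 221 == 13*17, composite
  assert(next(10, 13) == 15);   // 15^2+16^2 == 481 == 13*37
  assert(next(10, 17) == 23);   // 23^2+24^2 == 1105 == 5*13*17
  std::cout << "edges 10->15 (p=13) and 10->23 (p=17) confirmed\n";
  return 0;
}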
After 21 minutes of runtime (60h user time) the process uses 130.3g resident and 141.8g virtual memory.
Still running at around 19200 %CPU.
After 50 minutes (6d+13h (!) user time):
164.9g resident, 177.0g virtual memory, still 19200 %CPU (distributing work evenly seems to work).
Babysitting the run was useful; after 1:04h (7d+21h user time):
192.6g resident, 205.8g virtual memory for parallel_dfs_bfs_evenly.
Now only around 1100 %CPU is used, so only 11 cores still have work to do.
After 6h wall-clock time (8d+16h user time):
354.8g resident, 369.2g virtual memory, now at 300 %CPU.
After 10h wall-clock time (9d+1h user time):
351.8g resident, 366.3g virtual memory, now at only 200 %CPU, with two cores left that have work.
Revision 2 is a reformatting (indentation) only, as a basis for future work.
Revision 3 adds documentation of the great-grandparent directed acyclic graph that gets traversed.
For n=11 the height of that DAG is 40 billion ...
After 14:37h wall-clock time (9d+10h user time) I stopped the computation,
because it had already taken longer than par.cc (14:28h) from the other gist.
Resident memory 351.8g, virtual memory 366.3g, still 200 %CPU.
Why did it take longer, even though all 192 cores were actively working for much longer?
par.cc first did a sequential DFS of height 16 and created 2090 (smaller) units of work.
parallel_dfs_bfs_evenly.cc split off all work from the p=5 path in the DAG.
One core got work item {6,17,rgt,true} and split {10,17,lft,false} from it,
and then split {15,13,rgt,true} from that.
So that core handles the p=13 and p=17 paths alone, with roughly 2/13*10^11 and 2/17*10^11 vertices.
These two work items alone account for 27,149,321,267 vertices to be processed by that single core :-(
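A quick cross-check of that count (a standalone sketch, not part of the gist): each p-path hits exactly two residue classes mod p, so it contains roughly 2/p of the 10^11 vertices.

// Sketch only: rough vertex count of the p=13 and p=17 paths below 10^11.
#include <cstdint>
#include <iostream>

int main() {
  const int64_t mx = 100000000000LL;  // 10^11
  const int64_t p13 = 2 * (mx / 13);  // vertices on the p=13 path
  const int64_t p17 = 2 * (mx / 17);  // vertices on the p=17 path
  std::cout << p13 + p17 << "\n";     // prints 27149321266, matching the
  return 0;                           // figure above up to rounding
}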
The solution will be to efficiently determine whether other cores have run out of work.
This can be checked efficiently, and without an atomic operation, via (5*(five >> 1) >= mx).
It is true once no more work can be generated along the 40-billion-vertex p=5 path.
A core detecting that will pass work, in a safe way, to a core that has run out of work,
and then continue its computation. That way all 192 cores should keep working the whole time.
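A small standalone sketch (not part of the gist, and not showing the actual handoff) of why that cheap test is safe: it fires exactly after the last p=5-path start vertex <= mx has been handed out.

// Sketch only: verify the exhaustion test 5*(job >> 1) >= mx against the
// start vertices v = 5*(job >> 1) + 1 + 2*(job % 2) handed out in the gist.
#include <cassert>
#include <iostream>

int main() {
  const long long mx = 100000;  // small stand-in for 10^n
  long long five = 1;           // job counter (atomic capture in the gist)
  long long last_v = 0;
  for (;;) {
    long long job = five++;
    if (5 * (job >> 1) >= mx) break;  // the proposed cheap test
    long long v = 5 * (job >> 1) + 1 + 2 * (job % 2);
    assert(v <= mx);            // the test never stops too late
    last_v = v;
  }
  std::cout << "last handed-out p=5 start vertex: " << last_v
            << " of mx = " << mx << "\n";  // prints 99998 of 100000
  return 0;
}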
The array of size 1+10^n is needed for sieving.
But the work items get stored as two __int128_t and two bools each.
__int128_t is only needed inside num other(num, num), to avoid overflow in its computation for n≥10,
but not outside, so the 351.8g of memory should shrink to roughly 225g while doing the same computation.
Revision 4 turns the num type into a 64-bit integer; the new num2 type is a 128-bit integer used only inside num other(num, num).
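A back-of-the-envelope check of that 225g estimate (a standalone sketch; it ignores std::deque chunking and allocator overhead, so the numbers are only indicative):

// Sketch only: estimate the memory saving of 64-bit work items.
#include <cstdint>
#include <iostream>
#include <tuple>

int main() {
  const double gib = 1024.0 * 1024 * 1024;
  const double sieve = (1e11 + 1) / gib;   // char array for n=11: ~93 GiB
  const double observed = 351.8;           // resident size reported by top
  const double queued = observed - sieve;  // rough share held by work items

  // Per-entry size before (two __int128_t) and after (two 64-bit) revision 4:
  const auto before = sizeof(std::tuple<__int128_t, __int128_t, bool, bool>);  // 48 on x86-64
  const auto after = sizeof(std::tuple<int64_t, int64_t, bool, bool>);         // 24 on x86-64
  const double estimate = sieve + queued * after / before;

  std::cout << "sieve " << sieve << " GiB, queued " << queued
            << " GiB, estimate after revision 4: " << estimate << " GiB\n";
  return 0;
}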
Follow-up on this gist:
https://gist.github.com/Hermann-SW/94241e719dd3518a3276d804b627319d
The mixed DFS/BFS algorithm should produce a highly parallel workload for much longer than the previous gist.
Ongoing run on an 8-socket 192C/384T Lenovo x3950-X6 server with Intel Xeon 8890v4 CPUs.
Started with:
nohup ./doit2
After 16 minutes of runtime (46h user time already) the parallel_dfs_bfs_evenly process uses 125.9g resident and 137.4g virtual memory, still running with 19200 %CPU.