Skip to content

Instantly share code, notes, and snippets.

@CEXT-Dan
Last active October 21, 2025 05:26
Show Gist options
  • Select an option

  • Save CEXT-Dan/be21e162006a0afea6dee42d7a91677c to your computer and use it in GitHub Desktop.

Select an option

Save CEXT-Dan/be21e162006a0afea6dee42d7a91677c to your computer and use it in GitHub Desktop.
The One Billion Row Challenge
#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers
// Windows Header Files
#include <windows.h>
#include "tchar.h"
#include <iostream>
#include <string>
#include <format>
#include <mutex>
#include <unordered_map>
#include <execution>
#include <chrono>
#include <fstream>
#include <thread>
#include <cassert>
std::mutex map_mutex;
//-==-==-===-=-=-=-==-=-=-=-=--=-==-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=
// PerfTimer
class PerfTimer
{
std::chrono::high_resolution_clock::time_point t1;
std::chrono::high_resolution_clock::time_point t2;
public:
PerfTimer();
~PerfTimer() = default;
std::string end();
};
inline PerfTimer::PerfTimer()
{
t1 = std::chrono::high_resolution_clock::now();
}
inline std::string PerfTimer::end()
{
t2 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsedTime = duration_cast<std::chrono::duration<double>>(t2 - t1);
return std::format("\nDone! {} seconds", elapsedTime.count());
}
//-==-==-===-=-=-=-==-=-=-=-=--=-==-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=
// FileHnd
//-------------------------------------------------------------------------------------
//AcDbObjectUPtr
class FileHnd
{
public:
FileHnd(HANDLE h)
:_h(h)
{
}
~FileHnd()
{
if (CloseHandle(_h) == FALSE)
assert(0);
}
HANDLE hnd() const
{
return _h;
}
private:
HANDLE _h;
};
//-==-==-===-=-=-=-==-=-=-=-=--=-==-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=
// FileMap
class FileMap
{
public:
FileMap(LPVOID fmap, size_t len)
:_fmap((char*)fmap), _fend((char*)fmap + len), _len(len)
{
}
~FileMap()
{
if (UnmapViewOfFile(_fmap) == FALSE)
assert(0);
}
char* beginf() const
{
return _fmap;
}
char* endf() const
{
return _fend;
}
size_t size() const
{
return _len;
}
private:
char* _fmap = nullptr;
char* _fend = nullptr;
size_t _len = 0;
};
//-==-==-===-=-=-=-==-=-=-=-=--=-==-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=
// Value
struct Temperature
{
int64_t sum;
int64_t count;
int minv;
int maxv;
Temperature()
: sum(0), count(0),
minv(300),
maxv(-300)
{
}
inline void set(int v) {
sum += v;
count++;
if (v < minv) minv = v;
if (v > maxv) maxv = v;
}
inline void merge(const Temperature& o)
{
sum += o.sum;
count += o.count;
if (o.minv < minv) minv = o.minv;
if (o.maxv > maxv) maxv = o.maxv;
}
inline const std::string toString() const
{
double avg = ((double)sum / count) / 10.0;
return std::format("{:.1f}/{:.1f}/{:.1f}", (minv / 10.0), avg, maxv / 10.0);
}
};
using TempPackage = std::pair<std::string, Temperature>;
//14.3 to 143
inline static int faststoi(const char* input, size_t sz)
{
int v = 0;
bool neg = false;
const char* dec_prt = input;
const char* dec_prt_end = dec_prt + sz;
if (*dec_prt == '-')
{
neg = true;
dec_prt++;
}
int ip = 0;
while (dec_prt < dec_prt_end && *dec_prt != '.')
{
ip = ip * 10 + (*dec_prt - '0');
dec_prt++;
}
dec_prt++; // skip dot
int dp = *dec_prt - '0';
v = ip * 10 + dp;
if (neg)
v = -v;
return v;
}
//-==-==-===-=-=-=-==-=-=-=-=--=-==-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=
// splitFileIntoChunks
static std::vector<std::string_view> splitFileIntoChunks(const std::string_view& fileContent, size_t fileSize)
{
std::vector<std::string_view> chunks;
const size_t nthreads = std::thread::hardware_concurrency() ? std::thread::hardware_concurrency() : 4;
if (fileSize == 0 || fileContent.empty()) {
return chunks;
}
chunks.reserve(nthreads);
const size_t approxChunkSize = fileSize / nthreads;
size_t currentPos = 0;
for (size_t threadIdx = 0; threadIdx < nthreads; ++threadIdx) {
const size_t chunkStart = currentPos;
if (threadIdx == nthreads - 1) {
// Last chunk gets the remainder
chunks.emplace_back(fileContent.substr(chunkStart));
}
else {
// Find the next newline after the approximate chunk boundary
size_t chunkEnd = (threadIdx + 1) * approxChunkSize;
while (chunkEnd < fileSize && fileContent[chunkEnd] != '\n') {
++chunkEnd;
}
// Include the newline in the chunk
if (chunkEnd < fileSize) {
++chunkEnd;
}
chunks.emplace_back(fileContent.substr(chunkStart, chunkEnd - chunkStart));
currentPos = chunkEnd;
}
}
return chunks;
}
// main
int main(int argc, char* argv[])
{
if (argc != 3)
{
std::cout << "Usage: " << argv[0] << " <input_file> <output_file>\n";
return 1;
}
const std::string inpath(argv[1]);
const std::string outpath(argv[2]);
PerfTimer timer;
//read file into memory
FileHnd fh(CreateFileA(inpath.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL));
if (fh.hnd() == INVALID_HANDLE_VALUE)
{
std::cout << "Error: Could not open input file '" << inpath << "'\n";
return 1;
}
LARGE_INTEGER fileSize;
if (!GetFileSizeEx(fh.hnd(), &fileSize))
{
std::cout << "Failed to get file size for: '" << inpath << "'\n";
return 1;
}
FileHnd fhm(CreateFileMapping(fh.hnd(), NULL, PAGE_READONLY, 0, 0, NULL));
if (fhm.hnd() != INVALID_HANDLE_VALUE)
{
size_t cb = static_cast<size_t>(fileSize.QuadPart);
FileMap mv(MapViewOfFile(fhm.hnd(), FILE_MAP_READ, 0, 0, cb), cb);
std::string_view fullView{ mv.beginf(), mv.size() };
//split fullView into chunks
std::vector<std::string_view> chunks = splitFileIntoChunks(fullView, cb);
//split the city name and dec part Kunming;19.8 so we can compute the avg
constexpr size_t reserve = 50000;
std::unordered_map<std::string_view, Temperature> gmap;
gmap.reserve(reserve);
std::for_each(std::execution::par, chunks.begin(), chunks.end(), [&](const std::string_view& v)
{
std::unordered_map<std::string_view, Temperature> tmap;
tmap.reserve(reserve);
for (auto iter = v.data(), endp = v.data() + v.length(); iter < endp; iter++)
{
//city
char* p = (char*)memchr(iter, ';', endp - iter);
if (!p) [[unlikely]]
continue;
std::string_view svcity(iter, p - iter);
iter = p + 1;
//dec
char* q = (char*)memchr(iter, '\n', endp - iter);
if (!q) [[unlikely]]
continue;
tmap[svcity].set(faststoi(iter, (q - 1) - p));
iter = q;
}
{
std::lock_guard<std::mutex> guard(map_mutex);
if (gmap.size() == 0)
{
std::swap(tmap, gmap);
}
else
{
for (const auto& item : tmap)
gmap[item.first].merge(item.second);
}
}
});
//sort
std::vector<TempPackage> vec;
{
vec.reserve(gmap.size());
for (const auto& item : gmap)
vec.emplace_back(TempPackage{ item.first ,item.second });
std::sort(std::execution::par, vec.begin(), vec.end(), [&](const TempPackage& l, const TempPackage& r)
{
return l.first < r.first;
});
}
//write
std::ofstream outFile(outpath);
if (outFile.is_open())
{
bool first = true;
outFile << '{';
for (const auto& [city, temperature] : vec)
{
std::string fmt = ", {}={}";
if (first)
{
fmt = "{}={}";
first = false;
}
outFile << std::vformat(fmt, std::make_format_args(city, temperature.toString()));
}
outFile << '}';
outFile.close();
}
std::cout << timer.end() << "\n";
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment