Last active
October 21, 2025 05:26
-
-
Save CEXT-Dan/be21e162006a0afea6dee42d7a91677c to your computer and use it in GitHub Desktop.
The One Billion Row Challenge
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers | |
| // Windows Header Files | |
| #include <windows.h> | |
| #include "tchar.h" | |
| #include <iostream> | |
| #include <string> | |
| #include <format> | |
| #include <mutex> | |
| #include <unordered_map> | |
| #include <execution> | |
| #include <chrono> | |
| #include <fstream> | |
| #include <thread> | |
| #include <cassert> | |
| std::mutex map_mutex; | |
| //-==-==-===-=-=-=-==-=-=-=-=--=-==-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--= | |
| // PerfTimer | |
| class PerfTimer | |
| { | |
| std::chrono::high_resolution_clock::time_point t1; | |
| std::chrono::high_resolution_clock::time_point t2; | |
| public: | |
| PerfTimer(); | |
| ~PerfTimer() = default; | |
| std::string end(); | |
| }; | |
| inline PerfTimer::PerfTimer() | |
| { | |
| t1 = std::chrono::high_resolution_clock::now(); | |
| } | |
| inline std::string PerfTimer::end() | |
| { | |
| t2 = std::chrono::high_resolution_clock::now(); | |
| std::chrono::duration<double> elapsedTime = duration_cast<std::chrono::duration<double>>(t2 - t1); | |
| return std::format("\nDone! {} seconds", elapsedTime.count()); | |
| } | |
| //-==-==-===-=-=-=-==-=-=-=-=--=-==-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--= | |
| // FileHnd | |
| //------------------------------------------------------------------------------------- | |
| //AcDbObjectUPtr | |
| class FileHnd | |
| { | |
| public: | |
| FileHnd(HANDLE h) | |
| :_h(h) | |
| { | |
| } | |
| ~FileHnd() | |
| { | |
| if (CloseHandle(_h) == FALSE) | |
| assert(0); | |
| } | |
| HANDLE hnd() const | |
| { | |
| return _h; | |
| } | |
| private: | |
| HANDLE _h; | |
| }; | |
| //-==-==-===-=-=-=-==-=-=-=-=--=-==-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--= | |
| // FileMap | |
| class FileMap | |
| { | |
| public: | |
| FileMap(LPVOID fmap, size_t len) | |
| :_fmap((char*)fmap), _fend((char*)fmap + len), _len(len) | |
| { | |
| } | |
| ~FileMap() | |
| { | |
| if (UnmapViewOfFile(_fmap) == FALSE) | |
| assert(0); | |
| } | |
| char* beginf() const | |
| { | |
| return _fmap; | |
| } | |
| char* endf() const | |
| { | |
| return _fend; | |
| } | |
| size_t size() const | |
| { | |
| return _len; | |
| } | |
| private: | |
| char* _fmap = nullptr; | |
| char* _fend = nullptr; | |
| size_t _len = 0; | |
| }; | |
| //-==-==-===-=-=-=-==-=-=-=-=--=-==-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--= | |
| // Value | |
| struct Temperature | |
| { | |
| int64_t sum; | |
| int64_t count; | |
| int minv; | |
| int maxv; | |
| Temperature() | |
| : sum(0), count(0), | |
| minv(300), | |
| maxv(-300) | |
| { | |
| } | |
| inline void set(int v) { | |
| sum += v; | |
| count++; | |
| if (v < minv) minv = v; | |
| if (v > maxv) maxv = v; | |
| } | |
| inline void merge(const Temperature& o) | |
| { | |
| sum += o.sum; | |
| count += o.count; | |
| if (o.minv < minv) minv = o.minv; | |
| if (o.maxv > maxv) maxv = o.maxv; | |
| } | |
| inline const std::string toString() const | |
| { | |
| double avg = ((double)sum / count) / 10.0; | |
| return std::format("{:.1f}/{:.1f}/{:.1f}", (minv / 10.0), avg, maxv / 10.0); | |
| } | |
| }; | |
| using TempPackage = std::pair<std::string, Temperature>; | |
| //14.3 to 143 | |
| inline static int faststoi(const char* input, size_t sz) | |
| { | |
| int v = 0; | |
| bool neg = false; | |
| const char* dec_prt = input; | |
| const char* dec_prt_end = dec_prt + sz; | |
| if (*dec_prt == '-') | |
| { | |
| neg = true; | |
| dec_prt++; | |
| } | |
| int ip = 0; | |
| while (dec_prt < dec_prt_end && *dec_prt != '.') | |
| { | |
| ip = ip * 10 + (*dec_prt - '0'); | |
| dec_prt++; | |
| } | |
| dec_prt++; // skip dot | |
| int dp = *dec_prt - '0'; | |
| v = ip * 10 + dp; | |
| if (neg) | |
| v = -v; | |
| return v; | |
| } | |
| //-==-==-===-=-=-=-==-=-=-=-=--=-==-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--= | |
| // splitFileIntoChunks | |
| static std::vector<std::string_view> splitFileIntoChunks(const std::string_view& fileContent, size_t fileSize) | |
| { | |
| std::vector<std::string_view> chunks; | |
| const size_t nthreads = std::thread::hardware_concurrency() ? std::thread::hardware_concurrency() : 4; | |
| if (fileSize == 0 || fileContent.empty()) { | |
| return chunks; | |
| } | |
| chunks.reserve(nthreads); | |
| const size_t approxChunkSize = fileSize / nthreads; | |
| size_t currentPos = 0; | |
| for (size_t threadIdx = 0; threadIdx < nthreads; ++threadIdx) { | |
| const size_t chunkStart = currentPos; | |
| if (threadIdx == nthreads - 1) { | |
| // Last chunk gets the remainder | |
| chunks.emplace_back(fileContent.substr(chunkStart)); | |
| } | |
| else { | |
| // Find the next newline after the approximate chunk boundary | |
| size_t chunkEnd = (threadIdx + 1) * approxChunkSize; | |
| while (chunkEnd < fileSize && fileContent[chunkEnd] != '\n') { | |
| ++chunkEnd; | |
| } | |
| // Include the newline in the chunk | |
| if (chunkEnd < fileSize) { | |
| ++chunkEnd; | |
| } | |
| chunks.emplace_back(fileContent.substr(chunkStart, chunkEnd - chunkStart)); | |
| currentPos = chunkEnd; | |
| } | |
| } | |
| return chunks; | |
| } | |
| // main | |
| int main(int argc, char* argv[]) | |
| { | |
| if (argc != 3) | |
| { | |
| std::cout << "Usage: " << argv[0] << " <input_file> <output_file>\n"; | |
| return 1; | |
| } | |
| const std::string inpath(argv[1]); | |
| const std::string outpath(argv[2]); | |
| PerfTimer timer; | |
| //read file into memory | |
| FileHnd fh(CreateFileA(inpath.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL)); | |
| if (fh.hnd() == INVALID_HANDLE_VALUE) | |
| { | |
| std::cout << "Error: Could not open input file '" << inpath << "'\n"; | |
| return 1; | |
| } | |
| LARGE_INTEGER fileSize; | |
| if (!GetFileSizeEx(fh.hnd(), &fileSize)) | |
| { | |
| std::cout << "Failed to get file size for: '" << inpath << "'\n"; | |
| return 1; | |
| } | |
| FileHnd fhm(CreateFileMapping(fh.hnd(), NULL, PAGE_READONLY, 0, 0, NULL)); | |
| if (fhm.hnd() != INVALID_HANDLE_VALUE) | |
| { | |
| size_t cb = static_cast<size_t>(fileSize.QuadPart); | |
| FileMap mv(MapViewOfFile(fhm.hnd(), FILE_MAP_READ, 0, 0, cb), cb); | |
| std::string_view fullView{ mv.beginf(), mv.size() }; | |
| //split fullView into chunks | |
| std::vector<std::string_view> chunks = splitFileIntoChunks(fullView, cb); | |
| //split the city name and dec part Kunming;19.8 so we can compute the avg | |
| constexpr size_t reserve = 50000; | |
| std::unordered_map<std::string_view, Temperature> gmap; | |
| gmap.reserve(reserve); | |
| std::for_each(std::execution::par, chunks.begin(), chunks.end(), [&](const std::string_view& v) | |
| { | |
| std::unordered_map<std::string_view, Temperature> tmap; | |
| tmap.reserve(reserve); | |
| for (auto iter = v.data(), endp = v.data() + v.length(); iter < endp; iter++) | |
| { | |
| //city | |
| char* p = (char*)memchr(iter, ';', endp - iter); | |
| if (!p) [[unlikely]] | |
| continue; | |
| std::string_view svcity(iter, p - iter); | |
| iter = p + 1; | |
| //dec | |
| char* q = (char*)memchr(iter, '\n', endp - iter); | |
| if (!q) [[unlikely]] | |
| continue; | |
| tmap[svcity].set(faststoi(iter, (q - 1) - p)); | |
| iter = q; | |
| } | |
| { | |
| std::lock_guard<std::mutex> guard(map_mutex); | |
| if (gmap.size() == 0) | |
| { | |
| std::swap(tmap, gmap); | |
| } | |
| else | |
| { | |
| for (const auto& item : tmap) | |
| gmap[item.first].merge(item.second); | |
| } | |
| } | |
| }); | |
| //sort | |
| std::vector<TempPackage> vec; | |
| { | |
| vec.reserve(gmap.size()); | |
| for (const auto& item : gmap) | |
| vec.emplace_back(TempPackage{ item.first ,item.second }); | |
| std::sort(std::execution::par, vec.begin(), vec.end(), [&](const TempPackage& l, const TempPackage& r) | |
| { | |
| return l.first < r.first; | |
| }); | |
| } | |
| //write | |
| std::ofstream outFile(outpath); | |
| if (outFile.is_open()) | |
| { | |
| bool first = true; | |
| outFile << '{'; | |
| for (const auto& [city, temperature] : vec) | |
| { | |
| std::string fmt = ", {}={}"; | |
| if (first) | |
| { | |
| fmt = "{}={}"; | |
| first = false; | |
| } | |
| outFile << std::vformat(fmt, std::make_format_args(city, temperature.toString())); | |
| } | |
| outFile << '}'; | |
| outFile.close(); | |
| } | |
| std::cout << timer.end() << "\n"; | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment