Created
March 13, 2026 14:28
-
-
Save puzpuzpuz/e5989707b264ca6b1d620907aa8f2248 to your computer and use it in GitHub Desktop.
Test data distributions for HORIZON JOIN adaptive scan benchmarking
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| -- ============================================================================= | |
| -- Test data distributions for HORIZON JOIN adaptive scan benchmarking. | |
| -- | |
| -- Each scenario generates fx_trades_* and market_data_* with different symbol | |
| -- cardinality, gap sizes, and key distributions to exercise the adaptive | |
| -- backward-to-forward scan switch. | |
| -- | |
| -- Benchmark query template (replace suffixes): | |
| -- | |
| -- SELECT t.symbol, h.offset, avg(m.best_bid), avg(m.best_ask), count(*) | |
| -- FROM fx_trades_<suffix> t | |
| -- HORIZON JOIN market_data_<suffix> m ON (t.symbol = m.symbol) | |
| -- RANGE FROM -5s TO 5s STEP 1s AS h | |
| -- GROUP BY t.symbol, h.offset | |
| -- ORDER BY t.symbol, h.offset; | |
| -- | |
| -- ============================================================================= | |
| -- ============================================================================= | |
| -- SCENARIO 1: Baseline dense (K=10, gap~10) | |
| -- Both tables have similar density. Backward-only is trivially cheap. | |
| -- No adaptive switch expected. | |
| -- ============================================================================= | |
| CREATE TABLE market_data_dense ( | |
| timestamp TIMESTAMP, | |
| symbol SYMBOL, | |
| best_bid DOUBLE, | |
| best_ask DOUBLE | |
| ) TIMESTAMP(timestamp) PARTITION BY HOUR; | |
| -- 100M rows, 100us spacing, 10 symbols. Spans ~2.78 hours. | |
| INSERT INTO market_data_dense | |
| SELECT | |
| timestamp_sequence('2025-02-10T08:00:00.000000', 100) timestamp, | |
| rnd_symbol_zipf( | |
| 'EURUSD', 'USDJPY', 'GBPUSD', 'AUDUSD', 'USDCAD', | |
| 'USDCHF', 'NZDUSD', 'EURGBP', 'EURJPY', 'GBPJPY', | |
| 1.2 | |
| ) symbol, | |
| rnd_double() * 0.5 + 1.0, | |
| rnd_double() * 0.5 + 1.0 | |
| FROM long_sequence(100_000_000); | |
| CREATE TABLE fx_trades_dense ( | |
| timestamp TIMESTAMP_NS, | |
| symbol SYMBOL, | |
| side SYMBOL, | |
| price DOUBLE, | |
| quantity DOUBLE | |
| ) TIMESTAMP(timestamp) PARTITION BY HOUR | |
| DEDUP UPSERT KEYS(timestamp, symbol); | |
| -- 1M rows, 10us spacing, 15 symbols. Gap per trade ~10 market_data rows. | |
| INSERT INTO fx_trades_dense | |
| SELECT | |
| timestamp_sequence('2025-02-10T08:01:00.000000000', 10_000) timestamp, | |
| rnd_symbol_zipf( | |
| 'EURUSD', 'USDJPY', 'GBPUSD', 'AUDUSD', 'USDCAD', | |
| 'USDCHF', 'NZDUSD', 'EURGBP', 'EURJPY', 'GBPJPY', | |
| 'AUDJPY', 'EURAUD', 'EURCHF', 'GBPAUD', 'CADCHF', | |
| 1.2 | |
| ) symbol, | |
| rnd_symbol('BUY', 'SELL') side, | |
| 1.0 + rnd_double() * 0.5, | |
| round(rnd_double() * 5_000_000 + 100_000, 2) | |
| FROM long_sequence(1_000_000); | |
| -- ============================================================================= | |
| -- SCENARIO 2: Sparse LHS, few symbols (K=15, gap~2000) | |
| -- Large gap but small K. Backward-only is still efficient. | |
| -- No adaptive switch expected. | |
| -- ============================================================================= | |
| CREATE TABLE market_data_sparse ( | |
| timestamp TIMESTAMP, | |
| symbol SYMBOL, | |
| best_bid DOUBLE, | |
| best_ask DOUBLE | |
| ) TIMESTAMP(timestamp) PARTITION BY HOUR; | |
| -- 100M rows, 100us spacing, 10 symbols | |
| INSERT INTO market_data_sparse | |
| SELECT | |
| timestamp_sequence('2025-02-10T08:00:00.000000', 100) timestamp, | |
| rnd_symbol_zipf( | |
| 'EURUSD', 'USDJPY', 'GBPUSD', 'AUDUSD', 'USDCAD', | |
| 'USDCHF', 'NZDUSD', 'EURGBP', 'EURJPY', 'GBPJPY', | |
| 1.2 | |
| ) symbol, | |
| rnd_double() * 0.5 + 1.0, | |
| rnd_double() * 0.5 + 1.0 | |
| FROM long_sequence(100_000_000); | |
| CREATE TABLE fx_trades_sparse ( | |
| timestamp TIMESTAMP_NS, | |
| symbol SYMBOL, | |
| side SYMBOL, | |
| price DOUBLE, | |
| quantity DOUBLE | |
| ) TIMESTAMP(timestamp) PARTITION BY HOUR | |
| DEDUP UPSERT KEYS(timestamp, symbol); | |
| -- 50K rows, 200ms spacing, 15 symbols. Gap per trade ~2,000 market_data rows. | |
| INSERT INTO fx_trades_sparse | |
| SELECT | |
| timestamp_sequence('2025-02-10T08:01:00.000000000', 200_000_000) timestamp, | |
| rnd_symbol_zipf( | |
| 'EURUSD', 'USDJPY', 'GBPUSD', 'AUDUSD', 'USDCAD', | |
| 'USDCHF', 'NZDUSD', 'EURGBP', 'EURJPY', 'GBPJPY', | |
| 'AUDJPY', 'EURAUD', 'EURCHF', 'GBPAUD', 'CADCHF', | |
| 1.2 | |
| ) symbol, | |
| rnd_symbol('BUY', 'SELL') side, | |
| 1.0 + rnd_double() * 0.5, | |
| round(rnd_double() * 5_000_000 + 100_000, 2) | |
| FROM long_sequence(50_000); | |
| -- ============================================================================= | |
| -- SCENARIO 3: Rare symbols (K=200 with steep Zipf, gap~2000) | |
| -- 200 symbols with Zipf(2.0). The ~185 rarest symbols each appear only | |
| -- hundreds of times in 100M rows, causing deep backward scans. | |
| -- fx_trades uses flatter Zipf(0.5) to ensure rare symbols appear in trades. | |
| -- Adaptive switch expected after deep backward scans for rare keys. | |
| -- ============================================================================= | |
| CREATE TABLE market_data_rare ( | |
| timestamp TIMESTAMP, | |
| symbol SYMBOL, | |
| best_bid DOUBLE, | |
| best_ask DOUBLE | |
| ) TIMESTAMP(timestamp) PARTITION BY HOUR; | |
| -- 100M rows, 100us spacing, 200 symbols with steep Zipf. | |
| -- Rarest symbols (~sym200) appear ~1,500 times in 100M rows (gap ~66,000). | |
| INSERT INTO market_data_rare | |
| SELECT | |
| timestamp_sequence('2025-02-10T08:00:00.000000', 100) timestamp, | |
| rnd_symbol_zipf(200, 2.0) symbol, | |
| rnd_double() * 0.5 + 1.0, | |
| rnd_double() * 0.5 + 1.0 | |
| FROM long_sequence(100_000_000); | |
| CREATE TABLE fx_trades_rare ( | |
| timestamp TIMESTAMP_NS, | |
| symbol SYMBOL, | |
| side SYMBOL, | |
| price DOUBLE, | |
| quantity DOUBLE | |
| ) TIMESTAMP(timestamp) PARTITION BY HOUR | |
| DEDUP UPSERT KEYS(timestamp, symbol); | |
| -- 50K rows, 200ms spacing, 200 symbols with flat Zipf. | |
| -- Flat Zipf(0.5) ensures rare symbols appear in trades (~50-100 times each). | |
| INSERT INTO fx_trades_rare | |
| SELECT | |
| timestamp_sequence('2025-02-10T08:01:00.000000000', 200_000_000) timestamp, | |
| rnd_symbol_zipf(200, 0.5) symbol, | |
| rnd_symbol('BUY', 'SELL') side, | |
| 1.0 + rnd_double() * 0.5, | |
| round(rnd_double() * 5_000_000 + 100_000, 2) | |
| FROM long_sequence(50_000); | |
| -- ============================================================================= | |
| -- SCENARIO 4: High cardinality equities (K=5000, gap~2000) | |
| -- Simulates equity tick data with thousands of tickers. | |
| -- Even without rare keys, K=5000 means each position change requires | |
| -- scanning thousands of rows backward to populate the key map. | |
| -- Adaptive switch expected due to Zipf tail creating deep scans. | |
| -- ============================================================================= | |
| CREATE TABLE market_data_equity ( | |
| timestamp TIMESTAMP, | |
| symbol SYMBOL, | |
| best_bid DOUBLE, | |
| best_ask DOUBLE | |
| ) TIMESTAMP(timestamp) PARTITION BY HOUR; | |
| -- 100M rows, 100us spacing, 5000 symbols | |
| INSERT INTO market_data_equity | |
| SELECT | |
| timestamp_sequence('2025-02-10T08:00:00.000000', 100) timestamp, | |
| rnd_symbol_zipf(5_000, 1.5) symbol, | |
| rnd_double() * 100.0 + 50.0, | |
| rnd_double() * 100.0 + 50.0 | |
| FROM long_sequence(100_000_000); | |
| CREATE TABLE fx_trades_equity ( | |
| timestamp TIMESTAMP_NS, | |
| symbol SYMBOL, | |
| side SYMBOL, | |
| price DOUBLE, | |
| quantity DOUBLE | |
| ) TIMESTAMP(timestamp) PARTITION BY HOUR | |
| DEDUP UPSERT KEYS(timestamp, symbol); | |
| -- 50K rows, 200ms spacing, 5000 symbols with flatter Zipf | |
| INSERT INTO fx_trades_equity | |
| SELECT | |
| timestamp_sequence('2025-02-10T08:01:00.000000000', 200_000_000) timestamp, | |
| rnd_symbol_zipf(5_000, 1.0) symbol, | |
| rnd_symbol('BUY', 'SELL') side, | |
| 50.0 + rnd_double() * 100.0, | |
| round(rnd_double() * 10_000 + 100, 2) | |
| FROM long_sequence(50_000); | |
| -- ============================================================================= | |
| -- SCENARIO 5: Injected rare keys (K=10 common + 5 rare, gap~2000) | |
| -- Common FX symbols plus 5 exotic symbols planted only at the very beginning | |
| -- of market_data. Every trade with an exotic symbol triggers a backward scan | |
| -- across nearly the entire table. | |
| -- Adaptive switch expected very quickly. | |
| -- ============================================================================= | |
| CREATE TABLE market_data_injected ( | |
| timestamp TIMESTAMP, | |
| symbol SYMBOL, | |
| best_bid DOUBLE, | |
| best_ask DOUBLE | |
| ) TIMESTAMP(timestamp) PARTITION BY HOUR; | |
| -- 5 rare symbols at the very beginning of the time range (once each) | |
| INSERT INTO market_data_injected VALUES | |
| ('2025-02-10T07:59:55.000000', 'ZARJPY', 0.5, 0.6), | |
| ('2025-02-10T07:59:56.000000', 'TRYJPY', 0.3, 0.4), | |
| ('2025-02-10T07:59:57.000000', 'MXNJPY', 0.2, 0.3), | |
| ('2025-02-10T07:59:58.000000', 'PLNEUR', 0.1, 0.2), | |
| ('2025-02-10T07:59:59.000000', 'HUFEUR', 0.05, 0.06); | |
| -- 100M common rows at 100us spacing | |
| INSERT INTO market_data_injected | |
| SELECT | |
| timestamp_sequence('2025-02-10T08:00:00.000000', 100) timestamp, | |
| rnd_symbol_zipf( | |
| 'EURUSD', 'USDJPY', 'GBPUSD', 'AUDUSD', 'USDCAD', | |
| 'USDCHF', 'NZDUSD', 'EURGBP', 'EURJPY', 'GBPJPY', | |
| 1.2 | |
| ) symbol, | |
| rnd_double() * 0.5 + 1.0, | |
| rnd_double() * 0.5 + 1.0 | |
| FROM long_sequence(100_000_000); | |
| CREATE TABLE fx_trades_injected ( | |
| timestamp TIMESTAMP_NS, | |
| symbol SYMBOL, | |
| side SYMBOL, | |
| price DOUBLE, | |
| quantity DOUBLE | |
| ) TIMESTAMP(timestamp) PARTITION BY HOUR | |
| DEDUP UPSERT KEYS(timestamp, symbol); | |
| -- 50K rows, 200ms spacing. ~5% trades use rare symbols. | |
| INSERT INTO fx_trades_injected | |
| SELECT | |
| timestamp_sequence('2025-02-10T08:01:00.000000000', 200_000_000) timestamp, | |
| CASE | |
| WHEN x % 20 = 0 THEN rnd_symbol('ZARJPY', 'TRYJPY', 'MXNJPY', 'PLNEUR', 'HUFEUR') | |
| ELSE rnd_symbol_zipf( | |
| 'EURUSD', 'USDJPY', 'GBPUSD', 'AUDUSD', 'USDCAD', | |
| 'USDCHF', 'NZDUSD', 'EURGBP', 'EURJPY', 'GBPJPY', | |
| 1.2 | |
| ) | |
| END symbol, | |
| rnd_symbol('BUY', 'SELL') side, | |
| 1.0 + rnd_double() * 0.5, | |
| round(rnd_double() * 5_000_000 + 100_000, 2) | |
| FROM long_sequence(50_000); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment