Skip to content

Instantly share code, notes, and snippets.

@puzpuzpuz
Created March 13, 2026 14:28
Show Gist options
  • Select an option

  • Save puzpuzpuz/e5989707b264ca6b1d620907aa8f2248 to your computer and use it in GitHub Desktop.

Select an option

Save puzpuzpuz/e5989707b264ca6b1d620907aa8f2248 to your computer and use it in GitHub Desktop.
Test data distributions for HORIZON JOIN adaptive scan benchmarking
-- =============================================================================
-- Test data distributions for HORIZON JOIN adaptive scan benchmarking.
--
-- Each scenario generates fx_trades_* and market_data_* with different symbol
-- cardinality, gap sizes, and key distributions to exercise the adaptive
-- backward-to-forward scan switch.
--
-- Benchmark query template (replace suffixes):
--
-- SELECT t.symbol, h.offset, avg(m.best_bid), avg(m.best_ask), count(*)
-- FROM fx_trades_<suffix> t
-- HORIZON JOIN market_data_<suffix> m ON (t.symbol = m.symbol)
-- RANGE FROM -5s TO 5s STEP 1s AS h
-- GROUP BY t.symbol, h.offset
-- ORDER BY t.symbol, h.offset;
--
-- =============================================================================
-- =============================================================================
-- SCENARIO 1: Baseline dense (K=10, gap~10)
-- Both tables have similar density. Backward-only is trivially cheap.
-- No adaptive switch expected.
-- =============================================================================
CREATE TABLE market_data_dense (
timestamp TIMESTAMP,
symbol SYMBOL,
best_bid DOUBLE,
best_ask DOUBLE
) TIMESTAMP(timestamp) PARTITION BY HOUR;
-- 100M rows, 100us spacing, 10 symbols. Spans ~2.78 hours.
INSERT INTO market_data_dense
SELECT
timestamp_sequence('2025-02-10T08:00:00.000000', 100) timestamp,
rnd_symbol_zipf(
'EURUSD', 'USDJPY', 'GBPUSD', 'AUDUSD', 'USDCAD',
'USDCHF', 'NZDUSD', 'EURGBP', 'EURJPY', 'GBPJPY',
1.2
) symbol,
rnd_double() * 0.5 + 1.0,
rnd_double() * 0.5 + 1.0
FROM long_sequence(100_000_000);
CREATE TABLE fx_trades_dense (
timestamp TIMESTAMP_NS,
symbol SYMBOL,
side SYMBOL,
price DOUBLE,
quantity DOUBLE
) TIMESTAMP(timestamp) PARTITION BY HOUR
DEDUP UPSERT KEYS(timestamp, symbol);
-- 1M rows, 10us spacing, 15 symbols. Gap per trade ~10 market_data rows.
INSERT INTO fx_trades_dense
SELECT
timestamp_sequence('2025-02-10T08:01:00.000000000', 10_000) timestamp,
rnd_symbol_zipf(
'EURUSD', 'USDJPY', 'GBPUSD', 'AUDUSD', 'USDCAD',
'USDCHF', 'NZDUSD', 'EURGBP', 'EURJPY', 'GBPJPY',
'AUDJPY', 'EURAUD', 'EURCHF', 'GBPAUD', 'CADCHF',
1.2
) symbol,
rnd_symbol('BUY', 'SELL') side,
1.0 + rnd_double() * 0.5,
round(rnd_double() * 5_000_000 + 100_000, 2)
FROM long_sequence(1_000_000);
-- =============================================================================
-- SCENARIO 2: Sparse LHS, few symbols (K=15, gap~2000)
-- Large gap but small K. Backward-only is still efficient.
-- No adaptive switch expected.
-- =============================================================================
CREATE TABLE market_data_sparse (
timestamp TIMESTAMP,
symbol SYMBOL,
best_bid DOUBLE,
best_ask DOUBLE
) TIMESTAMP(timestamp) PARTITION BY HOUR;
-- 100M rows, 100us spacing, 10 symbols
INSERT INTO market_data_sparse
SELECT
timestamp_sequence('2025-02-10T08:00:00.000000', 100) timestamp,
rnd_symbol_zipf(
'EURUSD', 'USDJPY', 'GBPUSD', 'AUDUSD', 'USDCAD',
'USDCHF', 'NZDUSD', 'EURGBP', 'EURJPY', 'GBPJPY',
1.2
) symbol,
rnd_double() * 0.5 + 1.0,
rnd_double() * 0.5 + 1.0
FROM long_sequence(100_000_000);
CREATE TABLE fx_trades_sparse (
timestamp TIMESTAMP_NS,
symbol SYMBOL,
side SYMBOL,
price DOUBLE,
quantity DOUBLE
) TIMESTAMP(timestamp) PARTITION BY HOUR
DEDUP UPSERT KEYS(timestamp, symbol);
-- 50K rows, 200ms spacing, 15 symbols. Gap per trade ~2,000 market_data rows.
INSERT INTO fx_trades_sparse
SELECT
timestamp_sequence('2025-02-10T08:01:00.000000000', 200_000_000) timestamp,
rnd_symbol_zipf(
'EURUSD', 'USDJPY', 'GBPUSD', 'AUDUSD', 'USDCAD',
'USDCHF', 'NZDUSD', 'EURGBP', 'EURJPY', 'GBPJPY',
'AUDJPY', 'EURAUD', 'EURCHF', 'GBPAUD', 'CADCHF',
1.2
) symbol,
rnd_symbol('BUY', 'SELL') side,
1.0 + rnd_double() * 0.5,
round(rnd_double() * 5_000_000 + 100_000, 2)
FROM long_sequence(50_000);
-- =============================================================================
-- SCENARIO 3: Rare symbols (K=200 with steep Zipf, gap~2000)
-- 200 symbols with Zipf(2.0). The ~185 rarest symbols each appear only
-- hundreds of times in 100M rows, causing deep backward scans.
-- fx_trades uses flatter Zipf(0.5) to ensure rare symbols appear in trades.
-- Adaptive switch expected after deep backward scans for rare keys.
-- =============================================================================
CREATE TABLE market_data_rare (
timestamp TIMESTAMP,
symbol SYMBOL,
best_bid DOUBLE,
best_ask DOUBLE
) TIMESTAMP(timestamp) PARTITION BY HOUR;
-- 100M rows, 100us spacing, 200 symbols with steep Zipf.
-- Rarest symbols (~sym200) appear ~1,500 times in 100M rows (gap ~66,000).
INSERT INTO market_data_rare
SELECT
timestamp_sequence('2025-02-10T08:00:00.000000', 100) timestamp,
rnd_symbol_zipf(200, 2.0) symbol,
rnd_double() * 0.5 + 1.0,
rnd_double() * 0.5 + 1.0
FROM long_sequence(100_000_000);
CREATE TABLE fx_trades_rare (
timestamp TIMESTAMP_NS,
symbol SYMBOL,
side SYMBOL,
price DOUBLE,
quantity DOUBLE
) TIMESTAMP(timestamp) PARTITION BY HOUR
DEDUP UPSERT KEYS(timestamp, symbol);
-- 50K rows, 200ms spacing, 200 symbols with flat Zipf.
-- Flat Zipf(0.5) ensures rare symbols appear in trades (~50-100 times each).
INSERT INTO fx_trades_rare
SELECT
timestamp_sequence('2025-02-10T08:01:00.000000000', 200_000_000) timestamp,
rnd_symbol_zipf(200, 0.5) symbol,
rnd_symbol('BUY', 'SELL') side,
1.0 + rnd_double() * 0.5,
round(rnd_double() * 5_000_000 + 100_000, 2)
FROM long_sequence(50_000);
-- =============================================================================
-- SCENARIO 4: High cardinality equities (K=5000, gap~2000)
-- Simulates equity tick data with thousands of tickers.
-- Even without rare keys, K=5000 means each position change requires
-- scanning thousands of rows backward to populate the key map.
-- Adaptive switch expected due to Zipf tail creating deep scans.
-- =============================================================================
CREATE TABLE market_data_equity (
timestamp TIMESTAMP,
symbol SYMBOL,
best_bid DOUBLE,
best_ask DOUBLE
) TIMESTAMP(timestamp) PARTITION BY HOUR;
-- 100M rows, 100us spacing, 5000 symbols
INSERT INTO market_data_equity
SELECT
timestamp_sequence('2025-02-10T08:00:00.000000', 100) timestamp,
rnd_symbol_zipf(5_000, 1.5) symbol,
rnd_double() * 100.0 + 50.0,
rnd_double() * 100.0 + 50.0
FROM long_sequence(100_000_000);
CREATE TABLE fx_trades_equity (
timestamp TIMESTAMP_NS,
symbol SYMBOL,
side SYMBOL,
price DOUBLE,
quantity DOUBLE
) TIMESTAMP(timestamp) PARTITION BY HOUR
DEDUP UPSERT KEYS(timestamp, symbol);
-- 50K rows, 200ms spacing, 5000 symbols with flatter Zipf
INSERT INTO fx_trades_equity
SELECT
timestamp_sequence('2025-02-10T08:01:00.000000000', 200_000_000) timestamp,
rnd_symbol_zipf(5_000, 1.0) symbol,
rnd_symbol('BUY', 'SELL') side,
50.0 + rnd_double() * 100.0,
round(rnd_double() * 10_000 + 100, 2)
FROM long_sequence(50_000);
-- =============================================================================
-- SCENARIO 5: Injected rare keys (K=10 common + 5 rare, gap~2000)
-- Common FX symbols plus 5 exotic symbols planted only at the very beginning
-- of market_data. Every trade with an exotic symbol triggers a backward scan
-- across nearly the entire table.
-- Adaptive switch expected very quickly.
-- =============================================================================
CREATE TABLE market_data_injected (
timestamp TIMESTAMP,
symbol SYMBOL,
best_bid DOUBLE,
best_ask DOUBLE
) TIMESTAMP(timestamp) PARTITION BY HOUR;
-- 5 rare symbols at the very beginning of the time range (once each)
INSERT INTO market_data_injected VALUES
('2025-02-10T07:59:55.000000', 'ZARJPY', 0.5, 0.6),
('2025-02-10T07:59:56.000000', 'TRYJPY', 0.3, 0.4),
('2025-02-10T07:59:57.000000', 'MXNJPY', 0.2, 0.3),
('2025-02-10T07:59:58.000000', 'PLNEUR', 0.1, 0.2),
('2025-02-10T07:59:59.000000', 'HUFEUR', 0.05, 0.06);
-- 100M common rows at 100us spacing
INSERT INTO market_data_injected
SELECT
timestamp_sequence('2025-02-10T08:00:00.000000', 100) timestamp,
rnd_symbol_zipf(
'EURUSD', 'USDJPY', 'GBPUSD', 'AUDUSD', 'USDCAD',
'USDCHF', 'NZDUSD', 'EURGBP', 'EURJPY', 'GBPJPY',
1.2
) symbol,
rnd_double() * 0.5 + 1.0,
rnd_double() * 0.5 + 1.0
FROM long_sequence(100_000_000);
CREATE TABLE fx_trades_injected (
timestamp TIMESTAMP_NS,
symbol SYMBOL,
side SYMBOL,
price DOUBLE,
quantity DOUBLE
) TIMESTAMP(timestamp) PARTITION BY HOUR
DEDUP UPSERT KEYS(timestamp, symbol);
-- 50K rows, 200ms spacing. ~5% trades use rare symbols.
INSERT INTO fx_trades_injected
SELECT
timestamp_sequence('2025-02-10T08:01:00.000000000', 200_000_000) timestamp,
CASE
WHEN x % 20 = 0 THEN rnd_symbol('ZARJPY', 'TRYJPY', 'MXNJPY', 'PLNEUR', 'HUFEUR')
ELSE rnd_symbol_zipf(
'EURUSD', 'USDJPY', 'GBPUSD', 'AUDUSD', 'USDCAD',
'USDCHF', 'NZDUSD', 'EURGBP', 'EURJPY', 'GBPJPY',
1.2
)
END symbol,
rnd_symbol('BUY', 'SELL') side,
1.0 + rnd_double() * 0.5,
round(rnd_double() * 5_000_000 + 100_000, 2)
FROM long_sequence(50_000);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment