Skip to content

Instantly share code, notes, and snippets.

@mzhang77
Created January 19, 2026 14:44
Show Gist options
  • Select an option

  • Save mzhang77/d5c964140e2fdab52b6e90f7b1495a9e to your computer and use it in GitHub Desktop.

Select an option

Save mzhang77/d5c964140e2fdab52b6e90f7b1495a9e to your computer and use it in GitHub Desktop.
import pymysql
import random
import time
# --- 数据库配置 ---
DB_CONFIG = {
'host': '127.0.0.1', # 默认为本地,如果 TiDB 在远程请修改 IP
'port': 4000,
'user': 'root',
'password': '',
'database': 'test',
'autocommit': True
}
# --- 数据量配置 ---
TOTAL_ROWS = 1_000_000 # 100万行
BATCH_SIZE = 5000 # 每次插入5000行
def get_connection():
return pymysql.connect(**DB_CONFIG)
def generate_batch(start_id, size):
"""生成一批数据"""
batch_data = []
for i in range(size):
curr_id = start_id + i
# 50% 概率 a=1 (高频), 否则 2-1000
col_a = 1 if random.random() < 0.5 else random.randint(2, 1000)
# 50% 概率 b=1 (高频), 否则 2-1000
col_b = 1 if random.random() < 0.5 else random.randint(2, 1000)
# c 是高基数随机值 (排序键)
col_c = random.randint(0, 1_000_000_000)
# 填充字段
padding = 'x' * 50
batch_data.append((curr_id, col_a, col_b, col_c, padding))
return batch_data
def main():
conn = get_connection()
cursor = conn.cursor()
print(f"Connecting to TiDB at {DB_CONFIG['host']}:{DB_CONFIG['port']}...")
try:
# 1. 重建表结构
print("Recreating table 't'...")
cursor.execute("DROP TABLE IF EXISTS t")
create_sql = """
CREATE TABLE t (
id INT PRIMARY KEY,
a INT,
b INT,
c INT,
padding VARCHAR(100)
)
"""
cursor.execute(create_sql)
# 2. 批量插入数据
print(f"Start inserting {TOTAL_ROWS} rows...")
start_time = time.time()
for i in range(0, TOTAL_ROWS, BATCH_SIZE):
batch_data = generate_batch(i + 1, BATCH_SIZE)
insert_sql = "INSERT INTO t (id, a, b, c, padding) VALUES (%s, %s, %s, %s, %s)"
cursor.executemany(insert_sql, batch_data)
# 打印进度
if (i + BATCH_SIZE) % 100_000 == 0:
elapsed = time.time() - start_time
print(f" Inserted {i + BATCH_SIZE} rows... ({elapsed:.2f}s)")
print(f"Data insertion complete. Total time: {time.time() - start_time:.2f}s")
# 3. 创建索引 (在数据插入后创建通常更快)
print("Creating index idx_a_c on (a, c)...")
cursor.execute("CREATE INDEX idx_a_c ON t(a, c)")
print("Creating index idx_b_c on (b, c)...")
cursor.execute("CREATE INDEX idx_b_c ON t(b, c)")
print("Indexes created.")
# 4. 简单验证
cursor.execute("SELECT count(*) FROM t")
count = cursor.fetchone()[0]
print(f"Verified: Table 't' now has {count} rows.")
# 5. 模拟一次查询统计 (可选)
print("\nTest Query Preview:")
query = "SELECT * FROM t WHERE (a = 1 OR b = 1) ORDER BY c LIMIT 5"
print(f"Running: {query}")
t0 = time.time()
cursor.execute(query)
rows = cursor.fetchall()
t1 = time.time()
for r in rows:
print(f" id={r[0]}, a={r[1]}, b={r[2]}, c={r[3]}")
print(f"Query executed in {t1 - t0:.4f}s (Note: First run might be cold)")
except Exception as e:
print(f"Error: {e}")
finally:
cursor.close()
conn.close()
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment