Created
January 19, 2026 14:44
-
-
Save mzhang77/d5c964140e2fdab52b6e90f7b1495a9e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pymysql | |
| import random | |
| import time | |
| # --- 数据库配置 --- | |
| DB_CONFIG = { | |
| 'host': '127.0.0.1', # 默认为本地,如果 TiDB 在远程请修改 IP | |
| 'port': 4000, | |
| 'user': 'root', | |
| 'password': '', | |
| 'database': 'test', | |
| 'autocommit': True | |
| } | |
| # --- 数据量配置 --- | |
| TOTAL_ROWS = 1_000_000 # 100万行 | |
| BATCH_SIZE = 5000 # 每次插入5000行 | |
| def get_connection(): | |
| return pymysql.connect(**DB_CONFIG) | |
| def generate_batch(start_id, size): | |
| """生成一批数据""" | |
| batch_data = [] | |
| for i in range(size): | |
| curr_id = start_id + i | |
| # 50% 概率 a=1 (高频), 否则 2-1000 | |
| col_a = 1 if random.random() < 0.5 else random.randint(2, 1000) | |
| # 50% 概率 b=1 (高频), 否则 2-1000 | |
| col_b = 1 if random.random() < 0.5 else random.randint(2, 1000) | |
| # c 是高基数随机值 (排序键) | |
| col_c = random.randint(0, 1_000_000_000) | |
| # 填充字段 | |
| padding = 'x' * 50 | |
| batch_data.append((curr_id, col_a, col_b, col_c, padding)) | |
| return batch_data | |
| def main(): | |
| conn = get_connection() | |
| cursor = conn.cursor() | |
| print(f"Connecting to TiDB at {DB_CONFIG['host']}:{DB_CONFIG['port']}...") | |
| try: | |
| # 1. 重建表结构 | |
| print("Recreating table 't'...") | |
| cursor.execute("DROP TABLE IF EXISTS t") | |
| create_sql = """ | |
| CREATE TABLE t ( | |
| id INT PRIMARY KEY, | |
| a INT, | |
| b INT, | |
| c INT, | |
| padding VARCHAR(100) | |
| ) | |
| """ | |
| cursor.execute(create_sql) | |
| # 2. 批量插入数据 | |
| print(f"Start inserting {TOTAL_ROWS} rows...") | |
| start_time = time.time() | |
| for i in range(0, TOTAL_ROWS, BATCH_SIZE): | |
| batch_data = generate_batch(i + 1, BATCH_SIZE) | |
| insert_sql = "INSERT INTO t (id, a, b, c, padding) VALUES (%s, %s, %s, %s, %s)" | |
| cursor.executemany(insert_sql, batch_data) | |
| # 打印进度 | |
| if (i + BATCH_SIZE) % 100_000 == 0: | |
| elapsed = time.time() - start_time | |
| print(f" Inserted {i + BATCH_SIZE} rows... ({elapsed:.2f}s)") | |
| print(f"Data insertion complete. Total time: {time.time() - start_time:.2f}s") | |
| # 3. 创建索引 (在数据插入后创建通常更快) | |
| print("Creating index idx_a_c on (a, c)...") | |
| cursor.execute("CREATE INDEX idx_a_c ON t(a, c)") | |
| print("Creating index idx_b_c on (b, c)...") | |
| cursor.execute("CREATE INDEX idx_b_c ON t(b, c)") | |
| print("Indexes created.") | |
| # 4. 简单验证 | |
| cursor.execute("SELECT count(*) FROM t") | |
| count = cursor.fetchone()[0] | |
| print(f"Verified: Table 't' now has {count} rows.") | |
| # 5. 模拟一次查询统计 (可选) | |
| print("\nTest Query Preview:") | |
| query = "SELECT * FROM t WHERE (a = 1 OR b = 1) ORDER BY c LIMIT 5" | |
| print(f"Running: {query}") | |
| t0 = time.time() | |
| cursor.execute(query) | |
| rows = cursor.fetchall() | |
| t1 = time.time() | |
| for r in rows: | |
| print(f" id={r[0]}, a={r[1]}, b={r[2]}, c={r[3]}") | |
| print(f"Query executed in {t1 - t0:.4f}s (Note: First run might be cold)") | |
| except Exception as e: | |
| print(f"Error: {e}") | |
| finally: | |
| cursor.close() | |
| conn.close() | |
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment