# Print the GPU/NIC topology matrix. Presumably consulted to decide which mlx5
# device belongs with which GPU pair — TODO confirm.
nvidia-smi topo -m
# NOTE(review): bare name with no path or arguments — looks like the title of
# the rename script recorded in this file rather than a command to run; confirm.
node17_mlx_rename.sh
# Presumably the install path of the rename script — TODO confirm:
# /usr/local/bin/rdma-rename.sh
# Presumably the systemd unit that runs the script — TODO confirm:
# /lib/systemd/system/rdma-rename.service
# Make systemd re-read its unit files.
systemctl daemon-reload
systemctl enable rdma-rename.service

#!/bin/bash
#
# Renumber this node's mlx5 RDMA devices in two phases with `rdma dev set`.
#
#   Phase 1 parks the devices under temporary tmp_* names, so that phase 2
#   never requests a name that another device still holds (that request
#   fails with "File exists").
#   Phase 2 assigns the final names: GPU-affine cards become
#   mlx5_0/2/6/8, the IP cards become mlx5_4/5.
#
# NOTE(review): the shebang above only takes effect when it is the first line
# of the deployed script file.

# 1. Park every device that takes part in the renumbering under a tmp_* name.
#    stderr is discarded on purpose: a device that does not exist, or that was
#    already renamed, is simply skipped.
#    mlx5_4 and mlx5_5 must be parked as well: phase 2 renames tmp_mlx5_4 and
#    tmp_mlx5_5, and it also hands out the names mlx5_4 and mlx5_5, so leaving
#    them in place makes four of the six renames below fail.
for idx in 0 1 2 3 4 5 6 7 8 9; do
  rdma dev set "mlx5_${idx}" name "tmp_mlx5_${idx}" 2>/dev/null
done

# 2. Rename the GPU-affine cards (GPU pairings as recorded in the notes).
# 2.1 GPU0/GPU1
rdma dev set tmp_mlx5_0 name mlx5_0
# 2.2 GPU2/GPU3
rdma dev set tmp_mlx5_1 name mlx5_2
# 2.3 GPU4/GPU5
rdma dev set tmp_mlx5_4 name mlx5_6
# 2.4 GPU6/GPU7
rdma dev set tmp_mlx5_5 name mlx5_8

# 3. Rename the IP cards.
rdma dev set tmp_mlx5_2 name mlx5_4
rdma dev set tmp_mlx5_3 name mlx5_5

# NOTE(review): tmp_mlx5_6 .. tmp_mlx5_9 receive no final name in the lines
# shown here — confirm whether they are renamed later or left as tmp_* on
# purpose.