RADIUS(Remote Authentication Dial-In User Service)是做拨号用户接入认证和服务请求认证的网络协议。RADIUS提供中心式认证、鉴权和计费(AAA)能力,用于管理接入用户使用网络资源。RADIUS允许使用集中式的数据库来保存所有用户的配置信息,以供所有用户共享使用。
# yum install -y freeradius freeradius-utils freeradius-mysql
# systemctl start radiusd.service
# systemctl enable radiusd.service
# mysql -u root -p radius < /etc/raddb/mods-config/sql/main/mysql/schema.sql
# vim /etc/raddb/mods-available/sql
# cat /etc/raddb/mods-available/sql | grep -v ^#
sql {
# The sub-module to use to execute queries. This should match
# the database you're attempting to connect to.
#
# * rlm_sql_mysql
# * rlm_sql_mssql
# * rlm_sql_oracle
# * rlm_sql_postgresql
# * rlm_sql_sqlite
# * rlm_sql_null (log queries to disk)
#
driver = "rlm_sql_mysql"
# The dialect of SQL you want to use, this should usually match
# the driver you selected above.
#
# If you're using rlm_sql_null, then it should be the type of
# database the logged queries are going to be executed against.
dialect = "mysql"
# Connection info:
#
server = "localhost"
port = 3306
login = "radius"
password = "radius"
# Database table configuration for everything except Oracle
radius_db = "radius"
# chgrp -h radiusd /etc/raddb/mods-available/sql
# systemctl restart radiusd.service
## NIC map
bm-2204kzq:252982:252982 [*] NCCL INFO NCCL_SOCKET_IFNAME set by environment to bond0
bm-2204kzq:252982:252982 [*] NCCL INFO Bootstrap : Using bond0:172.17.0.215<0>
bm-2204kzq:252982:252982 [*] NCCL INFO NCCL version 2.22.3+cuda12.6
bm-2204kzq:252985:253055 [*] NCCL INFO NET/IB : Using [0]mlx5_2:1/RoCE [1]mlx5_3:1/RoCE [2]mlx5_4:1/RoCE [3]mlx5_0:1/RoCE [RO]; OOB bond0:172.17.0.215<0>
bm-2204qhn:253837:253837 [*] NCCL INFO NCCL_SOCKET_IFNAME set by environment to bond0
bm-2204qhn:253837:253837 [*] NCCL INFO Bootstrap : Using bond0:172.17.0.81<0>
bm-2204qhn:253837:253837 [*] NCCL INFO NCCL version 2.22.3+cuda12.6
bm-2204qhn:253840:253908 [*] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_2:1/RoCE [2]mlx5_3:1/RoCE [3]mlx5_4:1/RoCE [RO]; OOB bond0:172.17.0.81<0>
### ChannelNum:16
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 00/16 : 0 7 5 6 4 3 1 2 8 15 13 14 12 11 9 10
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 01/16 : 0 7 5 6 4 3 1 10 8 15 13 14 12 11 9 2
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 02/16 : 0 7 5 6 12 11 9 10 8 15 13 14 4 3 1 2
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 03/16 : 0 7 5 14 12 11 9 10 8 15 13 6 4 3 1 2
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 04/16 : 0 7 5 6 4 3 1 2 8 15 13 14 12 11 9 10
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 05/16 : 0 7 5 6 4 3 1 10 8 15 13 14 12 11 9 2
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 06/16 : 0 7 5 6 12 11 9 10 8 15 13 14 4 3 1 2
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 07/16 : 0 7 5 14 12 11 9 10 8 15 13 6 4 3 1 2
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 08/16 : 0 7 5 6 4 3 1 2 8 15 13 14 12 11 9 10
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 09/16 : 0 7 5 6 4 3 1 10 8 15 13 14 12 11 9 2
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 10/16 : 0 7 5 6 12 11 9 10 8 15 13 14 4 3 1 2
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 11/16 : 0 7 5 14 12 11 9 10 8 15 13 6 4 3 1 2
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 12/16 : 0 7 5 6 4 3 1 2 8 15 13 14 12 11 9 10
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 13/16 : 0 7 5 6 4 3 1 10 8 15 13 14 12 11 9 2
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 14/16 : 0 7 5 6 12 11 9 10 8 15 13 14 4 3 1 2
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 15/16 : 0 7 5 14 12 11 9 10 8 15 13 6 4 3 1 2
## Channel C0
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 00/16 : 0 7 5 6 4 3 1 2 8 15 13 14 12 11 9 10
bm-2204kzq:
[0]mlx5_2:1/RoCE
[1]mlx5_3:1/RoCE
[2]mlx5_4:1/RoCE
[3]mlx5_0:1/RoCE
bm-2204qhn:
[0]mlx5_0:1/RoCE
[1]mlx5_2:1/RoCE
[2]mlx5_3:1/RoCE
[3]mlx5_4:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 00/0 : 0[0] -> 7[7] via P2P/CUMEM
bm-2204kzq:252985:253113 [7] NCCL INFO Channel 00/0 : 7[7] -> 5[5] via P2P/CUMEM
bm-2204kzq:252983:253114 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/CUMEM
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 00/0 : 6[6] -> 4[4] via P2P/CUMEM
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 00/0 : 4[4] -> 3[3] via P2P/CUMEM
bm-2204kzq:252981:253116 [3] NCCL INFO Channel 00/0 : 3[3] -> 1[1] via P2P/CUMEM
bm-2204kzq:252979:253119 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 00/0 : 2[2] -> 8[0] [send] via NET/IB/3(0)/GDRDMA mlx5_0:1/RoCE
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 00/0 : 2[2] -> 8[0] [receive] via NET/IB/0/GDRDMA mlx5_0:1/RoCE
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 00/0 : 8[0] -> 15[7] via P2P/CUMEM
bm-2204qhn:253841:253968 [7] NCCL INFO Channel 00/0 : 15[7] -> 13[5] via P2P/CUMEM
bm-2204qhn:253839:253969 [5] NCCL INFO Channel 00/0 : 13[5] -> 14[6] via P2P/CUMEM
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 00/0 : 14[6] -> 12[4] via P2P/CUMEM
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 00/0 : 12[4] -> 11[3] via P2P/CUMEM
bm-2204qhn:253837:253967 [3] NCCL INFO Channel 00/0 : 11[3] -> 9[1] via P2P/CUMEM
bm-2204qhn:253835:253971 [1] NCCL INFO Channel 00/0 : 9[1] -> 10[2] via P2P/CUMEM
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 00/0 : 10[2] -> 0[0] [send] via NET/IB/0(8)/GDRDMA mlx5_0:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 00/0 : 10[2] -> 0[0] [receive] via NET/IB/3/GDRDMA mlx5_0:1/RoCE
## Channel C1
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 01/16 : 0 7 5 6 4 3 1 10 8 15 13 14 12 11 9 2
bm-2204kzq:
[0]mlx5_2:1/RoCE
[1]mlx5_3:1/RoCE
[2]mlx5_4:1/RoCE
[3]mlx5_0:1/RoCE
bm-2204qhn:
[0]mlx5_0:1/RoCE
[1]mlx5_2:1/RoCE
[2]mlx5_3:1/RoCE
[3]mlx5_4:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 01/0 : 0[0] -> 7[7] via P2P/CUMEM
bm-2204kzq:252985:253113 [7] NCCL INFO Channel 01/0 : 7[7] -> 5[5] via P2P/CUMEM
bm-2204kzq:252983:253114 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/CUMEM
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 01/0 : 6[6] -> 4[4] via P2P/CUMEM
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 01/0 : 4[4] -> 3[3] via P2P/CUMEM
bm-2204kzq:252981:253116 [3] NCCL INFO Channel 01/0 : 3[3] -> 1[1] via P2P/CUMEM
bm-2204kzq:252979:253119 [1] NCCL INFO Channel 01/0 : 1[1] -> 10[2] [send] via NET/IB/0(2)/GDRDMA mlx5_2:1/RoCE
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 01/0 : 1[1] -> 10[2] [receive] via NET/IB/1/GDRDMA mlx5_2:1/RoCE
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 01/0 : 10[2] -> 8[0] via P2P/CUMEM
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 01/0 : 8[0] -> 15[7] via P2P/CUMEM
bm-2204qhn:253841:253968 [7] NCCL INFO Channel 01/0 : 15[7] -> 13[5] via P2P/CUMEM
bm-2204qhn:253839:253969 [5] NCCL INFO Channel 01/0 : 13[5] -> 14[6] via P2P/CUMEM
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 01/0 : 14[6] -> 12[4] via P2P/CUMEM
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 01/0 : 12[4] -> 11[3] via P2P/CUMEM
bm-2204qhn:253837:253967 [3] NCCL INFO Channel 01/0 : 11[3] -> 9[1] via P2P/CUMEM
bm-2204qhn:253835:253971 [1] NCCL INFO Channel 01/0 : 9[1] -> 2[2] [send] via NET/IB/1(10)/GDRDMA mlx5_2:1/RoCE
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 01/0 : 9[1] -> 2[2] [receive] via NET/IB/0/GDRDMA mlx5_2:1/RoCE
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 01/0 : 2[2] -> 0[0] via P2P/CUMEM
## Channel C2
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 02/16 : 0 7 5 6 12 11 9 10 8 15 13 14 4 3 1 2
bm-2204kzq:
[0]mlx5_2:1/RoCE
[1]mlx5_3:1/RoCE
[2]mlx5_4:1/RoCE
[3]mlx5_0:1/RoCE
bm-2204qhn:
[0]mlx5_0:1/RoCE
[1]mlx5_2:1/RoCE
[2]mlx5_3:1/RoCE
[3]mlx5_4:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 02/0 : 0[0] -> 7[7] via P2P/CUMEM
bm-2204kzq:252985:253113 [7] NCCL INFO Channel 02/0 : 7[7] -> 5[5] via P2P/CUMEM
bm-2204kzq:252983:253114 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/CUMEM
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 02/0 : 6[6] -> 12[4] [send] via NET/IB/1(4)/GDRDMA mlx5_3:1/RoCE
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 02/0 : 6[6] -> 12[4] [receive] via NET/IB/2/GDRDMA mlx5_3:1/RoCE
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 02/0 : 12[4] -> 11[3] via P2P/CUMEM
bm-2204qhn:253837:253967 [3] NCCL INFO Channel 02/0 : 11[3] -> 9[1] via P2P/CUMEM
bm-2204qhn:253835:253971 [1] NCCL INFO Channel 02/0 : 9[1] -> 10[2] via P2P/CUMEM
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 02/0 : 10[2] -> 8[0] via P2P/CUMEM
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 02/0 : 8[0] -> 15[7] via P2P/CUMEM
bm-2204qhn:253841:253968 [7] NCCL INFO Channel 02/0 : 15[7] -> 13[5] via P2P/CUMEM
bm-2204qhn:253839:253969 [5] NCCL INFO Channel 02/0 : 13[5] -> 14[6] via P2P/CUMEM
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 02/0 : 14[6] -> 4[4] [send] via NET/IB/2(12)/GDRDMA mlx5_3:1/RoCE
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 02/0 : 14[6] -> 4[4] [receive] via NET/IB/1/GDRDMA mlx5_3:1/RoCE
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 02/0 : 4[4] -> 3[3] via P2P/CUMEM
bm-2204kzq:252981:253116 [3] NCCL INFO Channel 02/0 : 3[3] -> 1[1] via P2P/CUMEM
bm-2204kzq:252979:253119 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 02/0 : 2[2] -> 0[0] via P2P/CUMEM
## Channel C3
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 03/16 : 0 7 5 14 12 11 9 10 8 15 13 6 4 3 1 2
bm-2204kzq:
[0]mlx5_2:1/RoCE
[1]mlx5_3:1/RoCE
[2]mlx5_4:1/RoCE
[3]mlx5_0:1/RoCE
bm-2204qhn:
[0]mlx5_0:1/RoCE
[1]mlx5_2:1/RoCE
[2]mlx5_3:1/RoCE
[3]mlx5_4:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 03/0 : 0[0] -> 7[7] via P2P/CUMEM
bm-2204kzq:252985:253113 [7] NCCL INFO Channel 03/0 : 7[7] -> 5[5] via P2P/CUMEM
bm-2204kzq:252983:253114 [5] NCCL INFO Channel 03/0 : 5[5] -> 14[6] [send] via NET/IB/2(6)/GDRDMA mlx5_4:1/RoCE
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 03/0 : 5[5] -> 14[6] [receive] via NET/IB/3/GDRDMA mlx5_4:1/RoCE
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 03/0 : 14[6] -> 12[4] via P2P/CUMEM
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 03/0 : 12[4] -> 11[3] via P2P/CUMEM
bm-2204qhn:253837:253967 [3] NCCL INFO Channel 03/0 : 11[3] -> 9[1] via P2P/CUMEM
bm-2204qhn:253835:253971 [1] NCCL INFO Channel 03/0 : 9[1] -> 10[2] via P2P/CUMEM
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 03/0 : 10[2] -> 8[0] via P2P/CUMEM
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 03/0 : 8[0] -> 15[7] via P2P/CUMEM
bm-2204qhn:253841:253968 [7] NCCL INFO Channel 03/0 : 15[7] -> 13[5] via P2P/CUMEM
bm-2204qhn:253839:253969 [5] NCCL INFO Channel 03/0 : 13[5] -> 6[6] [send] via NET/IB/3(14)/GDRDMA mlx5_4:1/RoCE
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 03/0 : 13[5] -> 6[6] [receive] via NET/IB/2/GDRDMA mlx5_4:1/RoCE
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 03/0 : 6[6] -> 4[4] via P2P/CUMEM
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 03/0 : 4[4] -> 3[3] via P2P/CUMEM
bm-2204kzq:252981:253116 [3] NCCL INFO Channel 03/0 : 3[3] -> 1[1] via P2P/CUMEM
bm-2204kzq:252979:253119 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 03/0 : 2[2] -> 0[0] via P2P/CUMEM
## Channel C4
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 04/16 : 0 7 5 6 4 3 1 2 8 15 13 14 12 11 9 10
bm-2204kzq:
[0]mlx5_2:1/RoCE
[1]mlx5_3:1/RoCE
[2]mlx5_4:1/RoCE
[3]mlx5_0:1/RoCE
bm-2204qhn:
[0]mlx5_0:1/RoCE
[1]mlx5_2:1/RoCE
[2]mlx5_3:1/RoCE
[3]mlx5_4:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 04/0 : 0[0] -> 7[7] via P2P/CUMEM
bm-2204kzq:252985:253113 [7] NCCL INFO Channel 04/0 : 7[7] -> 5[5] via P2P/CUMEM
bm-2204kzq:252983:253114 [5] NCCL INFO Channel 04/0 : 5[5] -> 6[6] via P2P/CUMEM
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 04/0 : 6[6] -> 4[4] via P2P/CUMEM
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 04/0 : 4[4] -> 3[3] via P2P/CUMEM
bm-2204kzq:252981:253116 [3] NCCL INFO Channel 04/0 : 3[3] -> 1[1] via P2P/CUMEM
bm-2204kzq:252979:253119 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 04/0 : 2[2] -> 8[0] [send] via NET/IB/3(0)/GDRDMA mlx5_0:1/RoCE
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 04/0 : 2[2] -> 8[0] [receive] via NET/IB/0/GDRDMA mlx5_0:1/RoCE
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 04/0 : 8[0] -> 15[7] via P2P/CUMEM
bm-2204qhn:253841:253968 [7] NCCL INFO Channel 04/0 : 15[7] -> 13[5] via P2P/CUMEM
bm-2204qhn:253839:253969 [5] NCCL INFO Channel 04/0 : 13[5] -> 14[6] via P2P/CUMEM
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 04/0 : 14[6] -> 12[4] via P2P/CUMEM
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 04/0 : 12[4] -> 11[3] via P2P/CUMEM
bm-2204qhn:253837:253967 [3] NCCL INFO Channel 04/0 : 11[3] -> 9[1] via P2P/CUMEM
bm-2204qhn:253835:253971 [1] NCCL INFO Channel 04/0 : 9[1] -> 10[2] via P2P/CUMEM
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 04/0 : 10[2] -> 0[0] [send] via NET/IB/0(8)/GDRDMA mlx5_0:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 04/0 : 10[2] -> 0[0] [receive] via NET/IB/3/GDRDMA mlx5_0:1/RoCE
## Channel C5
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 05/16 : 0 7 5 6 4 3 1 10 8 15 13 14 12 11 9 2
bm-2204kzq:
[0]mlx5_2:1/RoCE
[1]mlx5_3:1/RoCE
[2]mlx5_4:1/RoCE
[3]mlx5_0:1/RoCE
bm-2204qhn:
[0]mlx5_0:1/RoCE
[1]mlx5_2:1/RoCE
[2]mlx5_3:1/RoCE
[3]mlx5_4:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 05/0 : 0[0] -> 7[7] via P2P/CUMEM
bm-2204kzq:252985:253113 [7] NCCL INFO Channel 05/0 : 7[7] -> 5[5] via P2P/CUMEM
bm-2204kzq:252983:253114 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/CUMEM
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 05/0 : 6[6] -> 4[4] via P2P/CUMEM
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 05/0 : 4[4] -> 3[3] via P2P/CUMEM
bm-2204kzq:252981:253116 [3] NCCL INFO Channel 05/0 : 3[3] -> 1[1] via P2P/CUMEM
bm-2204kzq:252979:253119 [1] NCCL INFO Channel 05/0 : 1[1] -> 10[2] [send] via NET/IB/0(2)/GDRDMA mlx5_2:1/RoCE
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 05/0 : 1[1] -> 10[2] [receive] via NET/IB/1/GDRDMA mlx5_2:1/RoCE
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 05/0 : 10[2] -> 8[0] via P2P/CUMEM
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 05/0 : 8[0] -> 15[7] via P2P/CUMEM
bm-2204qhn:253841:253968 [7] NCCL INFO Channel 05/0 : 15[7] -> 13[5] via P2P/CUMEM
bm-2204qhn:253839:253969 [5] NCCL INFO Channel 05/0 : 13[5] -> 14[6] via P2P/CUMEM
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 05/0 : 14[6] -> 12[4] via P2P/CUMEM
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 05/0 : 12[4] -> 11[3] via P2P/CUMEM
bm-2204qhn:253837:253967 [3] NCCL INFO Channel 05/0 : 11[3] -> 9[1] via P2P/CUMEM
bm-2204qhn:253835:253971 [1] NCCL INFO Channel 05/0 : 9[1] -> 2[2] [send] via NET/IB/1(10)/GDRDMA mlx5_2:1/RoCE
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 05/0 : 9[1] -> 2[2] [receive] via NET/IB/0/GDRDMA mlx5_2:1/RoCE
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 05/0 : 2[2] -> 0[0] via P2P/CUMEM
## Channel C6
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 06/16 : 0 7 5 6 12 11 9 10 8 15 13 14 4 3 1 2
bm-2204kzq:
[0]mlx5_2:1/RoCE
[1]mlx5_3:1/RoCE
[2]mlx5_4:1/RoCE
[3]mlx5_0:1/RoCE
bm-2204qhn:
[0]mlx5_0:1/RoCE
[1]mlx5_2:1/RoCE
[2]mlx5_3:1/RoCE
[3]mlx5_4:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 06/0 : 0[0] -> 7[7] via P2P/CUMEM
bm-2204kzq:252985:253113 [7] NCCL INFO Channel 06/0 : 7[7] -> 5[5] via P2P/CUMEM
bm-2204kzq:252983:253114 [5] NCCL INFO Channel 06/0 : 5[5] -> 6[6] via P2P/CUMEM
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 06/0 : 6[6] -> 12[4] [send] via NET/IB/1(4)/GDRDMA mlx5_3:1/RoCE
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 06/0 : 6[6] -> 12[4] [receive] via NET/IB/2/GDRDMA mlx5_3:1/RoCE
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 06/0 : 12[4] -> 11[3] via P2P/CUMEM
bm-2204qhn:253837:253967 [3] NCCL INFO Channel 06/0 : 11[3] -> 9[1] via P2P/CUMEM
bm-2204qhn:253835:253971 [1] NCCL INFO Channel 06/0 : 9[1] -> 10[2] via P2P/CUMEM
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 06/0 : 10[2] -> 8[0] via P2P/CUMEM
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 06/0 : 8[0] -> 15[7] via P2P/CUMEM
bm-2204qhn:253841:253968 [7] NCCL INFO Channel 06/0 : 15[7] -> 13[5] via P2P/CUMEM
bm-2204qhn:253839:253969 [5] NCCL INFO Channel 06/0 : 13[5] -> 14[6] via P2P/CUMEM
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 06/0 : 14[6] -> 4[4] [send] via NET/IB/2(12)/GDRDMA mlx5_3:1/RoCE
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 06/0 : 14[6] -> 4[4] [receive] via NET/IB/1/GDRDMA mlx5_3:1/RoCE
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 06/0 : 4[4] -> 3[3] via P2P/CUMEM
bm-2204kzq:252981:253116 [3] NCCL INFO Channel 06/0 : 3[3] -> 1[1] via P2P/CUMEM
bm-2204kzq:252979:253119 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 06/0 : 2[2] -> 0[0] via P2P/CUMEM
## Channel C7
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 07/16 : 0 7 5 14 12 11 9 10 8 15 13 6 4 3 1 2
bm-2204kzq:
[0]mlx5_2:1/RoCE
[1]mlx5_3:1/RoCE
[2]mlx5_4:1/RoCE
[3]mlx5_0:1/RoCE
bm-2204qhn:
[0]mlx5_0:1/RoCE
[1]mlx5_2:1/RoCE
[2]mlx5_3:1/RoCE
[3]mlx5_4:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 07/0 : 0[0] -> 7[7] via P2P/CUMEM
bm-2204kzq:252985:253113 [7] NCCL INFO Channel 07/0 : 7[7] -> 5[5] via P2P/CUMEM
bm-2204kzq:252983:253114 [5] NCCL INFO Channel 07/0 : 5[5] -> 14[6] [send] via NET/IB/2(6)/GDRDMA mlx5_4:1/RoCE
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 07/0 : 5[5] -> 14[6] [receive] via NET/IB/3/GDRDMA mlx5_4:1/RoCE
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 07/0 : 14[6] -> 12[4] via P2P/CUMEM
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 07/0 : 12[4] -> 11[3] via P2P/CUMEM
bm-2204qhn:253837:253967 [3] NCCL INFO Channel 07/0 : 11[3] -> 9[1] via P2P/CUMEM
bm-2204qhn:253835:253971 [1] NCCL INFO Channel 07/0 : 9[1] -> 10[2] via P2P/CUMEM
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 07/0 : 10[2] -> 8[0] via P2P/CUMEM
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 07/0 : 8[0] -> 15[7] via P2P/CUMEM
bm-2204qhn:253841:253968 [7] NCCL INFO Channel 07/0 : 15[7] -> 13[5] via P2P/CUMEM
bm-2204qhn:253839:253969 [5] NCCL INFO Channel 07/0 : 13[5] -> 6[6] [send] via NET/IB/3(14)/GDRDMA mlx5_4:1/RoCE
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 07/0 : 13[5] -> 6[6] [receive] via NET/IB/2/GDRDMA mlx5_4:1/RoCE
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 07/0 : 6[6] -> 4[4] via P2P/CUMEM
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 07/0 : 4[4] -> 3[3] via P2P/CUMEM
bm-2204kzq:252981:253116 [3] NCCL INFO Channel 07/0 : 3[3] -> 1[1] via P2P/CUMEM
bm-2204kzq:252979:253119 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 07/0 : 2[2] -> 0[0] via P2P/CUMEM
## Channel C8
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 08/16 : 0 7 5 6 4 3 1 2 8 15 13 14 12 11 9 10
bm-2204kzq:
[0]mlx5_2:1/RoCE
[1]mlx5_3:1/RoCE
[2]mlx5_4:1/RoCE
[3]mlx5_0:1/RoCE
bm-2204qhn:
[0]mlx5_0:1/RoCE
[1]mlx5_2:1/RoCE
[2]mlx5_3:1/RoCE
[3]mlx5_4:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 08/0 : 0[0] -> 7[7] via P2P/CUMEM
bm-2204kzq:252985:253113 [7] NCCL INFO Channel 08/0 : 7[7] -> 5[5] via P2P/CUMEM
bm-2204kzq:252983:253114 [5] NCCL INFO Channel 08/0 : 5[5] -> 6[6] via P2P/CUMEM
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 08/0 : 6[6] -> 4[4] via P2P/CUMEM
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 08/0 : 4[4] -> 3[3] via P2P/CUMEM
bm-2204kzq:252981:253116 [3] NCCL INFO Channel 08/0 : 3[3] -> 1[1] via P2P/CUMEM
bm-2204kzq:252979:253119 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 08/0 : 2[2] -> 8[0] [send] via NET/IB/3(0)/GDRDMA mlx5_0:1/RoCE
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 08/0 : 2[2] -> 8[0] [receive] via NET/IB/0/GDRDMA mlx5_0:1/RoCE
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 08/0 : 8[0] -> 15[7] via P2P/CUMEM
bm-2204qhn:253841:253968 [7] NCCL INFO Channel 08/0 : 15[7] -> 13[5] via P2P/CUMEM
bm-2204qhn:253839:253969 [5] NCCL INFO Channel 08/0 : 13[5] -> 14[6] via P2P/CUMEM
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 08/0 : 14[6] -> 12[4] via P2P/CUMEM
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 08/0 : 12[4] -> 11[3] via P2P/CUMEM
bm-2204qhn:253837:253967 [3] NCCL INFO Channel 08/0 : 11[3] -> 9[1] via P2P/CUMEM
bm-2204qhn:253835:253971 [1] NCCL INFO Channel 08/0 : 9[1] -> 10[2] via P2P/CUMEM
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 08/0 : 10[2] -> 0[0] [send] via NET/IB/0(8)/GDRDMA mlx5_0:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 08/0 : 10[2] -> 0[0] [receive] via NET/IB/3/GDRDMA mlx5_0:1/RoCE
## Channel C9
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 09/16 : 0 7 5 6 4 3 1 10 8 15 13 14 12 11 9 2
bm-2204kzq:
[0]mlx5_2:1/RoCE
[1]mlx5_3:1/RoCE
[2]mlx5_4:1/RoCE
[3]mlx5_0:1/RoCE
bm-2204qhn:
[0]mlx5_0:1/RoCE
[1]mlx5_2:1/RoCE
[2]mlx5_3:1/RoCE
[3]mlx5_4:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 09/0 : 0[0] -> 7[7] via P2P/CUMEM
bm-2204kzq:252985:253113 [7] NCCL INFO Channel 09/0 : 7[7] -> 5[5] via P2P/CUMEM
bm-2204kzq:252983:253114 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/CUMEM
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 09/0 : 6[6] -> 4[4] via P2P/CUMEM
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 09/0 : 4[4] -> 3[3] via P2P/CUMEM
bm-2204kzq:252981:253116 [3] NCCL INFO Channel 09/0 : 3[3] -> 1[1] via P2P/CUMEM
bm-2204kzq:252979:253119 [1] NCCL INFO Channel 09/0 : 1[1] -> 10[2] [send] via NET/IB/0(2)/GDRDMA mlx5_2:1/RoCE
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 09/0 : 1[1] -> 10[2] [receive] via NET/IB/1/GDRDMA mlx5_2:1/RoCE
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 09/0 : 10[2] -> 8[0] via P2P/CUMEM
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 09/0 : 8[0] -> 15[7] via P2P/CUMEM
bm-2204qhn:253841:253968 [7] NCCL INFO Channel 09/0 : 15[7] -> 13[5] via P2P/CUMEM
bm-2204qhn:253839:253969 [5] NCCL INFO Channel 09/0 : 13[5] -> 14[6] via P2P/CUMEM
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 09/0 : 14[6] -> 12[4] via P2P/CUMEM
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 09/0 : 12[4] -> 11[3] via P2P/CUMEM
bm-2204qhn:253837:253967 [3] NCCL INFO Channel 09/0 : 11[3] -> 9[1] via P2P/CUMEM
bm-2204qhn:253835:253971 [1] NCCL INFO Channel 09/0 : 9[1] -> 2[2] [send] via NET/IB/1(10)/GDRDMA mlx5_2:1/RoCE
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 09/0 : 9[1] -> 2[2] [receive] via NET/IB/0/GDRDMA mlx5_2:1/RoCE
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 09/0 : 2[2] -> 0[0] via P2P/CUMEM
## Channel C10
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 10/16 : 0 7 5 6 12 11 9 10 8 15 13 14 4 3 1 2
bm-2204kzq:
[0]mlx5_2:1/RoCE
[1]mlx5_3:1/RoCE
[2]mlx5_4:1/RoCE
[3]mlx5_0:1/RoCE
bm-2204qhn:
[0]mlx5_0:1/RoCE
[1]mlx5_2:1/RoCE
[2]mlx5_3:1/RoCE
[3]mlx5_4:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 10/0 : 0[0] -> 7[7] via P2P/CUMEM
bm-2204kzq:252985:253113 [7] NCCL INFO Channel 10/0 : 7[7] -> 5[5] via P2P/CUMEM
bm-2204kzq:252983:253114 [5] NCCL INFO Channel 10/0 : 5[5] -> 6[6] via P2P/CUMEM
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 10/0 : 6[6] -> 12[4] [send] via NET/IB/1(4)/GDRDMA mlx5_3:1/RoCE
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 10/0 : 6[6] -> 12[4] [receive] via NET/IB/2/GDRDMA mlx5_3:1/RoCE
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 10/0 : 12[4] -> 11[3] via P2P/CUMEM
bm-2204qhn:253837:253967 [3] NCCL INFO Channel 10/0 : 11[3] -> 9[1] via P2P/CUMEM
bm-2204qhn:253835:253971 [1] NCCL INFO Channel 10/0 : 9[1] -> 10[2] via P2P/CUMEM
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 10/0 : 10[2] -> 8[0] via P2P/CUMEM
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 10/0 : 8[0] -> 15[7] via P2P/CUMEM
bm-2204qhn:253841:253968 [7] NCCL INFO Channel 10/0 : 15[7] -> 13[5] via P2P/CUMEM
bm-2204qhn:253839:253969 [5] NCCL INFO Channel 10/0 : 13[5] -> 14[6] via P2P/CUMEM
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 10/0 : 14[6] -> 4[4] [send] via NET/IB/2(12)/GDRDMA mlx5_3:1/RoCE
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 10/0 : 14[6] -> 4[4] [receive] via NET/IB/1/GDRDMA mlx5_3:1/RoCE
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 10/0 : 4[4] -> 3[3] via P2P/CUMEM
bm-2204kzq:252981:253116 [3] NCCL INFO Channel 10/0 : 3[3] -> 1[1] via P2P/CUMEM
bm-2204kzq:252979:253119 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 10/0 : 2[2] -> 0[0] via P2P/CUMEM
## Channel C11
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 11/16 : 0 7 5 14 12 11 9 10 8 15 13 6 4 3 1 2
bm-2204kzq:
[0]mlx5_2:1/RoCE
[1]mlx5_3:1/RoCE
[2]mlx5_4:1/RoCE
[3]mlx5_0:1/RoCE
bm-2204qhn:
[0]mlx5_0:1/RoCE
[1]mlx5_2:1/RoCE
[2]mlx5_3:1/RoCE
[3]mlx5_4:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 11/0 : 0[0] -> 7[7] via P2P/CUMEM
bm-2204kzq:252985:253113 [7] NCCL INFO Channel 11/0 : 7[7] -> 5[5] via P2P/CUMEM
bm-2204kzq:252983:253114 [5] NCCL INFO Channel 11/0 : 5[5] -> 14[6] [send] via NET/IB/2(6)/GDRDMA mlx5_4:1/RoCE
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 11/0 : 5[5] -> 14[6] [receive] via NET/IB/3/GDRDMA mlx5_4:1/RoCE
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 11/0 : 14[6] -> 12[4] via P2P/CUMEM
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 11/0 : 12[4] -> 11[3] via P2P/CUMEM
bm-2204qhn:253837:253967 [3] NCCL INFO Channel 11/0 : 11[3] -> 9[1] via P2P/CUMEM
bm-2204qhn:253835:253971 [1] NCCL INFO Channel 11/0 : 9[1] -> 10[2] via P2P/CUMEM
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 11/0 : 10[2] -> 8[0] via P2P/CUMEM
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 11/0 : 8[0] -> 15[7] via P2P/CUMEM
bm-2204qhn:253841:253968 [7] NCCL INFO Channel 11/0 : 15[7] -> 13[5] via P2P/CUMEM
bm-2204qhn:253839:253969 [5] NCCL INFO Channel 11/0 : 13[5] -> 6[6] [send] via NET/IB/3(14)/GDRDMA mlx5_4:1/RoCE
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 11/0 : 13[5] -> 6[6] [receive] via NET/IB/2/GDRDMA mlx5_4:1/RoCE
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 11/0 : 6[6] -> 4[4] via P2P/CUMEM
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 11/0 : 4[4] -> 3[3] via P2P/CUMEM
bm-2204kzq:252981:253116 [3] NCCL INFO Channel 11/0 : 3[3] -> 1[1] via P2P/CUMEM
bm-2204kzq:252979:253119 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 11/0 : 2[2] -> 0[0] via P2P/CUMEM
## Channel C12
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 12/16 : 0 7 5 6 4 3 1 2 8 15 13 14 12 11 9 10
bm-2204kzq:
[0]mlx5_2:1/RoCE
[1]mlx5_3:1/RoCE
[2]mlx5_4:1/RoCE
[3]mlx5_0:1/RoCE
bm-2204qhn:
[0]mlx5_0:1/RoCE
[1]mlx5_2:1/RoCE
[2]mlx5_3:1/RoCE
[3]mlx5_4:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 12/0 : 0[0] -> 7[7] via P2P/CUMEM
bm-2204kzq:252985:253113 [7] NCCL INFO Channel 12/0 : 7[7] -> 5[5] via P2P/CUMEM
bm-2204kzq:252983:253114 [5] NCCL INFO Channel 12/0 : 5[5] -> 6[6] via P2P/CUMEM
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 12/0 : 6[6] -> 4[4] via P2P/CUMEM
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 12/0 : 4[4] -> 3[3] via P2P/CUMEM
bm-2204kzq:252981:253116 [3] NCCL INFO Channel 12/0 : 3[3] -> 1[1] via P2P/CUMEM
bm-2204kzq:252979:253119 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 12/0 : 2[2] -> 8[0] [send] via NET/IB/3(0)/GDRDMA mlx5_0:1/RoCE
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 12/0 : 2[2] -> 8[0] [receive] via NET/IB/0/GDRDMA mlx5_0:1/RoCE
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 12/0 : 8[0] -> 15[7] via P2P/CUMEM
bm-2204qhn:253841:253968 [7] NCCL INFO Channel 12/0 : 15[7] -> 13[5] via P2P/CUMEM
bm-2204qhn:253839:253969 [5] NCCL INFO Channel 12/0 : 13[5] -> 14[6] via P2P/CUMEM
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 12/0 : 14[6] -> 12[4] via P2P/CUMEM
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 12/0 : 12[4] -> 11[3] via P2P/CUMEM
bm-2204qhn:253837:253967 [3] NCCL INFO Channel 12/0 : 11[3] -> 9[1] via P2P/CUMEM
bm-2204qhn:253835:253971 [1] NCCL INFO Channel 12/0 : 9[1] -> 10[2] via P2P/CUMEM
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 12/0 : 10[2] -> 0[0] [send] via NET/IB/0(8)/GDRDMA mlx5_0:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 12/0 : 10[2] -> 0[0] [receive] via NET/IB/3/GDRDMA mlx5_0:1/RoCE
## Channel C13
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 13/16 : 0 7 5 6 4 3 1 10 8 15 13 14 12 11 9 2
bm-2204kzq:
[0]mlx5_2:1/RoCE
[1]mlx5_3:1/RoCE
[2]mlx5_4:1/RoCE
[3]mlx5_0:1/RoCE
bm-2204qhn:
[0]mlx5_0:1/RoCE
[1]mlx5_2:1/RoCE
[2]mlx5_3:1/RoCE
[3]mlx5_4:1/RoCE
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 13/0 : 2[2] -> 0[0] via P2P/CUMEM
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 13/0 : 0[0] -> 7[7] via P2P/CUMEM
bm-2204kzq:252985:253113 [7] NCCL INFO Channel 13/0 : 7[7] -> 5[5] via P2P/CUMEM
bm-2204kzq:252983:253114 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/CUMEM
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 13/0 : 6[6] -> 4[4] via P2P/CUMEM
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 13/0 : 4[4] -> 3[3] via P2P/CUMEM
bm-2204kzq:252981:253116 [3] NCCL INFO Channel 13/0 : 3[3] -> 1[1] via P2P/CUMEM
bm-2204kzq:252979:253119 [1] NCCL INFO Channel 13/0 : 1[1] -> 10[2] [send] via NET/IB/0(2)/GDRDMA mlx5_2:1/RoCE
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 13/0 : 1[1] -> 10[2] [receive] via NET/IB/1/GDRDMA mlx5_2:1/RoCE
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 13/0 : 10[2] -> 8[0] via P2P/CUMEM
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 13/0 : 8[0] -> 15[7] via P2P/CUMEM
bm-2204qhn:253841:253968 [7] NCCL INFO Channel 13/0 : 15[7] -> 13[5] via P2P/CUMEM
bm-2204qhn:253839:253969 [5] NCCL INFO Channel 13/0 : 13[5] -> 14[6] via P2P/CUMEM
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 13/0 : 14[6] -> 12[4] via P2P/CUMEM
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 13/0 : 12[4] -> 11[3] via P2P/CUMEM
bm-2204qhn:253837:253967 [3] NCCL INFO Channel 13/0 : 11[3] -> 9[1] via P2P/CUMEM
bm-2204qhn:253835:253971 [1] NCCL INFO Channel 13/0 : 9[1] -> 2[2] [send] via NET/IB/1(10)/GDRDMA mlx5_2:1/RoCE
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 13/0 : 9[1] -> 2[2] [receive] via NET/IB/0/GDRDMA mlx5_2:1/RoCE
## Channel C14
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 14/16 : 0 7 5 6 12 11 9 10 8 15 13 14 4 3 1 2
bm-2204kzq:
[0]mlx5_2:1/RoCE
[1]mlx5_3:1/RoCE
[2]mlx5_4:1/RoCE
[3]mlx5_0:1/RoCE
bm-2204qhn:
[0]mlx5_0:1/RoCE
[1]mlx5_2:1/RoCE
[2]mlx5_3:1/RoCE
[3]mlx5_4:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 14/0 : 0[0] -> 7[7] via P2P/CUMEM
bm-2204kzq:252985:253113 [7] NCCL INFO Channel 14/0 : 7[7] -> 5[5] via P2P/CUMEM
bm-2204kzq:252983:253114 [5] NCCL INFO Channel 14/0 : 5[5] -> 6[6] via P2P/CUMEM
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 14/0 : 6[6] -> 12[4] [send] via NET/IB/1(4)/GDRDMA mlx5_3:1/RoCE
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 14/0 : 6[6] -> 12[4] [receive] via NET/IB/2/GDRDMA mlx5_3:1/RoCE
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 14/0 : 12[4] -> 11[3] via P2P/CUMEM
bm-2204qhn:253837:253967 [3] NCCL INFO Channel 14/0 : 11[3] -> 9[1] via P2P/CUMEM
bm-2204qhn:253835:253971 [1] NCCL INFO Channel 14/0 : 9[1] -> 10[2] via P2P/CUMEM
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 14/0 : 10[2] -> 8[0] via P2P/CUMEM
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 14/0 : 8[0] -> 15[7] via P2P/CUMEM
bm-2204qhn:253841:253968 [7] NCCL INFO Channel 14/0 : 15[7] -> 13[5] via P2P/CUMEM
bm-2204qhn:253839:253969 [5] NCCL INFO Channel 14/0 : 13[5] -> 14[6] via P2P/CUMEM
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 14/0 : 14[6] -> 4[4] [send] via NET/IB/2(12)/GDRDMA mlx5_3:1/RoCE
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 14/0 : 14[6] -> 4[4] [receive] via NET/IB/1/GDRDMA mlx5_3:1/RoCE
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 14/0 : 4[4] -> 3[3] via P2P/CUMEM
bm-2204kzq:252981:253116 [3] NCCL INFO Channel 14/0 : 3[3] -> 1[1] via P2P/CUMEM
bm-2204kzq:252979:253119 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 14/0 : 2[2] -> 0[0] via P2P/CUMEM
## Channel C15
bm-2204kzq:252978:253054 [0] NCCL INFO Channel 15/16 : 0 7 5 14 12 11 9 10 8 15 13 6 4 3 1 2
bm-2204kzq:
[0]mlx5_2:1/RoCE
[1]mlx5_3:1/RoCE
[2]mlx5_4:1/RoCE
[3]mlx5_0:1/RoCE
bm-2204qhn:
[0]mlx5_0:1/RoCE
[1]mlx5_2:1/RoCE
[2]mlx5_3:1/RoCE
[3]mlx5_4:1/RoCE
bm-2204kzq:252978:253115 [0] NCCL INFO Channel 15/0 : 0[0] -> 7[7] via P2P/CUMEM
bm-2204kzq:252985:253113 [7] NCCL INFO Channel 15/0 : 7[7] -> 5[5] via P2P/CUMEM
bm-2204kzq:252983:253114 [5] NCCL INFO Channel 15/0 : 5[5] -> 14[6] [send] via NET/IB/2(6)/GDRDMA mlx5_4:1/RoCE
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 15/0 : 5[5] -> 14[6] [receive] via NET/IB/3/GDRDMA mlx5_4:1/RoCE
bm-2204qhn:253840:253973 [6] NCCL INFO Channel 15/0 : 14[6] -> 12[4] via P2P/CUMEM
bm-2204qhn:253838:253972 [4] NCCL INFO Channel 15/0 : 12[4] -> 11[3] via P2P/CUMEM
bm-2204qhn:253837:253967 [3] NCCL INFO Channel 15/0 : 11[3] -> 9[1] via P2P/CUMEM
bm-2204qhn:253835:253971 [1] NCCL INFO Channel 15/0 : 9[1] -> 10[2] via P2P/CUMEM
bm-2204qhn:253836:253974 [2] NCCL INFO Channel 15/0 : 10[2] -> 8[0] via P2P/CUMEM
bm-2204qhn:253834:253970 [0] NCCL INFO Channel 15/0 : 8[0] -> 15[7] via P2P/CUMEM
bm-2204qhn:253841:253968 [7] NCCL INFO Channel 15/0 : 15[7] -> 13[5] via P2P/CUMEM
bm-2204qhn:253839:253969 [5] NCCL INFO Channel 15/0 : 13[5] -> 6[6] [send] via NET/IB/3(14)/GDRDMA mlx5_4:1/RoCE
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 15/0 : 13[5] -> 6[6] [receive] via NET/IB/2/GDRDMA mlx5_4:1/RoCE
bm-2204kzq:252984:253117 [6] NCCL INFO Channel 15/0 : 6[6] -> 4[4] via P2P/CUMEM
bm-2204kzq:252982:253118 [4] NCCL INFO Channel 15/0 : 4[4] -> 3[3] via P2P/CUMEM
bm-2204kzq:252981:253116 [3] NCCL INFO Channel 15/0 : 3[3] -> 1[1] via P2P/CUMEM
bm-2204kzq:252979:253119 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM
bm-2204kzq:252980:253120 [2] NCCL INFO Channel 15/0 : 2[2] -> 0[0] via P2P/CUMEM
查看VTEP信息
admin@Leaf1:~$ show evpn status
交换机路由信息
admin@Leaf1:~$ ip route show
下面信息显示Server2的路由转发到了Leaf1
admin@Leaf1:~$ ip neigh show nud all | grep Vlan
admin@Leaf1:~$ sudo bridge fdb|grep vxlan
admin@Leaf1:~$ show ip route vrf Vnet123
查看Leaf2的路由信息
查看VTEP信息
admin@Leaf2:~$ show evpn status
交换机路由信息
admin@Leaf2:~$ ip route show
下面信息显示Server2的路由转发到了Leaf1
admin@Leaf2:~$ ip neigh show nud all | grep Vlan
admin@Leaf2:~$ sudo bridge fdb |grep vxlan
admin@Leaf2:~$ show ip route vrf Vnet123
RDMA(Remote Direct Memory Access)技术是一种基于网络的内存访问技术,它允许内存数据在计算机之间直接传输,无需CPU或操作系统的参与。目前,RDMA技术被广泛应用于超算、AI训练、存储等网络密集型场景。虽然RDMA技术的性能十分可观,但是需要专用的RDMA网卡,为了兼容普通网卡,IBTA提出了一种RDMA的软件实现方式——SoftRoCE。SoftRoCE整体由软件实现,底层使用普通的以太网卡,和硬件方案相比性能较差,但优势在于能用普通以太网卡与RDMA网卡通信。
配置主机
# Master node: (re)generate /etc/keepalived/keepalived.conf.
# Defines sync group G1 and VRRP instance I1 on br-lan (router id 51,
# priority 101) carrying the virtual IP 10.240.4.226/24.
# NOTE(review): keepalived documents the state keyword as MASTER/BACKUP in
# upper case - confirm the installed version treats the lower-case spelling
# below as intended.
cat <<-EOF > /etc/keepalived/keepalived.conf
! Configuration File for keepalived
! failover E1 and I1 at the same time
vrrp_sync_group G1 {
group {
I1
}
}
! internal
vrrp_instance I1 {
state master
interface br-lan
virtual_router_id 51
priority 101
advert_int 1
virtual_ipaddress {
10.240.4.226/24
}
authentication {
auth_type PASS
auth_pass s3cret
}
nopreempt
}
EOF
配置备机:
# Write the keepalived configuration for the standby router: the same VRRP
# instance (I1, virtual_router_id 51, VIP 10.240.4.226/24 on br-lan) as the
# primary, but with the lower priority 50.
# The heredoc terminator must sit on a line of its own; when it is fused with
# the closing brace ("} EOF") the shell never sees the end of the heredoc and
# swallows everything that follows.
# The state keyword is written as BACKUP, the spelling documented in the
# keepalived.conf man page, which is also the initial state nopreempt requires.
cat > /etc/keepalived/keepalived.conf <<-EOF
! Configuration File for keepalived
! failover E1 and I1 at the same time
vrrp_sync_group G1 {
group {
I1
}
}
! internal
vrrp_instance I1 {
state BACKUP
interface br-lan
virtual_router_id 51
priority 50
advert_int 1
virtual_ipaddress {
10.240.4.226/24
}
authentication {
auth_type PASS
auth_pass s3cret
}
nopreempt
}
EOF
重新启动服务:
/etc/init.d/keepalived restart
在主路由上执行 ip a 命令可以看到 VIP 地址已经加载,电脑 ping 这个地址可以通。主机IP为 10.240.4.224/24,备机IP为 10.240.4.225/24。主机关闭该服务后,主机上只剩 10.240.4.224/24,虚拟IP丢失。
root@OpenWrt:/# /etc/init.d/keepalived stop
查看备机加载了虚拟IP:10.240.4.226/24
root@OpenWrt:/# ip a|grep br-lan
br-lan: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default
inet 10.240.4.225/24 brd 10.240.4.255 scope global br-lan
inet 10.240.4.226/24 scope global secondary br-lan
主机重启该服务后虚拟IP重新加载
root@OpenWrt:/# /etc/init.d/keepalived restart
与此同时,备机上的虚拟IP丢失:
root@OpenWrt:/# ip a|grep br-lan
br-lan: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default
inet 10.240.4.225/24 brd 10.240.4.255 scope global br-lan
name 表示:设置当前这个 rule 的名称 target 表示:设置防火墙动作,可选值: ACCEPT 许可, REJECT 拒绝, DROP 抛弃 src 表示: 数据源的 zone 域是哪个。可选值: wan / lan src_ip 表示:数据源的 IP 地址是哪个。 src_mac 表示:数据源的 MAC 地址是哪个。 src_port 表示:数据源的端口,可以是一个端口,或一个端口范围,但是必须
同时指定了协议类型 proto 表示: 数据源的协议类型, 可选值: tcp, udp, tcpudp, udplite, icmp, esp, ah, sctp, 或 all 表示全部 dest 表示:数据目标的 zone 域是哪个。可选值: wan / lan dest_ip 表示:数据目标的 IP 地址。 dest_port 表示:数据目标的端口,可以是一个端口,或一个端口范围,但是必须同时指定了协议类型 family 表示:数据的协议族,可选值: ipv4, ipv6, any rule 规则设置可以灵活,比如允许来自 WAN 口的 ping,
例:
# Rule "Allow-Ping": accept IPv4 ICMP echo-request packets whose source zone
# is wan.
config rule
option name Allow-Ping
option src wan
option proto icmp
option icmp_type echo-request
option family ipv4
option target ACCEPT
第五部分内容 端口转发、重定向
# Port 8080 proxy: DNAT traffic arriving from the wan zone on port 8080 to
# 192.168.40.2 port 18080 in the lan zone.
config redirect
option target 'DNAT'
option name 'port-redirect'
option src 'wan'
option src_dport '8080'
option dest 'lan'
option dest_ip '192.168.40.2'
option dest_port '18080'
# port 2000 ssh login local machine: DNAT TCP traffic arriving from the wan
# zone on port 2000 to port 22. No dest_ip is set in this section.
config redirect
option target 'DNAT'
option name 'port-2000'
list proto 'tcp'
option src 'wan'
option src_dport '2000'
option dest 'lan'
option dest_port '22'
OpenWrt 防火墙允许使用者通过 WAN 口访问特定的端口重定向给局域网的一台电脑设备(比如 WAN 口访问 80 端口(HTTP)将重定向给局域网某台网站服务器)。 端口重定向是在防火墙配置/etc/config/firewall 中定义 redirect 段策略实现的。所有匹配的来源数据将根据目标设置转发到目标主机上。 firewall 配置中可以有多个 redirect 策略,默认是没有开放任何重定向的,如果需要重定向请使用 vi 或 UCI 进行配置。 name 表示:设置当前这个 redirect 的名称 src 表示:转发源的 zone 域,一般转发都是转发从 wan 过来的访问 src_ip 表示:转发源的 IP 地址指定 src_mac 表示:转发源的 MAC 地址指定 src_port 表示:转发源的端口指定 proto 表示: 转发源的协议类型, 可选值: tcp, udp, tcpudp, udplite, icmp, esp, ah, sctp, 或 all 表示全部 dest 表示:转发目标的 zone 域 dest_ip 表示:转发目标的 IP 地址指定 dest_mac 表示:转发目标的 MAC 地址指定 dest_port 表示:转发目标的端口指定 端口重定向的可配置性很灵活。比如我们将 8080 这个端口转发给内网一台服务器的 18080 端口。
第六部分内容 IPV6
# Rule "Allow-DHCPv6": accept IPv6 UDP traffic from the wan zone to
# destination port 546 when both source and destination addresses fall
# within fc00::/6.
config rule
option name 'Allow-DHCPv6'
option src 'wan'
option proto 'udp'
option src_ip 'fc00::/6'
option dest_ip 'fc00::/6'
option dest_port '546'
option family 'ipv6'
option target 'ACCEPT'
# Rule "Allow-MLD": accept IPv6 ICMP types 130/0, 131/0, 132/0 and 143/0
# from the wan zone when the source address falls within fe80::/10.
config rule
option name 'Allow-MLD'
option src 'wan'
option proto 'icmp'
option src_ip 'fe80::/10'
list icmp_type '130/0'
list icmp_type '131/0'
list icmp_type '132/0'
list icmp_type '143/0'
option family 'ipv6'
option target 'ACCEPT'
# Rule "Allow-ICMPv6-Input": accept the listed IPv6 ICMP message types from
# the wan zone, limited to 1000/sec. No dest zone is set in this section.
config rule
option name 'Allow-ICMPv6-Input'
option src 'wan'
option proto 'icmp'
list icmp_type 'echo-request'
list icmp_type 'echo-reply'
list icmp_type 'destination-unreachable'
list icmp_type 'packet-too-big'
list icmp_type 'time-exceeded'
list icmp_type 'bad-header'
list icmp_type 'unknown-header-type'
list icmp_type 'router-solicitation'
list icmp_type 'neighbour-solicitation'
list icmp_type 'router-advertisement'
list icmp_type 'neighbour-advertisement'
option limit '1000/sec'
option family 'ipv6'
option target 'ACCEPT'
# Rule "Allow-ICMPv6-Forward": accept the listed IPv6 ICMP message types from
# the wan zone towards any destination zone (dest '*'), limited to 1000/sec.
config rule
option name 'Allow-ICMPv6-Forward'
option src 'wan'
option dest '*'
option proto 'icmp'
list icmp_type 'echo-request'
list icmp_type 'echo-reply'
list icmp_type 'destination-unreachable'
list icmp_type 'packet-too-big'
list icmp_type 'time-exceeded'
list icmp_type 'bad-header'
list icmp_type 'unknown-header-type'
option limit '1000/sec'
option family 'ipv6'
option target 'ACCEPT'
第七部分 IPSec
# Rule "Allow-IPSec-ESP": accept ESP-protocol traffic from the wan zone to
# the lan zone.
config rule
option name 'Allow-IPSec-ESP'
option src 'wan'
option dest 'lan'
option proto 'esp'
option target 'ACCEPT'
# Rule "Allow-ISAKMP": accept UDP traffic from the wan zone to the lan zone
# on destination port 500.
config rule
option name 'Allow-ISAKMP'
option src 'wan'
option dest 'lan'
option dest_port '500'
option proto 'udp'
option target 'ACCEPT'
允许 IPSec tunnel 通过防火墙,分别对 IKE SA、IPSec SA 放行。
第八部分 扩展内容
# Include the user-defined firewall file /etc/firewall.user.
config include
option path '/etc/firewall.user'
# Named include section 'miniupnpd': a script-type include pointing at
# /usr/share/miniupnpd/firewall.include, with family 'any' and reload '1'.
# NOTE(review): reload '1' presumably makes the script run again on firewall
# reloads -- confirm against the OpenWrt firewall documentation.
config include 'miniupnpd'
option type 'script'
option path '/usr/share/miniupnpd/firewall.include'
option family 'any'
option reload '1'
Simple DMZ rule
The following rule redirects all WAN ports for all protocols to the internal host 192.168.1.2.
config redirect
option src wan
option proto all
option dest_ip 192.168.1.2
# Load the bonding driver with mode=balance-rr and miimon=100.
modprobe bonding mode=balance-rr miimon=100

# Create a bond-type virtual interface named bond-wan.
ip link add bond-wan type bond mode balance-rr

# Attach eth0 and then eth1 to the aggregate interface; each NIC is taken
# down before it is enslaved.
for nic in eth0 eth1; do
  ifconfig "$nic" down
  ip link set "$nic" master bond-wan
done

# Bring the bonded interface up.
ip link set bond-wan up
vi /home/admin/tftp/grub/grub.cfg
# If the font file loads, select an automatic graphics mode, load the EFI
# video modules and gfxterm, and send terminal output to gfxterm.
if loadfont /boot/grub/font.pf2 ; then
set gfxmode=auto
insmod efi_gop
insmod efi_uga
insmod gfxterm
terminal_output gfxterm
fi
# Menu colours and the menu timeout value.
set menu_color_normal=white/black
set menu_color_highlight=black/light-gray
set timeout=5
# Boot entry for the Ubuntu Server installer. The kernel and initrd are read
# from /boot/live-server; the kernel command line asks for DHCP addressing,
# points url= at the live-server ISO on 10.230.2.200:8000 and enables
# autoinstall with a nocloud-net datasource served from the same host.
# The ';' in the ds= argument is backslash-escaped in this file.
menuentry "Install Ubuntu Server" {
set gfxpayload=keep
linux /boot/live-server/vmlinuz root=/dev/ram0 ramdisk_size=1500000 ip=dhcp url='http://10.230.2.200:8000/iso/ubuntu-20.04.6-live-server-amd64.iso' autoinstall ds=nocloud-net\;s=http://10.230.2.200:8000/autoinstall/ ---
initrd /boot/live-server/initrd
}
# 创建并配置管理网的网桥,并将智能网卡的管理网IP放到此网桥上
root@OCTEONTX:~# ovs-vsctl add-br br-m -- set bridge br-m datapath_type=netdev
root@OCTEONTX:~# ip add del dev eth4 192.168.5.45/24
root@OCTEONTX:~# ovs-vsctl add-port br-m eth4
root@OCTEONTX:~# ip link set dev br-m up
root@OCTEONTX:~# ip add add dev br-m 192.168.5.45/24
root@OCTEONTX:~# ip route add default via 192.168.5.1 dev br-m
# 创建并配置业务网的网桥,将智能网卡的物理网口eth0连接到此网桥上
#查看智能网卡物理口PCI地址
root@OCTEONTX:/data/helium-v1.0# lspci|grep a063
0002:02:00.0 Ethernet controller: Cavium, Inc. Device a063 (rev 09)
0002:03:00.0 Ethernet controller: Cavium, Inc. Device a063 (rev 09)
0002:04:00.0 Ethernet controller: Cavium, Inc. Device a063 (rev 09)
0002:05:00.0 Ethernet controller: Cavium, Inc. Device a063 (rev 09)
root@OCTEONTX:~# ovs-vsctl add-br br-net -- set bridge br-net datapath_type=netdev
root@OCTEONTX:~# ovs-vsctl add-port br-net eth0 -- set Interface eth0 type=dpdk options:dpdk-devargs=0002:02:00.0 mtu_request=9000
root@OCTEONTX:~# ip link set dev br-net up
# 修改xml文件,将3.1小节创建的虚拟机重命名后用作WEB后端。
[root@asterfusion ~]# virsh shutdown centos-00
[root@asterfusion ~]# virsh shutdown centos-01
[root@asterfusion ~]# virsh domrename centos-00 WEB-00
[root@asterfusion ~]# virsh domrename centos-01 WEB-01
[root@asterfusion ~]# virsh start WEB-00
[root@asterfusion ~]# virsh start WEB-01
[root@asterfusion ~]# virsh list --all
Id Name State
----------------------------------------------------
13 WEB-00 running
14 WEB-01 running
# 重新给两台虚拟机配置管理IP。
# WEB-00:
[root@WEB-00 ~]# ip link set dev eth1 up
[root@WEB-00 ~]# ip add add dev eth1 192.168.5.155/24
[root@WEB-00 ~]# ip link set dev eth0 up
[root@WEB-00 ~]# ip add add dev eth0 172.0.0.100/24
[root@WEB-00 ~]# ip route add default via 172.0.0.1 dev eth0
# WEB-01:
[root@WEB-01 ~]# ip link set dev eth1 up
[root@WEB-01 ~]# ip add add dev eth1 192.168.5.165/24
[root@WEB-01 ~]# ip link set dev eth0 up
[root@WEB-01 ~]# ip add add dev eth0 172.0.0.200/24
[root@WEB-01 ~]# ip route add default via 172.0.0.1 dev eth0
# 删除3.2节用不到的端口及网桥。
root@OCTEONTX:~# ovs-vsctl del-port vm-net fw-if-in-sw
root@OCTEONTX:~# ovs-vsctl del-port br-net fw-if-ou-sw
root@OCTEONTX:~# ovs-vsctl del-port br-m fw-m-sw
root@OCTEONTX:~# ip link delete fw-if-in type veth peer name fw-if-in-sw
root@OCTEONTX:~# ip link delete fw-if-ou type veth peer name fw-if-ou-sw
root@OCTEONTX:~# ip link delete fw-m type veth peer name fw-m-sw
root@OCTEONTX:~# ifconfig vm-net 172.0.0.50/24
root@OCTEONTX:~# ifconfig br-net 10.0.0.50/24