4.5. 配置为使用 RDMA 的 Training Operator PyTorchJob 资源示例
本例演示了如何创建一个配置为使用远程直接内存访问（Remote Direct Memory Access，RDMA）运行的 Training Operator PyTorch 训练作业。
---
# Kubeflow Training Operator PyTorchJob with one Master and three Worker
# replicas. Both replica types use the same pod template: a secondary network
# annotation, NCCL environment variables, one GPU and one RDMA device.
apiVersion: kubeflow.org/v1
kind: PyTorchJob
metadata:
  name: job
spec:
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            # Multus annotation requesting an additional pod network.
            # "example-net" is presumably a placeholder for the name of a
            # NetworkAttachmentDefinition on the cluster - TODO confirm.
            k8s.v1.cni.cncf.io/networks: "example-net"
        spec:
          containers:
            - command:
                - /bin/bash
                - -c
                # Placeholder: replace with the actual training command.
                - "your container command"
              env:
                # Network interface NCCL should use. "net1" is presumably
                # the interface created for the secondary network above;
                # verify inside the pod.
                - name: NCCL_SOCKET_IFNAME
                  value: "net1"
                # RDMA device NCCL should use. "mlx5_1" is hardware
                # specific; verify the device name on the target nodes.
                - name: NCCL_IB_HCA
                  value: "mlx5_1"
              image: quay.io/modh/training:py311-cuda121-torch241
              name: pytorch
              resources:
                # Requests mirror limits for both extended resources.
                limits:
                  nvidia.com/gpu: "1"
                  # NOTE(review): resource name depends on how the RDMA
                  # device plugin is configured on the cluster - confirm.
                  rdma/rdma_shared_device_eth: "1"
                requests:
                  nvidia.com/gpu: "1"
                  rdma/rdma_shared_device_eth: "1"
    Worker:
      replicas: 3
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            # Same secondary network as the Master replica.
            k8s.v1.cni.cncf.io/networks: "example-net"
        spec:
          containers:
            - command:
                - /bin/bash
                - -c
                # Placeholder: replace with the actual training command.
                - "your container command"
              env:
                # Same NCCL settings as the Master replica.
                - name: NCCL_SOCKET_IFNAME
                  value: "net1"
                - name: NCCL_IB_HCA
                  value: "mlx5_1"
              image: quay.io/modh/training:py311-cuda121-torch241
              name: pytorch
              resources:
                # Requests mirror limits for both extended resources.
                limits:
                  nvidia.com/gpu: "1"
                  rdma/rdma_shared_device_eth: "1"
                requests:
                  nvidia.com/gpu: "1"
                  rdma/rdma_shared_device_eth: "1"
---
# NOTE(review): this manifest repeats the PyTorchJob that precedes it in this
# file (same name, same spec). It looks like a duplicated rendering of one
# example - confirm whether a single copy should be kept.
#
# Kubeflow Training Operator PyTorchJob with one Master and three Worker
# replicas. Both replica types use the same pod template: a secondary network
# annotation, NCCL environment variables, one GPU and one RDMA device.
apiVersion: kubeflow.org/v1
kind: PyTorchJob
metadata:
  name: job
spec:
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            # Multus annotation requesting an additional pod network.
            # "example-net" is presumably a placeholder for the name of a
            # NetworkAttachmentDefinition on the cluster - TODO confirm.
            k8s.v1.cni.cncf.io/networks: "example-net"
        spec:
          containers:
            - command:
                - /bin/bash
                - -c
                # Placeholder: replace with the actual training command.
                - "your container command"
              env:
                # Network interface NCCL should use. "net1" is presumably
                # the interface created for the secondary network above;
                # verify inside the pod.
                - name: NCCL_SOCKET_IFNAME
                  value: "net1"
                # RDMA device NCCL should use. "mlx5_1" is hardware
                # specific; verify the device name on the target nodes.
                - name: NCCL_IB_HCA
                  value: "mlx5_1"
              image: quay.io/modh/training:py311-cuda121-torch241
              name: pytorch
              resources:
                # Requests mirror limits for both extended resources.
                limits:
                  nvidia.com/gpu: "1"
                  # NOTE(review): resource name depends on how the RDMA
                  # device plugin is configured on the cluster - confirm.
                  rdma/rdma_shared_device_eth: "1"
                requests:
                  nvidia.com/gpu: "1"
                  rdma/rdma_shared_device_eth: "1"
    Worker:
      replicas: 3
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            # Same secondary network as the Master replica.
            k8s.v1.cni.cncf.io/networks: "example-net"
        spec:
          containers:
            - command:
                - /bin/bash
                - -c
                # Placeholder: replace with the actual training command.
                - "your container command"
              env:
                # Same NCCL settings as the Master replica.
                - name: NCCL_SOCKET_IFNAME
                  value: "net1"
                - name: NCCL_IB_HCA
                  value: "mlx5_1"
              image: quay.io/modh/training:py311-cuda121-torch241
              name: pytorch
              resources:
                # Requests mirror limits for both extended resources.
                limits:
                  nvidia.com/gpu: "1"
                  rdma/rdma_shared_device_eth: "1"
                requests:
                  nvidia.com/gpu: "1"
                  rdma/rdma_shared_device_eth: "1"