-
Notifications
You must be signed in to change notification settings - Fork 706
/
imagenet.yaml
43 lines (43 loc) · 1.11 KB
/
imagenet.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# Elastic PyTorchJob that runs the imagenet.py example through
# torch.distributed.run on a pool of worker pods.
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
  name: elastic-example-imagenet
spec:
  # Elastic settings: the job may run with 1 to 3 workers and tolerates up
  # to 100 restarts.
  elasticPolicy:
    # Rendezvous backend name handed to the elastic launcher.
    rdzvBackend: c10d
    minReplicas: 1
    maxReplicas: 3
    maxRestarts: 100
    # Autoscaling signal: target 80% average CPU utilization.
    # NOTE(review): utilization is presumably measured against the container
    # CPU request declared below - confirm against the operator's autoscaler.
    metrics:
      - type: Resource
        resource:
          name: cpu
          target:
            type: Utilization
            averageUtilization: 80
  pytorchReplicaSpecs:
    Worker:
      # Initial worker count; lies inside the min/max range declared above.
      replicas: 2
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              # NOTE(review): a mutable `latest` tag combined with
              # IfNotPresent means a node that already caches the image will
              # not pull a newer build; consider pinning a version tag.
              image: kubeflow/pytorch-elastic-example-imagenet:latest
              imagePullPolicy: IfNotPresent
              resources:
                requests:
                  cpu: 4
              env:
                - name: LOGLEVEL
                  value: DEBUG
              # Launch the training script as a module run of
              # torch.distributed.run; the remaining items are passed through
              # as arguments.
              command:
                - python
                - -m
                - torch.distributed.run
                - /workspace/examples/imagenet.py
                - "--arch=resnet18"
                - "--epochs=1"
                - "--batch-size=32"
                # Presumably the data-loading worker count of imagenet.py,
                # not the number of pods - TODO confirm against the script.
                - "--workers=0"
                # Positional argument: dataset directory inside the image.
                - "/workspace/data/tiny-imagenet-200"