diff --git a/ecs/gpu.go b/ecs/gpu.go index 60a2f4788..6ff01de73 100644 --- a/ecs/gpu.go +++ b/ecs/gpu.go @@ -18,6 +18,7 @@ package ecs import ( "fmt" + "math" "strconv" "github.com/compose-spec/compose-go/types" @@ -74,69 +75,141 @@ func (f family) firstOrError(msg string, args ...interface{}) (machine, error) { } func guessMachineType(project *types.Project) (string, error) { - // we select a machine type to match all gpu-bound services requirements + // we select a machine type to match all gpus-bound services requirements // once https://github.com/aws/containers-roadmap/issues/631 is implemented we can define dedicated CapacityProviders per service. - minMemory, minCPU, minGPU, err := getResourceRequirements(project) + requirements, err := getResourceRequirements(project) if err != nil { return "", err } instanceType, err := p3family. filter(func(m machine) bool { - return m.memory >= minMemory + return m.memory >= requirements.memory }). filter(func(m machine) bool { - return m.cpus >= minCPU + return m.cpus >= requirements.cpus }). filter(func(m machine) bool { - return m.gpus >= minGPU + return m.gpus >= requirements.gpus }). - firstOrError("none of the AWS p3 machines match requirement for memory:%d cpu:%f gpu:%d", minMemory, minCPU, minGPU) + firstOrError("none of the AWS p3 machines match requirement for memory:%d cpu:%f gpus:%d", requirements.memory, requirements.cpus, requirements.gpus) if err != nil { return "", err } return instanceType.id, nil } -func getResourceRequirements(project *types.Project) (types.UnitBytes, float64, int64, error) { - var minMemory types.UnitBytes - var minCPU float64 - var minGPU int64 +type resourceRequirements struct { + memory types.UnitBytes + cpus float64 + gpus int64 +} + +// getResourceRequirements merges the reservations of every service that requests gpus into a single requirement holding the highest memory, cpus and gpus values. +func getResourceRequirements(project *types.Project) (*resourceRequirements, error) { + return toResourceRequirementsSlice(project). + filter(func(requirements *resourceRequirements) bool { + // services without deploy reservations are carried as nil entries, skip them rather than dereference nil + return requirements != nil && requirements.gpus != 0 + }).
+ max() +} + +type eitherRequirementsOrError struct { + requirements []*resourceRequirements + err error +} + +func toResourceRequirementsSlice(project *types.Project) eitherRequirementsOrError { + var requirements []*resourceRequirements for _, service := range project.Services { - if service.Deploy == nil { - continue - } - reservations := service.Deploy.Resources.Reservations - if reservations == nil { - continue + r, err := toResourceRequirements(service) + if err != nil { + return eitherRequirementsOrError{nil, err} } + requirements = append(requirements, r) + } + return eitherRequirementsOrError{requirements, nil} +} - var requiredGPUs int64 - for _, r := range reservations.GenericResources { - if r.DiscreteResourceSpec.Kind == "gpu" { - requiredGPUs = r.DiscreteResourceSpec.Value - break - } - } - if requiredGPUs == 0 { - continue - } - if requiredGPUs > minGPU { - minGPU = requiredGPUs - } - - if reservations.MemoryBytes > minMemory { - minMemory = reservations.MemoryBytes - } - if reservations.NanoCPUs != "" { - nanocpu, err := strconv.ParseFloat(reservations.NanoCPUs, 64) - if err != nil { - return 0, 0, 0, err - } - if nanocpu > minCPU { - minCPU = nanocpu - } +func (r eitherRequirementsOrError) filter(fn func(*resourceRequirements) bool) eitherRequirementsOrError { + if r.err != nil { + return r + } + var requirements []*resourceRequirements + for _, req := range r.requirements { + if fn(req) { + requirements = append(requirements, req) } } - return minMemory, minCPU, minGPU, nil + return eitherRequirementsOrError{requirements, nil} +} + +// toResourceRequirements extracts the reserved memory, cpus and gpus of a service; it returns nil (and no error) when the service has no deploy section or no reservations. +func toResourceRequirements(service types.ServiceConfig) (*resourceRequirements, error) { + if service.Deploy == nil { + return nil, nil + } + reservations := service.Deploy.Resources.Reservations + if reservations == nil { + return nil, nil + } + + var requiredGPUs int64 + for _, r := range reservations.GenericResources { + if r.DiscreteResourceSpec.Kind == "gpus" { + requiredGPUs = r.DiscreteResourceSpec.Value
+ break + } + } + + var nanocpu float64 + if reservations.NanoCPUs != "" { + v, err := strconv.ParseFloat(reservations.NanoCPUs, 64) + if err != nil { + return nil, err + } + nanocpu = v + } + return &resourceRequirements{ + memory: reservations.MemoryBytes, + cpus: nanocpu, + gpus: requiredGPUs, + }, nil +} + +func (r resourceRequirements) combine(o *resourceRequirements) resourceRequirements { + if o == nil { + return r + } + return resourceRequirements{ + memory: maxUnitBytes(r.memory, o.memory), + cpus: math.Max(r.cpus, o.cpus), + gpus: maxInt64(r.gpus, o.gpus), + } +} + +func (r eitherRequirementsOrError) max() (*resourceRequirements, error) { + if r.err != nil { + return nil, r.err + } + highest := resourceRequirements{} + for _, req := range r.requirements { + highest = highest.combine(req) + } + return &highest, nil +} + +func maxInt64(a, b int64) int64 { + if a > b { + return a + } + return b +} + +func maxUnitBytes(a, b types.UnitBytes) types.UnitBytes { + if a > b { + return a + } + return b } diff --git a/ecs/gpu_test.go b/ecs/gpu_test.go index 8a3b65207..d556e22f9 100644 --- a/ecs/gpu_test.go +++ b/ecs/gpu_test.go @@ -28,63 +28,63 @@ func TestGuessMachineType(t *testing.T) { wantErr bool }{ { - name: "1-gpu", + name: "1-gpus", yaml: ` services: learning: - image: tensorflow/tensorflow:latest-gpu + image: tensorflow/tensorflow:latest-gpus deploy: resources: reservations: generic_resources: - discrete_resource_spec: - kind: gpu + kind: gpus value: 1 `, want: "p3.2xlarge", wantErr: false, }, { - name: "4-gpu", + name: "4-gpus", yaml: ` services: learning: - image: tensorflow/tensorflow:latest-gpu + image: tensorflow/tensorflow:latest-gpus deploy: resources: reservations: generic_resources: - discrete_resource_spec: - kind: gpu + kind: gpus value: 4 `, want: "p3.8xlarge", wantErr: false, }, { - name: "1-gpu, high-memory", + name: "1-gpus, high-memory", yaml: ` services: learning: - image: tensorflow/tensorflow:latest-gpu + image: tensorflow/tensorflow:latest-gpus deploy:
resources: reservations: memory: 300Gb generic_resources: - discrete_resource_spec: - kind: gpu + kind: gpus value: 2 `, want: "p3.16xlarge", wantErr: false, }, { - name: "1-gpu, high-cpu", + name: "1-gpus, high-cpu", yaml: ` services: learning: - image: tensorflow/tensorflow:latest-gpu + image: tensorflow/tensorflow:latest-gpus deploy: resources: reservations: @@ -92,7 +92,7 @@ services: cpus: "32" generic_resources: - discrete_resource_spec: - kind: gpu + kind: gpus value: 2 `, want: "p3.8xlarge",