From dbe87e23a96e1d5b5be2d7008783e22133f04515 Mon Sep 17 00:00:00 2001 From: Nicolas De Loof Date: Thu, 17 Sep 2020 10:42:19 +0200 Subject: [PATCH] Guess AWS machine type based on service resources reservations Signed-off-by: Nicolas De Loof --- ecs/compatibility.go | 2 + ecs/gpu.go | 142 +++++++++++++++++++++++++++++++++++++++++++ ecs/gpu_test.go | 115 +++++++++++++++++++++++++++++++++++ 3 files changed, 259 insertions(+) create mode 100644 ecs/gpu.go create mode 100644 ecs/gpu_test.go diff --git a/ecs/compatibility.go b/ecs/compatibility.go index 605a86c74..2f634840d 100644 --- a/ecs/compatibility.go +++ b/ecs/compatibility.go @@ -38,6 +38,8 @@ var compatibleComposeAttributes = []string{ "services.deploy.resources.reservations", "services.deploy.resources.reservations.cpus", "services.deploy.resources.reservations.memory", + "services.deploy.resources.reservations.generic_resources", + "services.deploy.resources.reservations.generic_resources.discrete_resource_spec", "services.deploy.update_config", "services.deploy.update_config.parallelism", "services.entrypoint", diff --git a/ecs/gpu.go b/ecs/gpu.go new file mode 100644 index 000000000..60a2f4788 --- /dev/null +++ b/ecs/gpu.go @@ -0,0 +1,142 @@ +/* + Copyright 2020 Docker, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package ecs + +import ( + "fmt" + "strconv" + + "github.com/compose-spec/compose-go/types" + "github.com/docker/go-units" +) + +type machine struct { + id string + cpus float64 + memory types.UnitBytes + gpus int64 +} + +type family []machine + +var p3family = family{ + { + id: "p3.2xlarge", + cpus: 8, + memory: 64 * units.GiB, + gpus: 2, + }, + { + id: "p3.8xlarge", + cpus: 32, + memory: 244 * units.GiB, + gpus: 4, + }, + { + id: "p3.16xlarge", + cpus: 64, + memory: 488 * units.GiB, + gpus: 8, + }, +} + +type filterFn func(machine) bool + +func (f family) filter(fn filterFn) family { + var filtered family + for _, machine := range f { + if fn(machine) { + filtered = append(filtered, machine) + } + } + return filtered +} + +func (f family) firstOrError(msg string, args ...interface{}) (machine, error) { + if len(f) == 0 { + return machine{}, fmt.Errorf(msg, args...) + } + return f[0], nil +} + +func guessMachineType(project *types.Project) (string, error) { + // we select a machine type to match all gpu-bound services requirements + // once https://github.com/aws/containers-roadmap/issues/631 is implemented we can define dedicated CapacityProviders per service. + minMemory, minCPU, minGPU, err := getResourceRequirements(project) + if err != nil { + return "", err + } + + instanceType, err := p3family. + filter(func(m machine) bool { + return m.memory >= minMemory + }). + filter(func(m machine) bool { + return m.cpus >= minCPU + }). + filter(func(m machine) bool { + return m.gpus >= minGPU + }). + firstOrError("none of the AWS p3 machines match requirement for memory:%d cpu:%f gpu:%d", minMemory, minCPU, minGPU) + if err != nil { + return "", err + } + return instanceType.id, nil +} + +func getResourceRequirements(project *types.Project) (types.UnitBytes, float64, int64, error) { + var minMemory types.UnitBytes + var minCPU float64 + var minGPU int64 + for _, service := range project.Services { + if service.Deploy == nil { + continue + } + reservations := service.Deploy.Resources.Reservations + if reservations == nil { + continue + } + + var requiredGPUs int64 + for _, r := range reservations.GenericResources { + if r.DiscreteResourceSpec.Kind == "gpu" { + requiredGPUs = r.DiscreteResourceSpec.Value + break + } + } + if requiredGPUs == 0 { + continue + } + if requiredGPUs > minGPU { + minGPU = requiredGPUs + } + + if reservations.MemoryBytes > minMemory { + minMemory = reservations.MemoryBytes + } + if reservations.NanoCPUs != "" { + nanocpu, err := strconv.ParseFloat(reservations.NanoCPUs, 64) + if err != nil { + return 0, 0, 0, err + } + if nanocpu > minCPU { + minCPU = nanocpu + } + } + } + return minMemory, minCPU, minGPU, nil +} diff --git a/ecs/gpu_test.go b/ecs/gpu_test.go new file mode 100644 index 000000000..8a3b65207 --- /dev/null +++ b/ecs/gpu_test.go @@ -0,0 +1,115 @@ +/* + Copyright 2020 Docker, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package ecs + +import ( + "testing" +) + +func TestGuessMachineType(t *testing.T) { + tests := []struct { + name string + yaml string + want string + wantErr bool + }{ + { + name: "1-gpu", + yaml: ` +services: + learning: + image: tensorflow/tensorflow:latest-gpu + deploy: + resources: + reservations: + generic_resources: + - discrete_resource_spec: + kind: gpu + value: 1 +`, + want: "p3.2xlarge", + wantErr: false, + }, + { + name: "4-gpu", + yaml: ` +services: + learning: + image: tensorflow/tensorflow:latest-gpu + deploy: + resources: + reservations: + generic_resources: + - discrete_resource_spec: + kind: gpu + value: 4 +`, + want: "p3.8xlarge", + wantErr: false, + }, + { + name: "1-gpu, high-memory", + yaml: ` +services: + learning: + image: tensorflow/tensorflow:latest-gpu + deploy: + resources: + reservations: + memory: 300Gb + generic_resources: + - discrete_resource_spec: + kind: gpu + value: 2 +`, + want: "p3.16xlarge", + wantErr: false, + }, + { + name: "1-gpu, high-cpu", + yaml: ` +services: + learning: + image: tensorflow/tensorflow:latest-gpu + deploy: + resources: + reservations: + memory: 32Gb + cpus: "32" + generic_resources: + - discrete_resource_spec: + kind: gpu + value: 2 +`, + want: "p3.8xlarge", + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + project := loadConfig(t, tt.yaml) + got, err := guessMachineType(project) + if (err != nil) != tt.wantErr { + t.Errorf("guessMachineType() error = %v, wantErr %v", err, tt.wantErr) + return + } + if got != tt.want { + t.Errorf("guessMachineType() got = %v, want %v", got, tt.want) + } + }) + } +}