From dbe87e23a96e1d5b5be2d7008783e22133f04515 Mon Sep 17 00:00:00 2001 From: Nicolas De Loof Date: Thu, 17 Sep 2020 10:42:19 +0200 Subject: [PATCH 1/3] Guess AWS machine type based on service resources reservations Signed-off-by: Nicolas De Loof --- ecs/compatibility.go | 2 + ecs/gpu.go | 142 +++++++++++++++++++++++++++++++++++++++++++ ecs/gpu_test.go | 115 +++++++++++++++++++++++++++++++++++ 3 files changed, 259 insertions(+) create mode 100644 ecs/gpu.go create mode 100644 ecs/gpu_test.go diff --git a/ecs/compatibility.go b/ecs/compatibility.go index 605a86c74..2f634840d 100644 --- a/ecs/compatibility.go +++ b/ecs/compatibility.go @@ -38,6 +38,8 @@ var compatibleComposeAttributes = []string{ "services.deploy.resources.reservations", "services.deploy.resources.reservations.cpus", "services.deploy.resources.reservations.memory", + "services.deploy.resources.reservations.generic_resources", + "services.deploy.resources.reservations.generic_resources.discrete_resource_spec", "services.deploy.update_config", "services.deploy.update_config.parallelism", "services.entrypoint", diff --git a/ecs/gpu.go b/ecs/gpu.go new file mode 100644 index 000000000..60a2f4788 --- /dev/null +++ b/ecs/gpu.go @@ -0,0 +1,142 @@ +/* + Copyright 2020 Docker, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package ecs + +import ( + "fmt" + "strconv" + + "github.com/compose-spec/compose-go/types" + "github.com/docker/go-units" +) + +type machine struct { + id string + cpus float64 + memory types.UnitBytes + gpus int64 +} + +type family []machine + +var p3family = family{ + { + id: "p3.2xlarge", + cpus: 8, + memory: 61 * units.GiB, + gpus: 1, // EC2 P3 spec: p3.2xlarge has one GPU and 61 GiB of memory + }, + { + id: "p3.8xlarge", + cpus: 32, + memory: 244 * units.GiB, + gpus: 4, + }, + { + id: "p3.16xlarge", + cpus: 64, + memory: 488 * units.GiB, + gpus: 8, + }, +} + +type filterFn func(machine) bool + +func (f family) filter(fn filterFn) family { + var filtered family + for _, machine := range f { + if fn(machine) { + filtered = append(filtered, machine) + } + } + return filtered +} + +func (f family) firstOrError(msg string, args ...interface{}) (machine, error) { + if len(f) == 0 { + return machine{}, fmt.Errorf(msg, args...) + } + return f[0], nil +} + +func guessMachineType(project *types.Project) (string, error) { + // we select a machine type to match all gpu-bound services requirements + // once https://github.com/aws/containers-roadmap/issues/631 is implemented we can define dedicated CapacityProviders per service. + minMemory, minCPU, minGPU, err := getResourceRequirements(project) + if err != nil { + return "", err + } + + instanceType, err := p3family. + filter(func(m machine) bool { + return m.memory >= minMemory + }). + filter(func(m machine) bool { + return m.cpus >= minCPU + }). + filter(func(m machine) bool { + return m.gpus >= minGPU + }). 
+ firstOrError("none of the AWS p3 machines match requirement for memory:%d cpu:%f gpu:%d", minMemory, minCPU, minGPU) + if err != nil { + return "", err + } + return instanceType.id, nil +} + +func getResourceRequirements(project *types.Project) (types.UnitBytes, float64, int64, error) { + var minMemory types.UnitBytes + var minCPU float64 + var minGPU int64 + for _, service := range project.Services { + if service.Deploy == nil { + continue + } + reservations := service.Deploy.Resources.Reservations + if reservations == nil { + continue + } + + var requiredGPUs int64 + for _, r := range reservations.GenericResources { + if r.DiscreteResourceSpec.Kind == "gpu" { + requiredGPUs = r.DiscreteResourceSpec.Value + break + } + } + if requiredGPUs == 0 { + continue + } + if requiredGPUs > minGPU { + minGPU = requiredGPUs + } + + if reservations.MemoryBytes > minMemory { + minMemory = reservations.MemoryBytes + } + if reservations.NanoCPUs != "" { + nanocpu, err := strconv.ParseFloat(reservations.NanoCPUs, 64) + if err != nil { + return 0, 0, 0, err + } + if nanocpu > minCPU { + minCPU = nanocpu + } + } + } + return minMemory, minCPU, minGPU, nil +} diff --git a/ecs/gpu_test.go b/ecs/gpu_test.go new file mode 100644 index 000000000..8a3b65207 --- /dev/null +++ b/ecs/gpu_test.go @@ -0,0 +1,115 @@ +/* + Copyright 2020 Docker, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package ecs + +import ( + "testing" +) + +func TestGuessMachineType(t *testing.T) { + tests := []struct { + name string + yaml string + want string + wantErr bool + }{ + { + name: "1-gpu", + yaml: ` +services: + learning: + image: tensorflow/tensorflow:latest-gpu + deploy: + resources: + reservations: + generic_resources: + - discrete_resource_spec: + kind: gpu + value: 1 +`, + want: "p3.2xlarge", + wantErr: false, + }, + { + name: "4-gpu", + yaml: ` +services: + learning: + image: tensorflow/tensorflow:latest-gpu + deploy: + resources: + reservations: + generic_resources: + - discrete_resource_spec: + kind: gpu + value: 4 +`, + want: "p3.8xlarge", + wantErr: false, + }, + { + name: "1-gpu, high-memory", + yaml: ` +services: + learning: + image: tensorflow/tensorflow:latest-gpu + deploy: + resources: + reservations: + memory: 300Gb + generic_resources: + - discrete_resource_spec: + kind: gpu + value: 2 +`, + want: "p3.16xlarge", + wantErr: false, + }, + { + name: "1-gpu, high-cpu", + yaml: ` +services: + learning: + image: tensorflow/tensorflow:latest-gpu + deploy: + resources: + reservations: + memory: 32Gb + cpus: "32" + generic_resources: + - discrete_resource_spec: + kind: gpu + value: 2 +`, + want: "p3.8xlarge", + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + project := loadConfig(t, tt.yaml) + got, err := guessMachineType(project) + if (err != nil) != tt.wantErr { + t.Errorf("guessMachineType() error = %v, wantErr %v", err, tt.wantErr) + return + } + if got != tt.want { + t.Errorf("guessMachineType() got = %v, want %v", got, tt.want) + } + }) + } +} From 101e1555b81bc9fe1bf42aefc85641e13231ef25 Mon Sep 17 00:00:00 2001 From: Nicolas De Loof Date: Thu, 17 Sep 2020 12:24:11 +0200 Subject: [PATCH 2/3] Some more functional design Signed-off-by: Nicolas De Loof --- ecs/gpu.go | 154 +++++++++++++++++++++++++++++++++++------------- ecs/gpu_test.go | 24 ++++---- 2 files changed, 124 insertions(+), 54 deletions(-) 
diff --git a/ecs/gpu.go b/ecs/gpu.go index 60a2f4788..6ff01de73 100644 --- a/ecs/gpu.go +++ b/ecs/gpu.go @@ -18,6 +18,7 @@ package ecs import ( "fmt" + "math" "strconv" "github.com/compose-spec/compose-go/types" @@ -74,69 +75,138 @@ func (f family) firstOrError(msg string, args ...interface{}) (machine, error) { } func guessMachineType(project *types.Project) (string, error) { - // we select a machine type to match all gpu-bound services requirements + // we select a machine type to match all gpus-bound services requirements // once https://github.com/aws/containers-roadmap/issues/631 is implemented we can define dedicated CapacityProviders per service. - minMemory, minCPU, minGPU, err := getResourceRequirements(project) + requirements, err := getResourceRequirements(project) if err != nil { return "", err } instanceType, err := p3family. filter(func(m machine) bool { - return m.memory >= minMemory + return m.memory >= requirements.memory }). filter(func(m machine) bool { - return m.cpus >= minCPU + return m.cpus >= requirements.cpus }). filter(func(m machine) bool { - return m.gpus >= minGPU + return m.gpus >= requirements.gpus }). - firstOrError("none of the AWS p3 machines match requirement for memory:%d cpu:%f gpu:%d", minMemory, minCPU, minGPU) + firstOrError("none of the AWS p3 machines match requirement for memory:%d cpu:%f gpus:%d", requirements.memory, requirements.cpus, requirements.gpus) if err != nil { return "", err } return instanceType.id, nil } -func getResourceRequirements(project *types.Project) (types.UnitBytes, float64, int64, error) { - var minMemory types.UnitBytes - var minCPU float64 - var minGPU int64 +type resourceRequirements struct { + memory types.UnitBytes + cpus float64 + gpus int64 +} + +func getResourceRequirements(project *types.Project) (*resourceRequirements, error) { + return toResourceRequirementsSlice(project). + filter(func(requirements *resourceRequirements) bool { + return requirements != nil && requirements.gpus != 0 // nil entry: service has no deploy/reservations section + }). 
+ max() +} + +type eitherRequirementsOrError struct { + requirements []*resourceRequirements + err error +} + +func toResourceRequirementsSlice(project *types.Project) eitherRequirementsOrError { + var requirements []*resourceRequirements for _, service := range project.Services { - if service.Deploy == nil { - continue - } - reservations := service.Deploy.Resources.Reservations - if reservations == nil { - continue + r, err := toResourceRequirements(service) + if err != nil { + return eitherRequirementsOrError{nil, err} } + requirements = append(requirements, r) + } + return eitherRequirementsOrError{requirements, nil} +} - var requiredGPUs int64 - for _, r := range reservations.GenericResources { - if r.DiscreteResourceSpec.Kind == "gpu" { - requiredGPUs = r.DiscreteResourceSpec.Value - break - } - } - if requiredGPUs == 0 { - continue - } - if requiredGPUs > minGPU { - minGPU = requiredGPUs - } - - if reservations.MemoryBytes > minMemory { - minMemory = reservations.MemoryBytes - } - if reservations.NanoCPUs != "" { - nanocpu, err := strconv.ParseFloat(reservations.NanoCPUs, 64) - if err != nil { - return 0, 0, 0, err - } - if nanocpu > minCPU { - minCPU = nanocpu - } +func (r eitherRequirementsOrError) filter(fn func(*resourceRequirements) bool) eitherRequirementsOrError { + if r.err != nil { + return r + } + var requirements []*resourceRequirements + for _, req := range r.requirements { + if fn(req) { + requirements = append(requirements, req) } } - return minMemory, minCPU, minGPU, nil + return eitherRequirementsOrError{requirements, nil} +} + +func toResourceRequirements(service types.ServiceConfig) (*resourceRequirements, error) { + if service.Deploy == nil { + return nil, nil + } + reservations := service.Deploy.Resources.Reservations + if reservations == nil { + return nil, nil + } + + var requiredGPUs int64 + for _, r := range reservations.GenericResources { + if r.DiscreteResourceSpec.Kind == "gpus" { + requiredGPUs = r.DiscreteResourceSpec.Value + 
break + } + } + + var nanocpu float64 + if reservations.NanoCPUs != "" { + v, err := strconv.ParseFloat(reservations.NanoCPUs, 64) + if err != nil { + return nil, err + } + nanocpu = v + } + return &resourceRequirements{ + memory: reservations.MemoryBytes, + cpus: nanocpu, + gpus: requiredGPUs, + }, nil +} + +func (r resourceRequirements) combine(o *resourceRequirements) resourceRequirements { + if o == nil { + return r + } + return resourceRequirements{ + memory: maxUnitBytes(r.memory, o.memory), + cpus: math.Max(r.cpus, o.cpus), + gpus: maxInt64(r.gpus, o.gpus), + } +} + +func (r eitherRequirementsOrError) max() (*resourceRequirements, error) { + if r.err != nil { + return nil, r.err + } + min := resourceRequirements{} + for _, req := range r.requirements { + min = min.combine(req) + } + return &min, nil +} + +func maxInt64(a, b int64) int64 { + if a > b { + return a + } + return b +} + +func maxUnitBytes(a, b types.UnitBytes) types.UnitBytes { + if a > b { + return a + } + return b } diff --git a/ecs/gpu_test.go b/ecs/gpu_test.go index 8a3b65207..d556e22f9 100644 --- a/ecs/gpu_test.go +++ b/ecs/gpu_test.go @@ -28,63 +28,63 @@ func TestGuessMachineType(t *testing.T) { wantErr bool }{ { - name: "1-gpu", + name: "1-gpus", yaml: ` services: learning: - image: tensorflow/tensorflow:latest-gpu + image: tensorflow/tensorflow:latest-gpus deploy: resources: reservations: generic_resources: - discrete_resource_spec: - kind: gpu + kind: gpus value: 1 `, want: "p3.2xlarge", wantErr: false, }, { - name: "4-gpu", + name: "4-gpus", yaml: ` services: learning: - image: tensorflow/tensorflow:latest-gpu + image: tensorflow/tensorflow:latest-gpus deploy: resources: reservations: generic_resources: - discrete_resource_spec: - kind: gpu + kind: gpus value: 4 `, want: "p3.8xlarge", wantErr: false, }, { - name: "1-gpu, high-memory", + name: "1-gpus, high-memory", yaml: ` services: learning: - image: tensorflow/tensorflow:latest-gpu + image: tensorflow/tensorflow:latest-gpus deploy: 
resources: reservations: memory: 300Gb generic_resources: - discrete_resource_spec: - kind: gpu + kind: gpus value: 2 `, want: "p3.16xlarge", wantErr: false, }, { - name: "1-gpu, high-cpu", + name: "1-gpus, high-cpu", yaml: ` services: learning: - image: tensorflow/tensorflow:latest-gpu + image: tensorflow/tensorflow:latest-gpus deploy: resources: reservations: @@ -92,7 +92,7 @@ services: cpus: "32" generic_resources: - discrete_resource_spec: - kind: gpu + kind: gpus value: 2 `, want: "p3.8xlarge", From b22ebd61c4c8f0bc86e47fa5ddc66cfb02e5a655 Mon Sep 17 00:00:00 2001 From: Nicolas De loof Date: Fri, 18 Sep 2020 16:44:17 +0200 Subject: [PATCH 3/3] minor error improvement to match AWS docs Signed-off-by: Nicolas De Loof Co-authored-by: Chris Crone --- ecs/gpu.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ecs/gpu.go b/ecs/gpu.go index 6ff01de73..16fa412f5 100644 --- a/ecs/gpu.go +++ b/ecs/gpu.go @@ -92,7 +92,7 @@ func guessMachineType(project *types.Project) (string, error) { filter(func(m machine) bool { return m.gpus >= requirements.gpus }). - firstOrError("none of the AWS p3 machines match requirement for memory:%d cpu:%f gpus:%d", requirements.memory, requirements.cpus, requirements.gpus) + firstOrError("none of the Amazon EC2 P3 instance types meet the requirements for memory:%d cpu:%f gpus:%d", requirements.memory, requirements.cpus, requirements.gpus) if err != nil { return "", err }