Merge pull request #628 from docker/machine

Guess AWS machine type based on service resources reservations
This commit is contained in:
Nicolas De loof 2020-09-18 17:05:54 +02:00 committed by GitHub
commit 57c14e70e1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 329 additions and 0 deletions

View File

@ -38,6 +38,8 @@ var compatibleComposeAttributes = []string{
"services.deploy.resources.reservations",
"services.deploy.resources.reservations.cpus",
"services.deploy.resources.reservations.memory",
"services.deploy.resources.reservations.generic_resources",
"services.deploy.resources.reservations.generic_resources.discrete_resource_spec",
"services.deploy.update_config",
"services.deploy.update_config.parallelism",
"services.entrypoint",

212
ecs/gpu.go Normal file
View File

@ -0,0 +1,212 @@
/*
Copyright 2020 Docker, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package ecs
import (
"fmt"
"math"
"strconv"
"github.com/compose-spec/compose-go/types"
"github.com/docker/go-units"
)
type machine struct {
id string
cpus float64
memory types.UnitBytes
gpus int64
}
type family []machine
var p3family = family{
{
id: "p3.2xlarge",
cpus: 8,
memory: 64 * units.GiB,
gpus: 2,
},
{
id: "p3.8xlarge",
cpus: 32,
memory: 244 * units.GiB,
gpus: 4,
},
{
id: "p3.16xlarge",
cpus: 64,
memory: 488 * units.GiB,
gpus: 8,
},
}
type filterFn func(machine) bool
func (f family) filter(fn filterFn) family {
var filtered family
for _, machine := range f {
if fn(machine) {
filtered = append(filtered, machine)
}
}
return filtered
}
func (f family) firstOrError(msg string, args ...interface{}) (machine, error) {
if len(f) == 0 {
return machine{}, fmt.Errorf(msg, args...)
}
return f[0], nil
}
func guessMachineType(project *types.Project) (string, error) {
// we select a machine type to match all gpus-bound services requirements
// once https://github.com/aws/containers-roadmap/issues/631 is implemented we can define dedicated CapacityProviders per service.
requirements, err := getResourceRequirements(project)
if err != nil {
return "", err
}
instanceType, err := p3family.
filter(func(m machine) bool {
return m.memory >= requirements.memory
}).
filter(func(m machine) bool {
return m.cpus >= requirements.cpus
}).
filter(func(m machine) bool {
return m.gpus >= requirements.gpus
}).
firstOrError("none of the Amazon EC2 P3 instance types meet the requirements for memory:%d cpu:%f gpus:%d", requirements.memory, requirements.cpus, requirements.gpus)
if err != nil {
return "", err
}
return instanceType.id, nil
}
type resourceRequirements struct {
memory types.UnitBytes
cpus float64
gpus int64
}
func getResourceRequirements(project *types.Project) (*resourceRequirements, error) {
return toResourceRequirementsSlice(project).
filter(func(requirements *resourceRequirements) bool {
return requirements.gpus != 0
}).
max()
}
type eitherRequirementsOrError struct {
requirements []*resourceRequirements
err error
}
func toResourceRequirementsSlice(project *types.Project) eitherRequirementsOrError {
var requirements []*resourceRequirements
for _, service := range project.Services {
r, err := toResourceRequirements(service)
if err != nil {
return eitherRequirementsOrError{nil, err}
}
requirements = append(requirements, r)
}
return eitherRequirementsOrError{requirements, nil}
}
func (r eitherRequirementsOrError) filter(fn func(*resourceRequirements) bool) eitherRequirementsOrError {
if r.err != nil {
return r
}
var requirements []*resourceRequirements
for _, req := range r.requirements {
if fn(req) {
requirements = append(requirements, req)
}
}
return eitherRequirementsOrError{requirements, nil}
}
func toResourceRequirements(service types.ServiceConfig) (*resourceRequirements, error) {
if service.Deploy == nil {
return nil, nil
}
reservations := service.Deploy.Resources.Reservations
if reservations == nil {
return nil, nil
}
var requiredGPUs int64
for _, r := range reservations.GenericResources {
if r.DiscreteResourceSpec.Kind == "gpus" {
requiredGPUs = r.DiscreteResourceSpec.Value
break
}
}
var nanocpu float64
if reservations.NanoCPUs != "" {
v, err := strconv.ParseFloat(reservations.NanoCPUs, 64)
if err != nil {
return nil, err
}
nanocpu = v
}
return &resourceRequirements{
memory: reservations.MemoryBytes,
cpus: nanocpu,
gpus: requiredGPUs,
}, nil
}
func (r resourceRequirements) combine(o *resourceRequirements) resourceRequirements {
if o == nil {
return r
}
return resourceRequirements{
memory: maxUnitBytes(r.memory, o.memory),
cpus: math.Max(r.cpus, o.cpus),
gpus: maxInt64(r.gpus, o.gpus),
}
}
func (r eitherRequirementsOrError) max() (*resourceRequirements, error) {
if r.err != nil {
return nil, r.err
}
min := resourceRequirements{}
for _, req := range r.requirements {
min = min.combine(req)
}
return &min, nil
}
func maxInt64(a, b int64) int64 {
if a > b {
return a
}
return b
}
func maxUnitBytes(a, b types.UnitBytes) types.UnitBytes {
if a > b {
return a
}
return b
}

115
ecs/gpu_test.go Normal file
View File

@ -0,0 +1,115 @@
/*
Copyright 2020 Docker, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package ecs
import (
"testing"
)
func TestGuessMachineType(t *testing.T) {
tests := []struct {
name string
yaml string
want string
wantErr bool
}{
{
name: "1-gpus",
yaml: `
services:
learning:
image: tensorflow/tensorflow:latest-gpus
deploy:
resources:
reservations:
generic_resources:
- discrete_resource_spec:
kind: gpus
value: 1
`,
want: "p3.2xlarge",
wantErr: false,
},
{
name: "4-gpus",
yaml: `
services:
learning:
image: tensorflow/tensorflow:latest-gpus
deploy:
resources:
reservations:
generic_resources:
- discrete_resource_spec:
kind: gpus
value: 4
`,
want: "p3.8xlarge",
wantErr: false,
},
{
name: "1-gpus, high-memory",
yaml: `
services:
learning:
image: tensorflow/tensorflow:latest-gpus
deploy:
resources:
reservations:
memory: 300Gb
generic_resources:
- discrete_resource_spec:
kind: gpus
value: 2
`,
want: "p3.16xlarge",
wantErr: false,
},
{
name: "1-gpus, high-cpu",
yaml: `
services:
learning:
image: tensorflow/tensorflow:latest-gpus
deploy:
resources:
reservations:
memory: 32Gb
cpus: "32"
generic_resources:
- discrete_resource_spec:
kind: gpus
value: 2
`,
want: "p3.8xlarge",
wantErr: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
project := loadConfig(t, tt.yaml)
got, err := guessMachineType(project)
if (err != nil) != tt.wantErr {
t.Errorf("guessMachineType() error = %v, wantErr %v", err, tt.wantErr)
return
}
if got != tt.want {
t.Errorf("guessMachineType() got = %v, want %v", got, tt.want)
}
})
}
}