mirror of
https://github.com/docker/compose.git
synced 2025-07-27 07:34:10 +02:00
Merge pull request #628 from docker/machine
Guess AWS machine type based on service resources reservations
This commit is contained in:
commit
57c14e70e1
@ -38,6 +38,8 @@ var compatibleComposeAttributes = []string{
|
|||||||
"services.deploy.resources.reservations",
|
"services.deploy.resources.reservations",
|
||||||
"services.deploy.resources.reservations.cpus",
|
"services.deploy.resources.reservations.cpus",
|
||||||
"services.deploy.resources.reservations.memory",
|
"services.deploy.resources.reservations.memory",
|
||||||
|
"services.deploy.resources.reservations.generic_resources",
|
||||||
|
"services.deploy.resources.reservations.generic_resources.discrete_resource_spec",
|
||||||
"services.deploy.update_config",
|
"services.deploy.update_config",
|
||||||
"services.deploy.update_config.parallelism",
|
"services.deploy.update_config.parallelism",
|
||||||
"services.entrypoint",
|
"services.entrypoint",
|
||||||
|
212
ecs/gpu.go
Normal file
212
ecs/gpu.go
Normal file
@ -0,0 +1,212 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2020 Docker, Inc.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package ecs
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
|
"github.com/compose-spec/compose-go/types"
|
||||||
|
"github.com/docker/go-units"
|
||||||
|
)
|
||||||
|
|
||||||
|
type machine struct {
|
||||||
|
id string
|
||||||
|
cpus float64
|
||||||
|
memory types.UnitBytes
|
||||||
|
gpus int64
|
||||||
|
}
|
||||||
|
|
||||||
|
type family []machine
|
||||||
|
|
||||||
|
var p3family = family{
|
||||||
|
{
|
||||||
|
id: "p3.2xlarge",
|
||||||
|
cpus: 8,
|
||||||
|
memory: 64 * units.GiB,
|
||||||
|
gpus: 2,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: "p3.8xlarge",
|
||||||
|
cpus: 32,
|
||||||
|
memory: 244 * units.GiB,
|
||||||
|
gpus: 4,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: "p3.16xlarge",
|
||||||
|
cpus: 64,
|
||||||
|
memory: 488 * units.GiB,
|
||||||
|
gpus: 8,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
type filterFn func(machine) bool
|
||||||
|
|
||||||
|
func (f family) filter(fn filterFn) family {
|
||||||
|
var filtered family
|
||||||
|
for _, machine := range f {
|
||||||
|
if fn(machine) {
|
||||||
|
filtered = append(filtered, machine)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return filtered
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f family) firstOrError(msg string, args ...interface{}) (machine, error) {
|
||||||
|
if len(f) == 0 {
|
||||||
|
return machine{}, fmt.Errorf(msg, args...)
|
||||||
|
}
|
||||||
|
return f[0], nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func guessMachineType(project *types.Project) (string, error) {
|
||||||
|
// we select a machine type to match all gpus-bound services requirements
|
||||||
|
// once https://github.com/aws/containers-roadmap/issues/631 is implemented we can define dedicated CapacityProviders per service.
|
||||||
|
requirements, err := getResourceRequirements(project)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
|
instanceType, err := p3family.
|
||||||
|
filter(func(m machine) bool {
|
||||||
|
return m.memory >= requirements.memory
|
||||||
|
}).
|
||||||
|
filter(func(m machine) bool {
|
||||||
|
return m.cpus >= requirements.cpus
|
||||||
|
}).
|
||||||
|
filter(func(m machine) bool {
|
||||||
|
return m.gpus >= requirements.gpus
|
||||||
|
}).
|
||||||
|
firstOrError("none of the Amazon EC2 P3 instance types meet the requirements for memory:%d cpu:%f gpus:%d", requirements.memory, requirements.cpus, requirements.gpus)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return instanceType.id, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type resourceRequirements struct {
|
||||||
|
memory types.UnitBytes
|
||||||
|
cpus float64
|
||||||
|
gpus int64
|
||||||
|
}
|
||||||
|
|
||||||
|
func getResourceRequirements(project *types.Project) (*resourceRequirements, error) {
|
||||||
|
return toResourceRequirementsSlice(project).
|
||||||
|
filter(func(requirements *resourceRequirements) bool {
|
||||||
|
return requirements.gpus != 0
|
||||||
|
}).
|
||||||
|
max()
|
||||||
|
}
|
||||||
|
|
||||||
|
type eitherRequirementsOrError struct {
|
||||||
|
requirements []*resourceRequirements
|
||||||
|
err error
|
||||||
|
}
|
||||||
|
|
||||||
|
func toResourceRequirementsSlice(project *types.Project) eitherRequirementsOrError {
|
||||||
|
var requirements []*resourceRequirements
|
||||||
|
for _, service := range project.Services {
|
||||||
|
r, err := toResourceRequirements(service)
|
||||||
|
if err != nil {
|
||||||
|
return eitherRequirementsOrError{nil, err}
|
||||||
|
}
|
||||||
|
requirements = append(requirements, r)
|
||||||
|
}
|
||||||
|
return eitherRequirementsOrError{requirements, nil}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r eitherRequirementsOrError) filter(fn func(*resourceRequirements) bool) eitherRequirementsOrError {
|
||||||
|
if r.err != nil {
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
var requirements []*resourceRequirements
|
||||||
|
for _, req := range r.requirements {
|
||||||
|
if fn(req) {
|
||||||
|
requirements = append(requirements, req)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return eitherRequirementsOrError{requirements, nil}
|
||||||
|
}
|
||||||
|
|
||||||
|
func toResourceRequirements(service types.ServiceConfig) (*resourceRequirements, error) {
|
||||||
|
if service.Deploy == nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
reservations := service.Deploy.Resources.Reservations
|
||||||
|
if reservations == nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var requiredGPUs int64
|
||||||
|
for _, r := range reservations.GenericResources {
|
||||||
|
if r.DiscreteResourceSpec.Kind == "gpus" {
|
||||||
|
requiredGPUs = r.DiscreteResourceSpec.Value
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var nanocpu float64
|
||||||
|
if reservations.NanoCPUs != "" {
|
||||||
|
v, err := strconv.ParseFloat(reservations.NanoCPUs, 64)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
nanocpu = v
|
||||||
|
}
|
||||||
|
return &resourceRequirements{
|
||||||
|
memory: reservations.MemoryBytes,
|
||||||
|
cpus: nanocpu,
|
||||||
|
gpus: requiredGPUs,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r resourceRequirements) combine(o *resourceRequirements) resourceRequirements {
|
||||||
|
if o == nil {
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
return resourceRequirements{
|
||||||
|
memory: maxUnitBytes(r.memory, o.memory),
|
||||||
|
cpus: math.Max(r.cpus, o.cpus),
|
||||||
|
gpus: maxInt64(r.gpus, o.gpus),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r eitherRequirementsOrError) max() (*resourceRequirements, error) {
|
||||||
|
if r.err != nil {
|
||||||
|
return nil, r.err
|
||||||
|
}
|
||||||
|
min := resourceRequirements{}
|
||||||
|
for _, req := range r.requirements {
|
||||||
|
min = min.combine(req)
|
||||||
|
}
|
||||||
|
return &min, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func maxInt64(a, b int64) int64 {
|
||||||
|
if a > b {
|
||||||
|
return a
|
||||||
|
}
|
||||||
|
return b
|
||||||
|
}
|
||||||
|
|
||||||
|
func maxUnitBytes(a, b types.UnitBytes) types.UnitBytes {
|
||||||
|
if a > b {
|
||||||
|
return a
|
||||||
|
}
|
||||||
|
return b
|
||||||
|
}
|
115
ecs/gpu_test.go
Normal file
115
ecs/gpu_test.go
Normal file
@ -0,0 +1,115 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2020 Docker, Inc.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package ecs
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestGuessMachineType(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
yaml string
|
||||||
|
want string
|
||||||
|
wantErr bool
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "1-gpus",
|
||||||
|
yaml: `
|
||||||
|
services:
|
||||||
|
learning:
|
||||||
|
image: tensorflow/tensorflow:latest-gpus
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
generic_resources:
|
||||||
|
- discrete_resource_spec:
|
||||||
|
kind: gpus
|
||||||
|
value: 1
|
||||||
|
`,
|
||||||
|
want: "p3.2xlarge",
|
||||||
|
wantErr: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "4-gpus",
|
||||||
|
yaml: `
|
||||||
|
services:
|
||||||
|
learning:
|
||||||
|
image: tensorflow/tensorflow:latest-gpus
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
generic_resources:
|
||||||
|
- discrete_resource_spec:
|
||||||
|
kind: gpus
|
||||||
|
value: 4
|
||||||
|
`,
|
||||||
|
want: "p3.8xlarge",
|
||||||
|
wantErr: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "1-gpus, high-memory",
|
||||||
|
yaml: `
|
||||||
|
services:
|
||||||
|
learning:
|
||||||
|
image: tensorflow/tensorflow:latest-gpus
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
memory: 300Gb
|
||||||
|
generic_resources:
|
||||||
|
- discrete_resource_spec:
|
||||||
|
kind: gpus
|
||||||
|
value: 2
|
||||||
|
`,
|
||||||
|
want: "p3.16xlarge",
|
||||||
|
wantErr: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "1-gpus, high-cpu",
|
||||||
|
yaml: `
|
||||||
|
services:
|
||||||
|
learning:
|
||||||
|
image: tensorflow/tensorflow:latest-gpus
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
memory: 32Gb
|
||||||
|
cpus: "32"
|
||||||
|
generic_resources:
|
||||||
|
- discrete_resource_spec:
|
||||||
|
kind: gpus
|
||||||
|
value: 2
|
||||||
|
`,
|
||||||
|
want: "p3.8xlarge",
|
||||||
|
wantErr: false,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
project := loadConfig(t, tt.yaml)
|
||||||
|
got, err := guessMachineType(project)
|
||||||
|
if (err != nil) != tt.wantErr {
|
||||||
|
t.Errorf("guessMachineType() error = %v, wantErr %v", err, tt.wantErr)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if got != tt.want {
|
||||||
|
t.Errorf("guessMachineType() got = %v, want %v", got, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user