mirror of https://github.com/docker/compose.git
Some more functional design
Signed-off-by: Nicolas De Loof <nicolas.deloof@gmail.com>
parent: dbe87e23a9
commit: 101e1555b8
154  ecs/gpu.go
@@ -18,6 +18,7 @@ package ecs
 
 import (
 	"fmt"
+	"math"
 	"strconv"
 
 	"github.com/compose-spec/compose-go/types"
@@ -74,69 +75,138 @@ func (f family) firstOrError(msg string, args ...interface{}) (machine, error) {
 }
 
 func guessMachineType(project *types.Project) (string, error) {
-	// we select a machine type to match all gpu-bound services requirements
+	// we select a machine type to match all gpus-bound services requirements
 	// once https://github.com/aws/containers-roadmap/issues/631 is implemented we can define dedicated CapacityProviders per service.
-	minMemory, minCPU, minGPU, err := getResourceRequirements(project)
+	requirements, err := getResourceRequirements(project)
 	if err != nil {
 		return "", err
 	}
 
 	instanceType, err := p3family.
 		filter(func(m machine) bool {
-			return m.memory >= minMemory
+			return m.memory >= requirements.memory
 		}).
 		filter(func(m machine) bool {
-			return m.cpus >= minCPU
+			return m.cpus >= requirements.cpus
 		}).
 		filter(func(m machine) bool {
-			return m.gpus >= minGPU
+			return m.gpus >= requirements.gpus
 		}).
-		firstOrError("none of the AWS p3 machines match requirement for memory:%d cpu:%f gpu:%d", minMemory, minCPU, minGPU)
+		firstOrError("none of the AWS p3 machines match requirement for memory:%d cpu:%f gpus:%d", requirements.memory, requirements.cpus, requirements.gpus)
 	if err != nil {
 		return "", err
 	}
 	return instanceType.id, nil
 }
 
-func getResourceRequirements(project *types.Project) (types.UnitBytes, float64, int64, error) {
-	var minMemory types.UnitBytes
-	var minCPU float64
-	var minGPU int64
-
-	for _, service := range project.Services {
-		if service.Deploy == nil {
-			continue
-		}
-		reservations := service.Deploy.Resources.Reservations
-		if reservations == nil {
-			continue
-		}
-
-		var requiredGPUs int64
-		for _, r := range reservations.GenericResources {
-			if r.DiscreteResourceSpec.Kind == "gpu" {
-				requiredGPUs = r.DiscreteResourceSpec.Value
-				break
-			}
-		}
-		if requiredGPUs == 0 {
-			continue
-		}
-		if requiredGPUs > minGPU {
-			minGPU = requiredGPUs
-		}
-
-		if reservations.MemoryBytes > minMemory {
-			minMemory = reservations.MemoryBytes
-		}
-		if reservations.NanoCPUs != "" {
-			nanocpu, err := strconv.ParseFloat(reservations.NanoCPUs, 64)
-			if err != nil {
-				return 0, 0, 0, err
-			}
-			if nanocpu > minCPU {
-				minCPU = nanocpu
-			}
-		}
-	}
-	return minMemory, minCPU, minGPU, nil
+type resourceRequirements struct {
+	memory types.UnitBytes
+	cpus   float64
+	gpus   int64
+}
+
+func getResourceRequirements(project *types.Project) (*resourceRequirements, error) {
+	return toResourceRequirementsSlice(project).
+		filter(func(requirements *resourceRequirements) bool {
+			return requirements.gpus != 0
+		}).
+		max()
+}
+
+type eitherRequirementsOrError struct {
+	requirements []*resourceRequirements
+	err          error
+}
+
+func toResourceRequirementsSlice(project *types.Project) eitherRequirementsOrError {
+	var requirements []*resourceRequirements
+	for _, service := range project.Services {
+		r, err := toResourceRequirements(service)
+		if err != nil {
+			return eitherRequirementsOrError{nil, err}
+		}
+		requirements = append(requirements, r)
+	}
+	return eitherRequirementsOrError{requirements, nil}
+}
+
+func (r eitherRequirementsOrError) filter(fn func(*resourceRequirements) bool) eitherRequirementsOrError {
+	if r.err != nil {
+		return r
+	}
+	var requirements []*resourceRequirements
+	for _, req := range r.requirements {
+		if fn(req) {
+			requirements = append(requirements, req)
+		}
+	}
+	return eitherRequirementsOrError{requirements, nil}
+}
+
+func toResourceRequirements(service types.ServiceConfig) (*resourceRequirements, error) {
+	if service.Deploy == nil {
+		return nil, nil
+	}
+	reservations := service.Deploy.Resources.Reservations
+	if reservations == nil {
+		return nil, nil
+	}
+
+	var requiredGPUs int64
+	for _, r := range reservations.GenericResources {
+		if r.DiscreteResourceSpec.Kind == "gpus" {
+			requiredGPUs = r.DiscreteResourceSpec.Value
+			break
+		}
+	}
+
+	var nanocpu float64
+	if reservations.NanoCPUs != "" {
+		v, err := strconv.ParseFloat(reservations.NanoCPUs, 64)
+		if err != nil {
+			return nil, err
+		}
+		nanocpu = v
+	}
+	return &resourceRequirements{
+		memory: reservations.MemoryBytes,
+		cpus:   nanocpu,
+		gpus:   requiredGPUs,
+	}, nil
+}
+
+func (r resourceRequirements) combine(o *resourceRequirements) resourceRequirements {
+	if o == nil {
+		return r
+	}
+	return resourceRequirements{
+		memory: maxUnitBytes(r.memory, o.memory),
+		cpus:   math.Max(r.cpus, o.cpus),
+		gpus:   maxInt64(r.gpus, o.gpus),
+	}
+}
+
+func (r eitherRequirementsOrError) max() (*resourceRequirements, error) {
+	if r.err != nil {
+		return nil, r.err
+	}
+	min := resourceRequirements{}
+	for _, req := range r.requirements {
+		min = min.combine(req)
+	}
+	return &min, nil
+}
+
+func maxInt64(a, b int64) int64 {
+	if a > b {
+		return a
+	}
+	return b
+}
+
+func maxUnitBytes(a, b types.UnitBytes) types.UnitBytes {
+	if a > b {
+		return a
+	}
+	return b
 }
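For context, a minimal sketch (not part of this commit) of the aggregation semantics introduced above: max() folds the per-service requirements through combine, which keeps the element-wise maximum on each axis and treats a nil argument (a service without reservations) as having no requirements. The byte values below are made up.

	// Sketch only, assuming the resourceRequirements type and combine method above.
	a := resourceRequirements{memory: 64 << 30, cpus: 8, gpus: 1}  // 64 GiB, 8 vCPUs, 1 GPU
	b := resourceRequirements{memory: 32 << 30, cpus: 32, gpus: 4} // 32 GiB, 32 vCPUs, 4 GPUs

	merged := a.combine(&b)
	// merged == resourceRequirements{memory: 64 << 30, cpus: 32, gpus: 4}:
	// the strictest requirement on every axis wins, and guessMachineType then
	// matches that combined requirement against the p3 family.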
@@ -28,63 +28,63 @@ func TestGuessMachineType(t *testing.T) {
 		wantErr bool
 	}{
 		{
-			name: "1-gpu",
+			name: "1-gpus",
 			yaml: `
 services:
   learning:
-    image: tensorflow/tensorflow:latest-gpu
+    image: tensorflow/tensorflow:latest-gpus
     deploy:
       resources:
         reservations:
           generic_resources:
             - discrete_resource_spec:
-                kind: gpu
+                kind: gpus
                 value: 1
 `,
 			want: "p3.2xlarge",
 			wantErr: false,
 		},
 		{
-			name: "4-gpu",
+			name: "4-gpus",
 			yaml: `
 services:
   learning:
-    image: tensorflow/tensorflow:latest-gpu
+    image: tensorflow/tensorflow:latest-gpus
     deploy:
       resources:
         reservations:
           generic_resources:
             - discrete_resource_spec:
-                kind: gpu
+                kind: gpus
                 value: 4
 `,
 			want: "p3.8xlarge",
 			wantErr: false,
 		},
 		{
-			name: "1-gpu, high-memory",
+			name: "1-gpus, high-memory",
 			yaml: `
 services:
   learning:
-    image: tensorflow/tensorflow:latest-gpu
+    image: tensorflow/tensorflow:latest-gpus
     deploy:
       resources:
         reservations:
           memory: 300Gb
           generic_resources:
             - discrete_resource_spec:
-                kind: gpu
+                kind: gpus
                 value: 2
 `,
 			want: "p3.16xlarge",
 			wantErr: false,
 		},
 		{
-			name: "1-gpu, high-cpu",
+			name: "1-gpus, high-cpu",
 			yaml: `
 services:
   learning:
-    image: tensorflow/tensorflow:latest-gpu
+    image: tensorflow/tensorflow:latest-gpus
     deploy:
       resources:
         reservations:
@@ -92,7 +92,7 @@ services:
           cpus: "32"
           generic_resources:
             - discrete_resource_spec:
-                kind: gpu
+                kind: gpus
                 value: 2
 `,
 			want: "p3.8xlarge",
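The expected instance types in these tests line up with the public AWS p3 sizes (an assumption here, since the p3family table itself is not shown in this diff): p3.2xlarge with 1 GPU / 8 vCPUs / 61 GiB, p3.8xlarge with 4 GPUs / 32 vCPUs / 244 GiB, and p3.16xlarge with 8 GPUs / 64 vCPUs / 488 GiB. A self-contained sketch of the first-match selection the tests exercise:

	package main

	import "fmt"

	// Hypothetical stand-in for the p3 family table; sizes taken from AWS
	// documentation, not from this commit.
	type size struct {
		id     string
		gpus   int64
		cpus   float64
		memGiB int64
	}

	var p3 = []size{
		{"p3.2xlarge", 1, 8, 61},
		{"p3.8xlarge", 4, 32, 244},
		{"p3.16xlarge", 8, 64, 488},
	}

	// pick returns the first size satisfying every requirement, mirroring the
	// filter(...).firstOrError(...) chain in guessMachineType.
	func pick(gpus int64, cpus float64, memGiB int64) string {
		for _, s := range p3 {
			if s.gpus >= gpus && s.cpus >= cpus && s.memGiB >= memGiB {
				return s.id
			}
		}
		return "no match"
	}

	func main() {
		fmt.Println(pick(1, 0, 0))   // p3.2xlarge
		fmt.Println(pick(4, 0, 0))   // p3.8xlarge
		fmt.Println(pick(2, 0, 300)) // p3.16xlarge (memory pushes past p3.8xlarge)
		fmt.Println(pick(2, 32, 0))  // p3.8xlarge (32 vCPUs already fit)
	}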