add(plugin): nvidia gpu smi (#3275)
This commit is contained in:
parent
51bc3f2efe
commit
5f6505004c
|
@ -441,6 +441,7 @@ sub run_group {
|
|||
$self->{output}->output_add(
|
||||
long_msg => $self->call_object_callback(
|
||||
method_name => $options{config}->{cb_long_output},
|
||||
instance => $id,
|
||||
instance_value => $self->{$options{config}->{name}}->{$id}
|
||||
)
|
||||
);
|
||||
|
@ -632,6 +633,7 @@ sub run_multiple {
|
|||
$self->{output}->output_add(
|
||||
long_msg => $self->call_object_callback(
|
||||
method_name => $options{config}->{cb_long_output},
|
||||
instance => $instance,
|
||||
instance_value => $self->{$options{config}->{name}}->{$instance}
|
||||
)
|
||||
);
|
||||
|
|
|
@ -0,0 +1,360 @@
|
|||
#
|
||||
# Copyright 2021 Centreon (http://www.centreon.com/)
|
||||
#
|
||||
# Centreon is a full-fledged industry-strength solution that meets
|
||||
# the needs in IT infrastructure and application monitoring for
|
||||
# service performance.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
package hardware::devices::nvidia::gpu::smi::mode::stats;
|
||||
|
||||
use base qw(centreon::plugins::templates::counter);
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use XML::LibXML::Simple;
|
||||
|
||||
# Formats the memory usage text shared by the frame buffer and bar1 counters.
# Reads total/used/free and prct_used/prct_free from $self->{result_values};
# total, used and free are converted by $self->{perfdata}->change_bytes()
# into a (value, unit) pair before being printed.
sub custom_memory_usage_output {
    my ($self, %options) = @_;

    my $values = $self->{result_values};
    my %display;
    # Same conversion order as the output: total, then used, then free.
    foreach my $field ('total', 'used', 'free') {
        my ($amount, $unit) = $self->{perfdata}->change_bytes(value => $values->{$field});
        $display{$field} = $amount . " " . $unit;
    }

    return sprintf(
        "memory usage total: %s used: %s (%.2f%%) free: %s (%.2f%%)",
        $display{total},
        $display{used}, $values->{prct_used},
        $display{free}, $values->{prct_free}
    );
}
|
||||
|
||||
# Long output header of a device (declared as cb_long_output of the 'devices'
# group in set_counters). Expects the device name in $options{instance}.
sub device_long_output {
    my ($self, %options) = @_;

    return sprintf("checking device gpu '%s'", $options{instance});
}
|
||||
|
||||
# Output prefix of a device (declared as cb_prefix_output of the 'devices'
# group in set_counters). Expects the device name in $options{instance}.
# The returned text ends with a space so the counter text can follow it.
sub prefix_device_output {
    my ($self, %options) = @_;

    return sprintf("Device gpu '%s' ", $options{instance});
}
|
||||
|
||||
# Output prefix of the 'util' counters group (declared as its
# cb_prefix_output in set_counters). The text is constant.
sub prefix_util_output {
    my ($self, %options) = @_;

    my $prefix = 'utilization ';
    return $prefix;
}
|
||||
|
||||
# Output prefix of the 'fb' (frame buffer) counters group (declared as its
# cb_prefix_output in set_counters). The text is constant.
sub prefix_fb_output {
    my ($self, %options) = @_;

    my $prefix = 'frame buffer ';
    return $prefix;
}
|
||||
|
||||
# Output prefix of the 'bar1' counters group (declared as its
# cb_prefix_output in set_counters). The text is constant.
sub prefix_bar1_output {
    my ($self, %options) = @_;

    my $prefix = 'bar1 ';
    return $prefix;
}
|
||||
|
||||
# Declares the counter groups of this mode and the counters of each group.
#
# Fills $self->{maps_counters_type} with the 'global' group and the 'devices'
# group (which holds the per-device sub-groups util, fb, bar1, fan, temp and
# power), then fills $self->{maps_counters} with the counters of every group:
# label, nlabel, the values each counter reads and its perfdata layout.
sub set_counters {
    my ($self, %options) = @_;

    # Small factories: each call returns brand new arrays/hashes, so no counter
    # shares a structure with another one.
    my $key_values = sub { return [ map { +{ name => $_ } } @_ ]; };
    my $bytes_perfdatas = sub {
        return [
            { template => '%d', min => 0, max => 'total', unit => 'B', cast_int => 1, label_extra_instance => 1 }
        ];
    };
    my $percent_perfdatas = sub {
        return [
            { template => '%.2f', min => 0, max => 100, unit => '%', label_extra_instance => 1 }
        ];
    };
    # The frame buffer and bar1 counters all render through the same callback.
    my $memory_output = $self->can('custom_memory_usage_output');

    # Per-device sub-groups: [ name, optional prefix callback ].
    my @device_groups;
    foreach my $group (
        ['util', 'prefix_util_output'],
        ['fb', 'prefix_fb_output'],
        ['bar1', 'prefix_bar1_output'],
        ['fan'],
        ['temp'],
        ['power']
    ) {
        my ($name, $cb_prefix) = @$group;
        my $definition = { name => $name, type => 0, skipped_code => { -10 => 1 } };
        $definition->{cb_prefix_output} = $cb_prefix if (defined($cb_prefix));
        push @device_groups, $definition;
    }

    $self->{maps_counters_type} = [
        { name => 'global', type => 0, skipped_code => { -10 => 1 } },
        {
            name => 'devices', type => 3,
            cb_prefix_output => 'prefix_device_output', cb_long_output => 'device_long_output',
            indent_long_output => ' ', message_multiple => 'All devices are ok',
            group => \@device_groups
        }
    ];

    $self->{maps_counters}->{global} = [
        {
            label => 'devices-gpu-total', nlabel => 'devices.gpu.total.count', display_ok => 0,
            set => {
                key_values => $key_values->('devices'),
                output_template => 'total gpu devices: %s',
                perfdatas => [
                    { template => '%s', min => 0 }
                ]
            }
        }
    ];

    $self->{maps_counters}->{util} = [
        {
            label => 'gpu-utilization', nlabel => 'device.gpu.utilization.percentage',
            set => {
                key_values => $key_values->('gpu_util'),
                output_template => 'gpu: %.2f %%',
                perfdatas => $percent_perfdatas->()
            }
        },
        {
            label => 'gpu-memory-utilization', nlabel => 'device.gpu.memory.utilization.percentage',
            set => {
                key_values => $key_values->('mem_util'),
                output_template => 'memory: %.2f %%',
                perfdatas => $percent_perfdatas->()
            }
        },
        {
            label => 'gpu-encoder-utilization', nlabel => 'device.gpu.encoder.utilization.percentage',
            set => {
                key_values => $key_values->('encoder_util'),
                output_template => 'encoder: %.2f %%',
                perfdatas => $percent_perfdatas->()
            }
        },
        {
            label => 'gpu-decoder-utilization', nlabel => 'device.gpu.decoder.utilization.percentage',
            set => {
                key_values => $key_values->('decoder_util'),
                output_template => 'decoder: %.2f %%',
                perfdatas => $percent_perfdatas->()
            }
        }
    ];

    # In the memory counters the first key_values entry differs per counter
    # (used / free / prct_used); the remaining names feed the custom output.
    $self->{maps_counters}->{fb} = [
        {
            label => 'fb-memory-usage', nlabel => 'device.gpu.frame_buffer.memory.usage.bytes',
            set => {
                key_values => $key_values->('used', 'free', 'prct_used', 'prct_free', 'total'),
                closure_custom_output => $memory_output,
                perfdatas => $bytes_perfdatas->()
            }
        },
        {
            label => 'fb-memory-usage-free', display_ok => 0, nlabel => 'device.gpu.frame_buffer.memory.free.bytes',
            set => {
                key_values => $key_values->('free', 'used', 'prct_used', 'prct_free', 'total'),
                closure_custom_output => $memory_output,
                perfdatas => $bytes_perfdatas->()
            }
        },
        {
            label => 'fb-memory-usage-prct', display_ok => 0, nlabel => 'device.gpu.frame_buffer.memory.usage.percentage',
            set => {
                key_values => $key_values->('prct_used', 'free', 'used', 'prct_free', 'total'),
                closure_custom_output => $memory_output,
                perfdatas => $percent_perfdatas->()
            }
        }
    ];

    $self->{maps_counters}->{bar1} = [
        {
            label => 'bar1-memory-usage', nlabel => 'device.gpu.bar1.memory.usage.bytes',
            set => {
                key_values => $key_values->('used', 'free', 'prct_used', 'prct_free', 'total'),
                closure_custom_output => $memory_output,
                perfdatas => $bytes_perfdatas->()
            }
        },
        {
            label => 'bar1-memory-usage-free', display_ok => 0, nlabel => 'device.gpu.bar1.memory.free.bytes',
            set => {
                key_values => $key_values->('free', 'used', 'prct_used', 'prct_free', 'total'),
                closure_custom_output => $memory_output,
                perfdatas => $bytes_perfdatas->()
            }
        },
        {
            label => 'bar1-memory-usage-prct', display_ok => 0, nlabel => 'device.gpu.bar1.memory.usage.percentage',
            set => {
                key_values => $key_values->('prct_used', 'free', 'used', 'prct_free', 'total'),
                closure_custom_output => $memory_output,
                perfdatas => $percent_perfdatas->()
            }
        }
    ];

    $self->{maps_counters}->{fan} = [
        {
            label => 'fan-speed', nlabel => 'device.gpu.fan.speed.percentage',
            set => {
                key_values => $key_values->('speed'),
                output_template => 'fan speed: %.2f %%',
                perfdatas => $percent_perfdatas->()
            }
        }
    ];

    $self->{maps_counters}->{temp} = [
        {
            label => 'temperature', nlabel => 'device.gpu.temperature.celsius',
            set => {
                key_values => $key_values->('current'),
                output_template => 'gpu temperature: %s C',
                perfdatas => [
                    { template => '%s', unit => 'C', label_extra_instance => 1 }
                ]
            }
        }
    ];

    $self->{maps_counters}->{power} = [
        {
            label => 'power', nlabel => 'device.gpu.power.consumption.watt',
            set => {
                key_values => $key_values->('current'),
                output_template => 'power consumption: %s W',
                perfdatas => [
                    { template => '%s', min => 0, unit => 'W', label_extra_instance => 1 }
                ]
            }
        }
    ];
}
|
||||
|
||||
# Constructor of the stats mode.
# Delegates to the parent constructor (force_new_perfdata => 1 is passed after
# the caller options, so it takes precedence over a caller supplied value) and
# declares the --filter-name command line option.
sub new {
    my ($class, %options) = @_;

    my $self = bless(
        $class->SUPER::new(package => __PACKAGE__, %options, force_new_perfdata => 1),
        $class
    );

    my %arguments = (
        'filter-name:s' => { name => 'filter_name' }
    );
    $options{options}->add_options(arguments => \%arguments);

    return $self;
}
|
||||
|
||||
# Converts a "<number> <unit>" text (for instance "11178 MiB") into bytes.
#
# Options:
#   value => text to convert.
#
# Returns the number scaled by 1024^n for the KiB/MiB/GiB/TiB units (unit
# matched case-insensitively), the bare number for any other unit, and undef
# when the value is undefined or holds no "<number><unit>" pair (e.g. "N/A").
sub get_bytes {
    my ($self, %options) = @_;

    # Accept decimals: with a digits-only capture "1.5 GiB" was read as 5 GiB.
    return undef if (!defined($options{value}) ||
        $options{value} !~ /([0-9]+(?:\.[0-9]+)?)\s*([a-zA-Z]+)/);
    my ($value, $unit) = ($1, $2);

    my %factor_of = (
        K => 1024,
        M => 1024 * 1024,
        G => 1024 * 1024 * 1024,
        T => 1024 * 1024 * 1024 * 1024
    );
    # Only the binary prefix decides the factor; the trailing "B" is optional.
    if ($unit =~ /^([KMGT])i/i) {
        $value = $value * $factor_of{ uc($1) };
    }

    return $value;
}
|
||||
|
||||
# Runs "nvidia-smi -q -x", decodes its XML output and stores the statistics.
#
# Options:
#   custom => object whose execute_command() runs the command.
#
# Fills:
#   $self->{global}  => { devices => number of devices kept after filtering }
#   $self->{devices} => one entry per kept device, keyed by "product_name:id",
#                       holding 'util' and, when the matching values could be
#                       parsed, 'fb', 'bar1', 'fan', 'temp' and 'power'.
#
# When the XML cannot be decoded, an error message is added and
# $self->{output}->option_exit() is called.
sub manage_selection {
    my ($self, %options) = @_;

    my ($stdout) = $options{custom}->execute_command(
        command => 'nvidia-smi',
        command_options => '-q -x'
    );

    my $decoded;
    eval {
        # "local" limits the silenced warnings to the decoding: the previous
        # handler is restored when the eval block is left.
        local $SIG{__WARN__} = sub {};
        $decoded = XMLin($stdout, KeyAttr => [], ForceArray => ['gpu']);
    };
    if ($@) {
        $self->{output}->add_option_msg(short_msg => "Cannot decode xml response: $@");
        $self->{output}->option_exit();
    }

    $self->{global} = { devices => 0 };
    $self->{devices} = {};

    my $gpus = (ref($decoded) eq 'HASH' && ref($decoded->{gpu}) eq 'ARRAY') ? $decoded->{gpu} : [];
    foreach my $entry (@$gpus) {
        next if (ref($entry) ne 'HASH');

        my $name = (defined($entry->{product_name}) ? $entry->{product_name} : '') . ':' .
            (defined($entry->{id}) ? $entry->{id} : '');
        if (defined($self->{option_results}->{filter_name}) && $self->{option_results}->{filter_name} ne '' &&
            $name !~ /$self->{option_results}->{filter_name}/) {
            $self->{output}->output_add(long_msg => "skipping device '" . $name . "'.", debug => 1);
            next;
        }

        my $device = { util => {} };
        $self->{devices}->{$name} = $device;

        # Sub-sections are only read when they really are hashes, so a textual
        # or missing section is ignored instead of being dereferenced.
        my $utilization = ref($entry->{utilization}) eq 'HASH' ? $entry->{utilization} : {};
        foreach my $mapping (
            ['gpu_util', 'gpu_util'],
            ['memory_util', 'mem_util'],
            ['encoder_util', 'encoder_util'],
            ['decoder_util', 'decoder_util']
        ) {
            my ($xml_key, $counter_key) = @$mapping;
            my $percent = $self->_smi_number(value => $utilization->{$xml_key}, unit => '%');
            $device->{util}->{$counter_key} = $percent if (defined($percent));
        }

        foreach my $mapping (['fb_memory_usage', 'fb'], ['bar1_memory_usage', 'bar1']) {
            my ($xml_key, $group) = @$mapping;
            my $usage = $self->_smi_memory(section => $entry->{$xml_key});
            $device->{$group} = $usage if (defined($usage));
        }

        my $fan_speed = $self->_smi_number(value => $entry->{fan_speed}, unit => '%');
        $device->{fan} = { speed => $fan_speed } if (defined($fan_speed));

        my $temperature = ref($entry->{temperature}) eq 'HASH' ?
            $self->_smi_number(value => $entry->{temperature}->{gpu_temp}, unit => 'C') : undef;
        $device->{temp} = { current => $temperature } if (defined($temperature));

        my $power = ref($entry->{power_readings}) eq 'HASH' ?
            $self->_smi_number(value => $entry->{power_readings}->{power_draw}, unit => 'W') : undef;
        $device->{power} = { current => $power } if (defined($power));

        $self->{global}->{devices}++;
    }
}

# Extracts the number placed before a unit in a text such as "45 %" or "30.50 W".
# Options: value => text to parse, unit => literal unit expected after the number.
# Returns the number, or undef when the value is undefined, is a reference or
# does not match.
sub _smi_number {
    my ($self, %options) = @_;

    return undef if (!defined($options{value}) || ref($options{value}));
    return undef if ($options{value} !~ /([0-9\.]+)\s*\Q$options{unit}\E/);
    return $1;
}

# Builds the values of a memory counter group from a section holding the
# 'total', 'used' and 'free' texts.
# Options: section => hash reference of the section.
# Returns a hash reference (total, used, free, prct_used, prct_free), or undef
# when the section is not a hash or when total/used cannot be used.
sub _smi_memory {
    my ($self, %options) = @_;

    my $section = $options{section};
    return undef if (ref($section) ne 'HASH');

    my %bytes;
    foreach my $field ('total', 'used', 'free') {
        next if (!defined($section->{$field}) || ref($section->{$field}));
        $bytes{$field} = $self->get_bytes(value => $section->{$field});
    }

    # get_bytes returns undef for a text it cannot parse (e.g. "N/A"); an
    # undefined or zero total would make the percentage divide by zero.
    return undef if (!defined($bytes{total}) || $bytes{total} <= 0 || !defined($bytes{used}));
    # Fallback when only the free value could not be parsed.
    $bytes{free} = $bytes{total} - $bytes{used} if (!defined($bytes{free}));

    my $prct_used = $bytes{used} * 100 / $bytes{total};
    my %usage = (
        total => $bytes{total},
        used => $bytes{used},
        free => $bytes{free},
        prct_used => $prct_used,
        prct_free => 100 - $prct_used
    );
    return \%usage;
}
|
||||
|
||||
1;
|
||||
|
||||
__END__
|
||||
|
||||
=head1 MODE
|
||||
|
||||
Check GPU statistics.
|
||||
|
||||
Command used: nvidia-smi -q -x
|
||||
|
||||
=over 8
|
||||
|
||||
=item B<--filter-name>
|
||||
|
||||
Filter gpu devices by name (can be a regexp). The name checked is 'product_name:id'.
|
||||
|
||||
=item B<--warning-*> B<--critical-*>
|
||||
|
||||
Thresholds.
|
||||
Can be: 'devices-gpu-total',
|
||||
'bar1-memory-usage', 'bar1-memory-usage-free', 'bar1-memory-usage-prct',
|
||||
'fb-memory-usage', 'fb-memory-usage-free', 'fb-memory-usage-prct',
|
||||
'gpu-utilization', 'gpu-memory-utilization', 'gpu-encoder-utilization', 'gpu-decoder-utilization',
|
||||
'temperature', 'fan-speed', 'power'.
|
||||
|
||||
=back
|
||||
|
||||
=cut
|
|
@ -0,0 +1,50 @@
|
|||
#
|
||||
# Copyright 2021 Centreon (http://www.centreon.com/)
|
||||
#
|
||||
# Centreon is a full-fledged industry-strength solution that meets
|
||||
# the needs in IT infrastructure and application monitoring for
|
||||
# service performance.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
package hardware::devices::nvidia::gpu::smi::plugin;
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use base qw(centreon::plugins::script_custom);
|
||||
|
||||
# Constructor of the plugin.
# Delegates to the parent constructor, then sets the plugin version, the
# available modes (mode name => implementing package) and the 'cli' custom mode.
sub new {
    my ($class, %options) = @_;

    my $self = bless($class->SUPER::new(package => __PACKAGE__, %options), $class);

    my %modes = (
        'stats' => 'hardware::devices::nvidia::gpu::smi::mode::stats'
    );
    $self->{version} = '0.1';
    $self->{modes} = \%modes;
    $self->{custom_modes}->{cli} = 'centreon::plugins::script_custom::cli';

    return $self;
}
|
||||
|
||||
1;
|
||||
|
||||
__END__
|
||||
|
||||
=head1 PLUGIN DESCRIPTION
|
||||
|
||||
Check NVIDIA GPU devices using system management interface program (smi).
|
||||
|
||||
=cut
|
Loading…
Reference in New Issue