feature: on Linux, check coretemp and don't fail fast with name generation when gathering temps (#1188)

* feature: also check coretemp on Linux when gathering temps

* update changelog

* add comment

* add logic to check thermal zone if there are no hwmon entries

* cleanup

* handle duplicates between hwmon and thermal zone

* Revert "handle duplicates between hwmon and thermal zone"

This reverts commit 402606cc62f27ba196ef2ade6a669ae84aedf0e0.

* cleanup

* back to running thermal zone if no hwmon

* prevent failure cases

* temp logging

* name refactoring

* port dupe name logic to hwmon

* Cleanup, add fallback to sensor name

* more tests

* Fix log

* update changelog

* cleanup and more tests
This commit is contained in:
Clement Tsang 2023-06-11 22:12:15 +00:00 committed by GitHub
parent e602cc7a39
commit 9560722318
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 368 additions and 185 deletions

View File

@ -12,11 +12,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#1186](https://github.com/ClementTsang/bottom/pull/1186): Fix for temperature sensor data gathering on Linux immediately halting if any method failed.
- [#1191](https://github.com/ClementTsang/bottom/pull/1191): Fix ntfs3 mounts not being counted as a physical drive type.
- [#1195](https://github.com/ClementTsang/bottom/pull/1195): Fix battery health being incorrectly reported on M1 macOS.
- [#1188](https://github.com/ClementTsang/bottom/pull/1188): Don't fail fast with temperature sensor name generation on Linux.
## Features
- [#1172](https://github.com/ClementTsang/bottom/pull/1172): Support human times for `time_delta` and `default_time_value`.
- [#1187](https://github.com/ClementTsang/bottom/pull/1187): Use better names for duplicate temp sensors found by `/sys/class/thermal`.
- [#1188](https://github.com/ClementTsang/bottom/pull/1188): Also check `/sys/devices/platform/coretemp.*` for temp sensors.
## Other

View File

@ -1,9 +1,12 @@
//! Gets temperature sensor data for Linux platforms.
use std::{fs, path::Path};
use std::{
fs,
path::{Path, PathBuf},
};
use anyhow::{anyhow, Result};
use hashbrown::HashMap;
use anyhow::Result;
use hashbrown::{HashMap, HashSet};
use super::{is_temp_filtered, TempHarvest, TemperatureType};
use crate::app::{
@ -11,14 +14,153 @@ use crate::app::{
Filter,
};
/// Get temperature sensors from the linux sysfs interface `/sys/class/hwmon`.
const EMPTY_NAME: &str = "Unknown";
/// Returned results from grabbing hwmon/coretemp temperature sensor values/names.
struct HwmonResults {
temperatures: Vec<TempHarvest>,
num_hwmon: usize,
}
/// Parses and reads temperatures that were in millidegree Celsius, and if successful, returns a temperature in Celsius.
fn read_temp(path: &Path) -> Result<f32> {
Ok(fs::read_to_string(path)?
.trim_end()
.parse::<f32>()
.map_err(|e| crate::utils::error::BottomError::ConversionError(e.to_string()))?
/ 1_000.0)
}
fn convert_temp_unit(temp: f32, temp_type: &TemperatureType) -> f32 {
match temp_type {
TemperatureType::Celsius => temp,
TemperatureType::Kelvin => convert_celsius_to_kelvin(temp),
TemperatureType::Fahrenheit => convert_celsius_to_fahrenheit(temp),
}
}
/// Get all candidates from hwmon and coretemp. It will also return the number of entries from hwmon.
fn get_hwmon_candidates() -> (HashSet<PathBuf>, usize) {
let mut dirs = HashSet::default();
if let Ok(read_dir) = Path::new("/sys/class/hwmon").read_dir() {
for entry in read_dir.flatten() {
let mut path = entry.path();
// hwmon includes many sensors, we only want ones with at least one temperature sensor
// Reading this file will wake the device, but we're only checking existence, so it should be fine.
if !path.join("temp1_input").exists() {
// Note we also check for a `device` subdirectory (e.g. `/sys/class/hwmon/hwmon*/device/`).
// This is needed for CentOS, which adds this extra `/device` directory. See:
// - https://github.com/nicolargo/glances/issues/1060
// - https://github.com/giampaolo/psutil/issues/971
// - https://github.com/giampaolo/psutil/blob/642438375e685403b4cd60b0c0e25b80dd5a813d/psutil/_pslinux.py#L1316
//
// If it does match, then add the `device/` directory to the path.
if path.join("device/temp1_input").exists() {
path.push("device");
}
}
dirs.insert(path);
}
}
let num_hwmon = dirs.len();
if let Ok(read_dir) = Path::new("/sys/devices/platform").read_dir() {
for entry in read_dir.flatten() {
if entry.file_name().to_string_lossy().starts_with("coretemp.") {
if let Ok(read_dir) = entry.path().join("hwmon").read_dir() {
for entry in read_dir.flatten() {
let path = entry.path();
if path.join("temp1_input").exists() {
// It's possible that there are dupes (represented by symlinks) - the easy
// way is to just substitute the parent directory and check if the hwmon
// variant exists already in a set.
//
// For more info, see https://github.com/giampaolo/psutil/pull/1822/files
if let Some(child) = path.file_name() {
let to_check_path = Path::new("/sys/class/hwmon").join(child);
if !dirs.contains(&to_check_path) {
dirs.insert(path);
}
}
}
}
}
}
}
}
(dirs, num_hwmon)
}
#[inline]
fn read_to_string_lossy<P: AsRef<Path>>(path: P) -> Option<String> {
fs::read(path)
.map(|v| String::from_utf8_lossy(&v).trim().to_string())
.ok()
}
#[inline]
fn humanize_name(name: String, sensor_name: Option<&String>) -> String {
match sensor_name {
Some(ty) => format!("{name} ({ty})"),
None => name,
}
}
#[inline]
fn counted_name(seen_names: &mut HashMap<String, u32>, name: String) -> String {
let name = if name.is_empty() {
EMPTY_NAME.to_string()
} else {
name
};
if let Some(count) = seen_names.get_mut(&name) {
*count += 1;
format!("{name} ({})", *count)
} else {
seen_names.insert(name.clone(), 0);
name
}
}
#[inline]
fn finalize_name(
hwmon_name: Option<String>, sensor_label: Option<String>,
fallback_sensor_name: &Option<String>, seen_names: &mut HashMap<String, u32>,
) -> String {
let candidate_name = match (hwmon_name, sensor_label) {
(Some(name), Some(label)) => format!("{name}: {label}"),
(None, Some(label)) => match &fallback_sensor_name {
Some(sensor_name) => format!("{sensor_name}: {label}"),
None => label,
},
(Some(name), None) => name,
(None, None) => match &fallback_sensor_name {
Some(sensor_name) => sensor_name.clone(),
None => EMPTY_NAME.to_string(),
},
};
counted_name(seen_names, candidate_name)
}
/// Get temperature sensors from the linux sysfs interface `/sys/class/hwmon` and
/// `/sys/devices/platform/coretemp.*`. It returns all found temperature sensors, and the number
/// of checked hwmon directories (not coretemp directories).
///
/// See [the Linux kernel documentation](https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-class-hwmon)
/// for more details.
/// For more details, see the relevant Linux kernel documentation:
/// - [`/sys/class/hwmon`](https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-class-hwmon)
/// - [`/sys/devices/platform/coretemp.*`](https://www.kernel.org/doc/html/v5.14/hwmon/coretemp.html)
///
/// This method will return `0` as the temperature for devices, such as GPUs,
/// that support power management features and power themselves off.
///
/// that support power management features that have powered themselves off.
/// Specifically, in laptops with iGPUs and dGPUs, if the dGPU is capable of
/// entering ACPI D3cold, reading the temperature sensors will wake it,
/// and keep it awake, wasting power.
@ -27,11 +169,11 @@ use crate::app::{
/// the device is already in ACPI D0. This has the notable issue that
/// once this happens, the device will be *kept* on through the sensor
/// reading, and not be able to re-enter ACPI D3cold.
fn get_from_hwmon(
temp_type: &TemperatureType, filter: &Option<Filter>,
) -> Result<Vec<TempHarvest>> {
let mut temperature_vec: Vec<TempHarvest> = vec![];
let path = Path::new("/sys/class/hwmon");
fn hwmon_temperatures(temp_type: &TemperatureType, filter: &Option<Filter>) -> HwmonResults {
let mut temperatures: Vec<TempHarvest> = vec![];
let mut seen_names: HashMap<String, u32> = HashMap::new();
let (dirs, num_hwmon) = get_hwmon_candidates();
// Note that none of this is async if we ever go back to it, but sysfs is in
// memory, so in theory none of this should block if we're slightly careful.
@ -43,230 +185,268 @@ fn get_from_hwmon(
// will not wake the device, and thus not block,
// and meaning no sensors have to be hidden depending on `power_state`
//
// It would probably be more ideal to use a proper async runtime..
for entry in path.read_dir()? {
let file = entry?;
let mut file_path = file.path();
// It would probably be more ideal to use a proper async runtime; this would also allow easy cancellation/timeouts.
for file_path in dirs {
let sensor_name = read_to_string_lossy(file_path.join("name"));
// hwmon includes many sensors, we only want ones with at least one temperature sensor
// Reading this file will wake the device, but we're only checking existence.
if !file_path.join("temp1_input").exists() {
// Note we also check for a `device` subdirectory (e.g. `/sys/class/hwmon/hwmon*/device/`).
// This is needed for CentOS, which adds this extra `/device` directory. See:
// - https://github.com/nicolargo/glances/issues/1060
// - https://github.com/giampaolo/psutil/issues/971
// - https://github.com/giampaolo/psutil/blob/642438375e685403b4cd60b0c0e25b80dd5a813d/psutil/_pslinux.py#L1316
//
// If it does match, then add the `device/` directory to the path.
if file_path.join("device/temp1_input").exists() {
file_path.push("device");
} else {
continue;
}
}
let hwmon_name = file_path.join("name");
let hwmon_name = Some(fs::read_to_string(hwmon_name)?);
// Whether the temperature should *actually* be read during enumeration
// Whether the temperature should *actually* be read during enumeration.
// Set to false if the device is in ACPI D3cold.
//
// If it is false, then the temperature will be set to 0.0 later down the line.
let should_read_temp = {
// Documented at https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-power_state
let device = file_path.join("device");
let power_state = device.join("power_state");
if power_state.exists() {
let state = fs::read_to_string(power_state)?;
let state = state.trim();
// The zenpower3 kernel module (incorrectly?) reports "unknown"
// causing this check to fail and temperatures to appear as zero
// instead of having the file not exist..
// their self-hosted git instance has disabled sign up,
// so this bug cant be reported either.
state == "D0" || state == "unknown"
if let Ok(state) = fs::read_to_string(power_state) {
let state = state.trim();
// The zenpower3 kernel module (incorrectly?) reports "unknown", causing this check
// to fail and temperatures to appear as zero instead of having the file not exist.
//
// Their self-hosted git instance has disabled sign up, so this bug cant be reported either.
state == "D0" || state == "unknown"
} else {
true
}
} else {
true
}
};
// Enumerate the devices temperature sensors
for entry in file_path.read_dir()? {
let file = entry?;
let name = file.file_name();
// This should always be ASCII
let name = name
.to_str()
.ok_or_else(|| anyhow!("temperature device filenames should be ASCII"))?;
// We only want temperature sensors, skip others early
if !(name.starts_with("temp") && name.ends_with("input")) {
continue;
}
let temp = file.path();
let temp_label = file_path.join(name.replace("input", "label"));
let temp_label = fs::read_to_string(temp_label).ok();
if let Ok(dir_entries) = file_path.read_dir() {
// Enumerate the devices temperature sensors
for file in dir_entries.flatten() {
let name = file.file_name();
let name = name.to_string_lossy();
// Do some messing around to get a more sensible name for sensors
//
// - For GPUs, this will use the kernel device name, ex `card0`
// - For nvme drives, this will also use the kernel name, ex `nvme0`.
// This is found differently than for GPUs
// - For whatever acpitz is, on my machine this is now `thermal_zone0`.
// - For k10temp, this will still be k10temp, but it has to be handled special.
let human_hwmon_name = {
let device = file_path.join("device");
// This will exist for GPUs but not others, this is how
// we find their kernel name
let drm = device.join("drm");
if drm.exists() {
// This should never actually be empty
let mut gpu = None;
for card in drm.read_dir()? {
let card = card?;
let name = card.file_name().to_str().unwrap_or_default().to_owned();
if name.starts_with("card") {
if let Some(hwmon_name) = hwmon_name.as_ref() {
gpu = Some(format!("{} ({})", name, hwmon_name.trim()));
} else {
gpu = Some(name)
}
break;
}
}
gpu
} else {
// This little mess is to account for stuff like k10temp
// This is needed because the `device` symlink
// points to `nvme*` for nvme drives, but to PCI buses for anything else
// If the first character is alphabetic,
// its an actual name like k10temp or nvme0, not a PCI bus
let link = fs::read_link(device)?
.file_name()
.map(|f| f.to_str().unwrap_or_default().to_owned())
.unwrap();
if link.as_bytes()[0].is_ascii_alphabetic() {
if let Some(hwmon_name) = hwmon_name.as_ref() {
Some(format!("{} ({})", link, hwmon_name.trim()))
} else {
Some(link)
}
} else {
hwmon_name.clone()
}
// We only want temperature sensors, skip others early
if !(name.starts_with("temp") && name.ends_with("input")) {
continue;
}
};
let name = match (&human_hwmon_name, &temp_label) {
(Some(name), Some(label)) => format!("{}: {}", name.trim(), label.trim()),
(None, Some(label)) => label.to_string(),
(Some(name), None) => name.to_string(),
(None, None) => String::default(),
};
let temp_path = file.path();
let sensor_label_path = file_path.join(name.replace("input", "label"));
let sensor_label = read_to_string_lossy(sensor_label_path);
if is_temp_filtered(filter, &name) {
let temp = if should_read_temp {
if let Ok(temp) = fs::read_to_string(temp) {
let temp = temp.trim_end().parse::<f32>().map_err(|e| {
crate::utils::error::BottomError::ConversionError(e.to_string())
})?;
temp / 1_000.0
// Do some messing around to get a more sensible name for sensors:
// - For GPUs, this will use the kernel device name, ex `card0`
// - For nvme drives, this will also use the kernel name, ex `nvme0`.
// This is found differently than for GPUs
// - For whatever acpitz is, on my machine this is now `thermal_zone0`.
// - For k10temp, this will still be k10temp, but it has to be handled special.
let hwmon_name = {
let device = file_path.join("device");
// This will exist for GPUs but not others, this is how we find their kernel name.
let drm = device.join("drm");
if drm.exists() {
// This should never actually be empty. If it is though, we'll fall back to the sensor name
// later on.
let mut gpu = None;
if let Ok(cards) = drm.read_dir() {
for card in cards.flatten() {
if let Some(name) = card.file_name().to_str() {
if name.starts_with("card") {
gpu = Some(humanize_name(
name.trim().to_string(),
sensor_name.as_ref(),
));
break;
}
}
}
}
gpu
} else {
// For some devices (e.g. iwlwifi), this file becomes empty when the device
// is disabled. In this case we skip the device.
continue;
// This little mess is to account for stuff like k10temp. This is needed because the
// `device` symlink points to `nvme*` for nvme drives, but to PCI buses for anything
// else. If the first character is alphabetic, it's an actual name like k10temp or
// nvme0, not a PCI bus.
fs::read_link(device).ok().and_then(|link| {
let link = link
.file_name()
.and_then(|f| f.to_str())
.map(|s| s.trim().to_owned());
match link {
Some(link) if link.as_bytes()[0].is_ascii_alphabetic() => {
Some(humanize_name(link, sensor_name.as_ref()))
}
_ => None,
}
})
}
} else {
0.0
};
temperature_vec.push(TempHarvest {
name,
temperature: match temp_type {
TemperatureType::Celsius => temp,
TemperatureType::Kelvin => convert_celsius_to_kelvin(temp),
TemperatureType::Fahrenheit => convert_celsius_to_fahrenheit(temp),
},
});
let name = finalize_name(hwmon_name, sensor_label, &sensor_name, &mut seen_names);
if is_temp_filtered(filter, &name) {
let temp = if should_read_temp {
if let Ok(temp) = read_temp(&temp_path) {
temp
} else {
continue;
}
} else {
0.0
};
temperatures.push(TempHarvest {
name,
temperature: convert_temp_unit(temp, temp_type),
});
}
}
}
}
Ok(temperature_vec)
HwmonResults {
temperatures,
num_hwmon,
}
}
/// Gets data from `/sys/class/thermal/thermal_zone*`. This should only be used if
/// [`get_from_hwmon`] doesn't return anything.
/// [`hwmon_temperatures`] doesn't return anything.
///
/// See [the Linux kernel documentation](https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-class-thermal)
/// for more details.
fn get_from_thermal_zone(
temp_type: &TemperatureType, filter: &Option<Filter>,
) -> Result<Vec<TempHarvest>> {
let mut temperatures = vec![];
fn add_thermal_zone_temperatures(
temperatures: &mut Vec<TempHarvest>, temp_type: &TemperatureType, filter: &Option<Filter>,
) {
let path = Path::new("/sys/class/thermal");
let Ok(read_dir) = path.read_dir() else {
return
};
let mut seen_names: HashMap<String, u32> = HashMap::new();
for entry in path.read_dir()? {
let file = entry?;
if file
for entry in read_dir.flatten() {
if entry
.file_name()
.to_string_lossy()
.starts_with("thermal_zone")
{
let file_path = file.path();
let file_path = entry.path();
let name_path = file_path.join("type");
let name = fs::read_to_string(name_path)?;
let name = name.trim_end();
if let Some(name) = read_to_string_lossy(name_path) {
if is_temp_filtered(filter, &name) {
let temp_path = file_path.join("temp");
if let Ok(temp) = read_temp(&temp_path) {
let name = counted_name(&mut seen_names, name);
if is_temp_filtered(filter, name) {
let temp_path = file_path.join("temp");
let temp = fs::read_to_string(temp_path)?
.trim_end()
.parse::<f32>()
.map_err(|e| {
crate::utils::error::BottomError::ConversionError(e.to_string())
})?
/ 1_000.0;
let name = if let Some(count) = seen_names.get_mut(name) {
*count += 1;
format!("{name} ({})", *count)
} else {
seen_names.insert(name.to_string(), 0);
name.to_string()
};
temperatures.push(TempHarvest {
name,
temperature: match temp_type {
TemperatureType::Celsius => temp,
TemperatureType::Kelvin => convert_celsius_to_kelvin(temp),
TemperatureType::Fahrenheit => convert_celsius_to_fahrenheit(temp),
},
});
temperatures.push(TempHarvest {
name,
temperature: convert_temp_unit(temp, temp_type),
});
}
}
}
}
}
Ok(temperatures)
}
/// Gets temperature sensors and data.
pub fn get_temperature_data(
temp_type: &TemperatureType, filter: &Option<Filter>,
) -> Result<Option<Vec<TempHarvest>>> {
let mut temperature_vec: Vec<TempHarvest> =
get_from_hwmon(temp_type, filter).unwrap_or_default();
let mut results = hwmon_temperatures(temp_type, filter);
if temperature_vec.is_empty() {
// If it's empty or it fails, try to fall back to checking `thermal_zone*`.
temperature_vec = get_from_thermal_zone(temp_type, filter).unwrap_or_default();
if results.num_hwmon == 0 {
add_thermal_zone_temperatures(&mut results.temperatures, temp_type, filter);
}
#[cfg(feature = "nvidia")]
{
super::nvidia::add_nvidia_data(&mut temperature_vec, temp_type, filter)?;
super::nvidia::add_nvidia_data(&mut results.temperatures, temp_type, filter)?;
}
Ok(Some(temperature_vec))
Ok(Some(results.temperatures))
}
#[cfg(test)]
mod tests {
use hashbrown::HashMap;
use super::finalize_name;
#[test]
fn test_finalize_name() {
let mut seen_names = HashMap::new();
assert_eq!(
finalize_name(
Some("hwmon".to_string()),
Some("sensor".to_string()),
&Some("test".to_string()),
&mut seen_names
),
"hwmon: sensor"
);
assert_eq!(
finalize_name(
Some("hwmon".to_string()),
None,
&Some("test".to_string()),
&mut seen_names
),
"hwmon"
);
assert_eq!(
finalize_name(
None,
Some("sensor".to_string()),
&Some("test".to_string()),
&mut seen_names
),
"test: sensor"
);
assert_eq!(
finalize_name(
Some("hwmon".to_string()),
Some("sensor".to_string()),
&Some("test".to_string()),
&mut seen_names
),
"hwmon: sensor (1)"
);
assert_eq!(
finalize_name(None, None, &Some("test".to_string()), &mut seen_names),
"test"
);
assert_eq!(finalize_name(None, None, &None, &mut seen_names), "Unknown");
assert_eq!(
finalize_name(None, None, &Some("test".to_string()), &mut seen_names),
"test (1)"
);
assert_eq!(
finalize_name(None, None, &None, &mut seen_names),
"Unknown (1)"
);
assert_eq!(
finalize_name(Some(String::default()), None, &None, &mut seen_names),
"Unknown (2)"
);
assert_eq!(
finalize_name(None, Some(String::default()), &None, &mut seen_names),
"Unknown (3)"
);
assert_eq!(
finalize_name(None, None, &Some(String::default()), &mut seen_names),
"Unknown (4)"
);
}
}

View File

@ -6,13 +6,14 @@ use super::{
};
use crate::app::Filter;
use crate::data_harvester::nvidia::NVML_DATA;
use crate::utils::error;
pub fn add_nvidia_data(
temperature_vec: &mut Vec<TempHarvest>, temp_type: &TemperatureType, filter: &Option<Filter>,
) -> crate::utils::error::Result<()> {
) -> error::Result<()> {
if let Ok(nvml) = &*NVML_DATA {
if let Ok(ngpu) = nvml.device_count() {
for i in 0..ngpu {
if let Ok(gpu_num) = nvml.device_count() {
for i in 0..gpu_num {
if let Ok(device) = nvml.device_by_index(i) {
if let (Ok(name), Ok(temperature)) =
(device.name(), device.temperature(TemperatureSensor::Gpu))