From 9560722318569c70945843f85535ddf6f368f953 Mon Sep 17 00:00:00 2001 From: Clement Tsang <34804052+ClementTsang@users.noreply.github.com> Date: Sun, 11 Jun 2023 22:12:15 +0000 Subject: [PATCH] feature: on Linux, check coretemp and don't fail fast with name generation when gathering temps (#1188) * feature: also check coretemp on Linux when gathering temps * update changelog * add comment * add logic to check thermal zone if there are no hwmon entries * cleanup * handle duplicates between hwmon and thermal zone * Revert "handle duplicates between hwmon and thermal zone" This reverts commit 402606cc62f27ba196ef2ade6a669ae84aedf0e0. * cleanup * back to running thermal zone if no hwmon * prevent failure cases * temp logging * name refactoring * port dupe name logic to hwmon * Cleanup, add fallback to sensor name * more tests * Fix log * update changelog * cleanup and more tests --- CHANGELOG.md | 2 + src/app/data_harvester/temperature/linux.rs | 544 ++++++++++++------- src/app/data_harvester/temperature/nvidia.rs | 7 +- 3 files changed, 368 insertions(+), 185 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c481da45..d9a9b95f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,11 +12,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#1186](https://github.com/ClementTsang/bottom/pull/1186): Fix for temperature sensor data gathering on Linux immediately halting if any method failed. - [#1191](https://github.com/ClementTsang/bottom/pull/1191): Fix ntfs3 mounts not being counted as a physical drive type. - [#1195](https://github.com/ClementTsang/bottom/pull/1195): Fix battery health being incorrectly reported on M1 macOS. +- [#1188](https://github.com/ClementTsang/bottom/pull/1188): Don't fail fast with temperature sensor name generation on Linux. ## Features - [#1172](https://github.com/ClementTsang/bottom/pull/1172): Support human times for `time_delta` and `default_time_value`. - [#1187](https://github.com/ClementTsang/bottom/pull/1187): Use better names for duplicate temp sensors found by `/sys/class/thermal`. +- [#1188](https://github.com/ClementTsang/bottom/pull/1188): Also check `/sys/devices/platform/coretemp.*` for temp sensors. ## Other diff --git a/src/app/data_harvester/temperature/linux.rs b/src/app/data_harvester/temperature/linux.rs index a626b54b..328b0761 100644 --- a/src/app/data_harvester/temperature/linux.rs +++ b/src/app/data_harvester/temperature/linux.rs @@ -1,9 +1,12 @@ //! Gets temperature sensor data for Linux platforms. -use std::{fs, path::Path}; +use std::{ + fs, + path::{Path, PathBuf}, +}; -use anyhow::{anyhow, Result}; -use hashbrown::HashMap; +use anyhow::Result; +use hashbrown::{HashMap, HashSet}; use super::{is_temp_filtered, TempHarvest, TemperatureType}; use crate::app::{ @@ -11,14 +14,153 @@ use crate::app::{ Filter, }; -/// Get temperature sensors from the linux sysfs interface `/sys/class/hwmon`. +const EMPTY_NAME: &str = "Unknown"; + +/// Returned results from grabbing hwmon/coretemp temperature sensor values/names. +struct HwmonResults { + temperatures: Vec, + num_hwmon: usize, +} + +/// Parses and reads temperatures that were in millidegree Celsius, and if successful, returns a temperature in Celsius. +fn read_temp(path: &Path) -> Result { + Ok(fs::read_to_string(path)? + .trim_end() + .parse::() + .map_err(|e| crate::utils::error::BottomError::ConversionError(e.to_string()))? + / 1_000.0) +} + +fn convert_temp_unit(temp: f32, temp_type: &TemperatureType) -> f32 { + match temp_type { + TemperatureType::Celsius => temp, + TemperatureType::Kelvin => convert_celsius_to_kelvin(temp), + TemperatureType::Fahrenheit => convert_celsius_to_fahrenheit(temp), + } +} + +/// Get all candidates from hwmon and coretemp. It will also return the number of entries from hwmon. +fn get_hwmon_candidates() -> (HashSet, usize) { + let mut dirs = HashSet::default(); + + if let Ok(read_dir) = Path::new("/sys/class/hwmon").read_dir() { + for entry in read_dir.flatten() { + let mut path = entry.path(); + + // hwmon includes many sensors, we only want ones with at least one temperature sensor + // Reading this file will wake the device, but we're only checking existence, so it should be fine. + if !path.join("temp1_input").exists() { + // Note we also check for a `device` subdirectory (e.g. `/sys/class/hwmon/hwmon*/device/`). + // This is needed for CentOS, which adds this extra `/device` directory. See: + // - https://github.com/nicolargo/glances/issues/1060 + // - https://github.com/giampaolo/psutil/issues/971 + // - https://github.com/giampaolo/psutil/blob/642438375e685403b4cd60b0c0e25b80dd5a813d/psutil/_pslinux.py#L1316 + // + // If it does match, then add the `device/` directory to the path. + if path.join("device/temp1_input").exists() { + path.push("device"); + } + } + + dirs.insert(path); + } + } + + let num_hwmon = dirs.len(); + + if let Ok(read_dir) = Path::new("/sys/devices/platform").read_dir() { + for entry in read_dir.flatten() { + if entry.file_name().to_string_lossy().starts_with("coretemp.") { + if let Ok(read_dir) = entry.path().join("hwmon").read_dir() { + for entry in read_dir.flatten() { + let path = entry.path(); + + if path.join("temp1_input").exists() { + // It's possible that there are dupes (represented by symlinks) - the easy + // way is to just substitute the parent directory and check if the hwmon + // variant exists already in a set. + // + // For more info, see https://github.com/giampaolo/psutil/pull/1822/files + if let Some(child) = path.file_name() { + let to_check_path = Path::new("/sys/class/hwmon").join(child); + + if !dirs.contains(&to_check_path) { + dirs.insert(path); + } + } + } + } + } + } + } + } + + (dirs, num_hwmon) +} + +#[inline] +fn read_to_string_lossy>(path: P) -> Option { + fs::read(path) + .map(|v| String::from_utf8_lossy(&v).trim().to_string()) + .ok() +} + +#[inline] +fn humanize_name(name: String, sensor_name: Option<&String>) -> String { + match sensor_name { + Some(ty) => format!("{name} ({ty})"), + None => name, + } +} + +#[inline] +fn counted_name(seen_names: &mut HashMap, name: String) -> String { + let name = if name.is_empty() { + EMPTY_NAME.to_string() + } else { + name + }; + + if let Some(count) = seen_names.get_mut(&name) { + *count += 1; + format!("{name} ({})", *count) + } else { + seen_names.insert(name.clone(), 0); + name + } +} + +#[inline] +fn finalize_name( + hwmon_name: Option, sensor_label: Option, + fallback_sensor_name: &Option, seen_names: &mut HashMap, +) -> String { + let candidate_name = match (hwmon_name, sensor_label) { + (Some(name), Some(label)) => format!("{name}: {label}"), + (None, Some(label)) => match &fallback_sensor_name { + Some(sensor_name) => format!("{sensor_name}: {label}"), + None => label, + }, + (Some(name), None) => name, + (None, None) => match &fallback_sensor_name { + Some(sensor_name) => sensor_name.clone(), + None => EMPTY_NAME.to_string(), + }, + }; + + counted_name(seen_names, candidate_name) +} + +/// Get temperature sensors from the linux sysfs interface `/sys/class/hwmon` and +/// `/sys/devices/platform/coretemp.*`. It returns all found temperature sensors, and the number +/// of checked hwmon directories (not coretemp directories). /// -/// See [the Linux kernel documentation](https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-class-hwmon) -/// for more details. +/// For more details, see the relevant Linux kernel documentation: +/// - [`/sys/class/hwmon`](https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-class-hwmon) +/// - [`/sys/devices/platform/coretemp.*`](https://www.kernel.org/doc/html/v5.14/hwmon/coretemp.html) /// /// This method will return `0` as the temperature for devices, such as GPUs, -/// that support power management features and power themselves off. -/// +/// that support power management features that have powered themselves off. /// Specifically, in laptops with iGPUs and dGPUs, if the dGPU is capable of /// entering ACPI D3cold, reading the temperature sensors will wake it, /// and keep it awake, wasting power. @@ -27,11 +169,11 @@ use crate::app::{ /// the device is already in ACPI D0. This has the notable issue that /// once this happens, the device will be *kept* on through the sensor /// reading, and not be able to re-enter ACPI D3cold. -fn get_from_hwmon( - temp_type: &TemperatureType, filter: &Option, -) -> Result> { - let mut temperature_vec: Vec = vec![]; - let path = Path::new("/sys/class/hwmon"); +fn hwmon_temperatures(temp_type: &TemperatureType, filter: &Option) -> HwmonResults { + let mut temperatures: Vec = vec![]; + let mut seen_names: HashMap = HashMap::new(); + + let (dirs, num_hwmon) = get_hwmon_candidates(); // Note that none of this is async if we ever go back to it, but sysfs is in // memory, so in theory none of this should block if we're slightly careful. @@ -43,230 +185,268 @@ fn get_from_hwmon( // will not wake the device, and thus not block, // and meaning no sensors have to be hidden depending on `power_state` // - // It would probably be more ideal to use a proper async runtime.. - for entry in path.read_dir()? { - let file = entry?; - let mut file_path = file.path(); + // It would probably be more ideal to use a proper async runtime; this would also allow easy cancellation/timeouts. + for file_path in dirs { + let sensor_name = read_to_string_lossy(file_path.join("name")); - // hwmon includes many sensors, we only want ones with at least one temperature sensor - // Reading this file will wake the device, but we're only checking existence. - if !file_path.join("temp1_input").exists() { - // Note we also check for a `device` subdirectory (e.g. `/sys/class/hwmon/hwmon*/device/`). - // This is needed for CentOS, which adds this extra `/device` directory. See: - // - https://github.com/nicolargo/glances/issues/1060 - // - https://github.com/giampaolo/psutil/issues/971 - // - https://github.com/giampaolo/psutil/blob/642438375e685403b4cd60b0c0e25b80dd5a813d/psutil/_pslinux.py#L1316 - // - // If it does match, then add the `device/` directory to the path. - if file_path.join("device/temp1_input").exists() { - file_path.push("device"); - } else { - continue; - } - } - - let hwmon_name = file_path.join("name"); - let hwmon_name = Some(fs::read_to_string(hwmon_name)?); - - // Whether the temperature should *actually* be read during enumeration + // Whether the temperature should *actually* be read during enumeration. // Set to false if the device is in ACPI D3cold. + // + // If it is false, then the temperature will be set to 0.0 later down the line. let should_read_temp = { // Documented at https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-power_state let device = file_path.join("device"); let power_state = device.join("power_state"); if power_state.exists() { - let state = fs::read_to_string(power_state)?; - let state = state.trim(); - // The zenpower3 kernel module (incorrectly?) reports "unknown" - // causing this check to fail and temperatures to appear as zero - // instead of having the file not exist.. - // their self-hosted git instance has disabled sign up, - // so this bug cant be reported either. - state == "D0" || state == "unknown" + if let Ok(state) = fs::read_to_string(power_state) { + let state = state.trim(); + // The zenpower3 kernel module (incorrectly?) reports "unknown", causing this check + // to fail and temperatures to appear as zero instead of having the file not exist. + // + // Their self-hosted git instance has disabled sign up, so this bug cant be reported either. + state == "D0" || state == "unknown" + } else { + true + } } else { true } }; - // Enumerate the devices temperature sensors - for entry in file_path.read_dir()? { - let file = entry?; - let name = file.file_name(); - // This should always be ASCII - let name = name - .to_str() - .ok_or_else(|| anyhow!("temperature device filenames should be ASCII"))?; - // We only want temperature sensors, skip others early - if !(name.starts_with("temp") && name.ends_with("input")) { - continue; - } - let temp = file.path(); - let temp_label = file_path.join(name.replace("input", "label")); - let temp_label = fs::read_to_string(temp_label).ok(); + if let Ok(dir_entries) = file_path.read_dir() { + // Enumerate the devices temperature sensors + for file in dir_entries.flatten() { + let name = file.file_name(); + let name = name.to_string_lossy(); - // Do some messing around to get a more sensible name for sensors - // - // - For GPUs, this will use the kernel device name, ex `card0` - // - For nvme drives, this will also use the kernel name, ex `nvme0`. - // This is found differently than for GPUs - // - For whatever acpitz is, on my machine this is now `thermal_zone0`. - // - For k10temp, this will still be k10temp, but it has to be handled special. - let human_hwmon_name = { - let device = file_path.join("device"); - // This will exist for GPUs but not others, this is how - // we find their kernel name - let drm = device.join("drm"); - if drm.exists() { - // This should never actually be empty - let mut gpu = None; - for card in drm.read_dir()? { - let card = card?; - let name = card.file_name().to_str().unwrap_or_default().to_owned(); - if name.starts_with("card") { - if let Some(hwmon_name) = hwmon_name.as_ref() { - gpu = Some(format!("{} ({})", name, hwmon_name.trim())); - } else { - gpu = Some(name) - } - break; - } - } - gpu - } else { - // This little mess is to account for stuff like k10temp - // This is needed because the `device` symlink - // points to `nvme*` for nvme drives, but to PCI buses for anything else - // If the first character is alphabetic, - // its an actual name like k10temp or nvme0, not a PCI bus - let link = fs::read_link(device)? - .file_name() - .map(|f| f.to_str().unwrap_or_default().to_owned()) - .unwrap(); - if link.as_bytes()[0].is_ascii_alphabetic() { - if let Some(hwmon_name) = hwmon_name.as_ref() { - Some(format!("{} ({})", link, hwmon_name.trim())) - } else { - Some(link) - } - } else { - hwmon_name.clone() - } + // We only want temperature sensors, skip others early + if !(name.starts_with("temp") && name.ends_with("input")) { + continue; } - }; - let name = match (&human_hwmon_name, &temp_label) { - (Some(name), Some(label)) => format!("{}: {}", name.trim(), label.trim()), - (None, Some(label)) => label.to_string(), - (Some(name), None) => name.to_string(), - (None, None) => String::default(), - }; + let temp_path = file.path(); + let sensor_label_path = file_path.join(name.replace("input", "label")); + let sensor_label = read_to_string_lossy(sensor_label_path); - if is_temp_filtered(filter, &name) { - let temp = if should_read_temp { - if let Ok(temp) = fs::read_to_string(temp) { - let temp = temp.trim_end().parse::().map_err(|e| { - crate::utils::error::BottomError::ConversionError(e.to_string()) - })?; - temp / 1_000.0 + // Do some messing around to get a more sensible name for sensors: + // - For GPUs, this will use the kernel device name, ex `card0` + // - For nvme drives, this will also use the kernel name, ex `nvme0`. + // This is found differently than for GPUs + // - For whatever acpitz is, on my machine this is now `thermal_zone0`. + // - For k10temp, this will still be k10temp, but it has to be handled special. + let hwmon_name = { + let device = file_path.join("device"); + + // This will exist for GPUs but not others, this is how we find their kernel name. + let drm = device.join("drm"); + if drm.exists() { + // This should never actually be empty. If it is though, we'll fall back to the sensor name + // later on. + let mut gpu = None; + + if let Ok(cards) = drm.read_dir() { + for card in cards.flatten() { + if let Some(name) = card.file_name().to_str() { + if name.starts_with("card") { + gpu = Some(humanize_name( + name.trim().to_string(), + sensor_name.as_ref(), + )); + break; + } + } + } + } + + gpu } else { - // For some devices (e.g. iwlwifi), this file becomes empty when the device - // is disabled. In this case we skip the device. - continue; + // This little mess is to account for stuff like k10temp. This is needed because the + // `device` symlink points to `nvme*` for nvme drives, but to PCI buses for anything + // else. If the first character is alphabetic, it's an actual name like k10temp or + // nvme0, not a PCI bus. + fs::read_link(device).ok().and_then(|link| { + let link = link + .file_name() + .and_then(|f| f.to_str()) + .map(|s| s.trim().to_owned()); + + match link { + Some(link) if link.as_bytes()[0].is_ascii_alphabetic() => { + Some(humanize_name(link, sensor_name.as_ref())) + } + _ => None, + } + }) } - } else { - 0.0 }; - temperature_vec.push(TempHarvest { - name, - temperature: match temp_type { - TemperatureType::Celsius => temp, - TemperatureType::Kelvin => convert_celsius_to_kelvin(temp), - TemperatureType::Fahrenheit => convert_celsius_to_fahrenheit(temp), - }, - }); + let name = finalize_name(hwmon_name, sensor_label, &sensor_name, &mut seen_names); + + if is_temp_filtered(filter, &name) { + let temp = if should_read_temp { + if let Ok(temp) = read_temp(&temp_path) { + temp + } else { + continue; + } + } else { + 0.0 + }; + + temperatures.push(TempHarvest { + name, + temperature: convert_temp_unit(temp, temp_type), + }); + } } } } - Ok(temperature_vec) + HwmonResults { + temperatures, + num_hwmon, + } } /// Gets data from `/sys/class/thermal/thermal_zone*`. This should only be used if -/// [`get_from_hwmon`] doesn't return anything. +/// [`hwmon_temperatures`] doesn't return anything. /// /// See [the Linux kernel documentation](https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-class-thermal) /// for more details. -fn get_from_thermal_zone( - temp_type: &TemperatureType, filter: &Option, -) -> Result> { - let mut temperatures = vec![]; +fn add_thermal_zone_temperatures( + temperatures: &mut Vec, temp_type: &TemperatureType, filter: &Option, +) { let path = Path::new("/sys/class/thermal"); + let Ok(read_dir) = path.read_dir() else { + return + }; let mut seen_names: HashMap = HashMap::new(); - for entry in path.read_dir()? { - let file = entry?; - if file + for entry in read_dir.flatten() { + if entry .file_name() .to_string_lossy() .starts_with("thermal_zone") { - let file_path = file.path(); + let file_path = entry.path(); let name_path = file_path.join("type"); - let name = fs::read_to_string(name_path)?; - let name = name.trim_end(); + if let Some(name) = read_to_string_lossy(name_path) { + if is_temp_filtered(filter, &name) { + let temp_path = file_path.join("temp"); + if let Ok(temp) = read_temp(&temp_path) { + let name = counted_name(&mut seen_names, name); - if is_temp_filtered(filter, name) { - let temp_path = file_path.join("temp"); - let temp = fs::read_to_string(temp_path)? - .trim_end() - .parse::() - .map_err(|e| { - crate::utils::error::BottomError::ConversionError(e.to_string()) - })? - / 1_000.0; - - let name = if let Some(count) = seen_names.get_mut(name) { - *count += 1; - format!("{name} ({})", *count) - } else { - seen_names.insert(name.to_string(), 0); - name.to_string() - }; - - temperatures.push(TempHarvest { - name, - temperature: match temp_type { - TemperatureType::Celsius => temp, - TemperatureType::Kelvin => convert_celsius_to_kelvin(temp), - TemperatureType::Fahrenheit => convert_celsius_to_fahrenheit(temp), - }, - }); + temperatures.push(TempHarvest { + name, + temperature: convert_temp_unit(temp, temp_type), + }); + } + } } } } - - Ok(temperatures) } /// Gets temperature sensors and data. pub fn get_temperature_data( temp_type: &TemperatureType, filter: &Option, ) -> Result>> { - let mut temperature_vec: Vec = - get_from_hwmon(temp_type, filter).unwrap_or_default(); + let mut results = hwmon_temperatures(temp_type, filter); - if temperature_vec.is_empty() { - // If it's empty or it fails, try to fall back to checking `thermal_zone*`. - temperature_vec = get_from_thermal_zone(temp_type, filter).unwrap_or_default(); + if results.num_hwmon == 0 { + add_thermal_zone_temperatures(&mut results.temperatures, temp_type, filter); } #[cfg(feature = "nvidia")] { - super::nvidia::add_nvidia_data(&mut temperature_vec, temp_type, filter)?; + super::nvidia::add_nvidia_data(&mut results.temperatures, temp_type, filter)?; } - Ok(Some(temperature_vec)) + Ok(Some(results.temperatures)) +} + +#[cfg(test)] +mod tests { + use hashbrown::HashMap; + + use super::finalize_name; + + #[test] + fn test_finalize_name() { + let mut seen_names = HashMap::new(); + + assert_eq!( + finalize_name( + Some("hwmon".to_string()), + Some("sensor".to_string()), + &Some("test".to_string()), + &mut seen_names + ), + "hwmon: sensor" + ); + + assert_eq!( + finalize_name( + Some("hwmon".to_string()), + None, + &Some("test".to_string()), + &mut seen_names + ), + "hwmon" + ); + + assert_eq!( + finalize_name( + None, + Some("sensor".to_string()), + &Some("test".to_string()), + &mut seen_names + ), + "test: sensor" + ); + + assert_eq!( + finalize_name( + Some("hwmon".to_string()), + Some("sensor".to_string()), + &Some("test".to_string()), + &mut seen_names + ), + "hwmon: sensor (1)" + ); + + assert_eq!( + finalize_name(None, None, &Some("test".to_string()), &mut seen_names), + "test" + ); + + assert_eq!(finalize_name(None, None, &None, &mut seen_names), "Unknown"); + + assert_eq!( + finalize_name(None, None, &Some("test".to_string()), &mut seen_names), + "test (1)" + ); + + assert_eq!( + finalize_name(None, None, &None, &mut seen_names), + "Unknown (1)" + ); + + assert_eq!( + finalize_name(Some(String::default()), None, &None, &mut seen_names), + "Unknown (2)" + ); + + assert_eq!( + finalize_name(None, Some(String::default()), &None, &mut seen_names), + "Unknown (3)" + ); + + assert_eq!( + finalize_name(None, None, &Some(String::default()), &mut seen_names), + "Unknown (4)" + ); + } } diff --git a/src/app/data_harvester/temperature/nvidia.rs b/src/app/data_harvester/temperature/nvidia.rs index be51e45b..00368157 100644 --- a/src/app/data_harvester/temperature/nvidia.rs +++ b/src/app/data_harvester/temperature/nvidia.rs @@ -6,13 +6,14 @@ use super::{ }; use crate::app::Filter; use crate::data_harvester::nvidia::NVML_DATA; +use crate::utils::error; pub fn add_nvidia_data( temperature_vec: &mut Vec, temp_type: &TemperatureType, filter: &Option, -) -> crate::utils::error::Result<()> { +) -> error::Result<()> { if let Ok(nvml) = &*NVML_DATA { - if let Ok(ngpu) = nvml.device_count() { - for i in 0..ngpu { + if let Ok(gpu_num) = nvml.device_count() { + for i in 0..gpu_num { if let Ok(device) = nvml.device_by_index(i) { if let (Ok(name), Ok(temperature)) = (device.name(), device.temperature(TemperatureSensor::Gpu))