Results
Developed an NPU chip monitoring tool as a shell script, similar to nvidia-smi; it extracts values from files under /sys/class/xxx.
# for each device, print status
# Collect the sysfs directory of every NPU device into the global array
# `controls`, which the tables below iterate over. Paths are read NUL-delimited
# so nothing is word-split or glob-expanded, and sorted so rows come out in a
# stable order.
# NOTE(review): the pattern matches exactly one trailing digit, so a device
# index of 10 or more would not be picked up — confirm that is intended.
controls=()
readarray -d '' -t controls < <(find /sys/class/xxx/ -iname 'xxx[0-9]' -print0 | sort -z)

#######################################
# Print the device status table: a header followed by a two-line entry
# (id / names, then temperature and power) for every device directory given.
# Globals:   PROJECT_NAME, BOARD_NAME, CAP_POWER (read). They are not assigned
#            in this section; presumably provided by the environment.
# Arguments: device directories, each holding an xxx_status_monitor file
# Outputs:   the table on stdout; a warning on stderr for each skipped device
#######################################
print_device_table() {
  local dev_dir monitor_file raw id power
  local -a fields

  printf "+-----------------------------------------------+\n"
  printf "| xxx ID\txxx NAME / BOARD NAME\t\t|\n"
  printf "| Fan\tTemp\tPwr:Usage/Cap\t\t\t|\n"
  printf "|===============================================|\n"

  for dev_dir in "$@"; do
    monitor_file="${dev_dir}/xxx_status_monitor"
    if ! raw=$(cat -- "$monitor_file"); then
      printf 'warning: cannot read %s; skipping device\n' "$monitor_file" >&2
      continue
    fi

    # Whitespace-separated fields:
    #   [0] raw power reading  [1] prod_id  [2] tmu  [3] temp0  [4] temp1
    # `read -d ''` consumes the whole content and returns non-zero at EOF,
    # hence the `|| true`.
    fields=()
    IFS=$' \t\n' read -r -d '' -a fields <<<"$raw" || true
    if [[ ! ${fields[0]:-} =~ ^[0-9]+$ || ! ${fields[3]:-} =~ ^-?[0-9]+$ ]]; then
      printf 'warning: unexpected content in %s; skipping device\n' "$monitor_file" >&2
      continue
    fi

    # Device index = digits of the directory name only, so digits elsewhere in
    # the path cannot leak into the id; 10# avoids octal interpretation.
    id=${dev_dir##*/}
    id=${id//[!0-9]/}
    id=$((10#${id:-0}))

    # Raw reading * 0.025 gives watts; integer math (x * 25 / 1000) truncates
    # exactly like the former bc + strip-fraction approach, but never yields
    # an empty string for readings below 1 W.
    power=$((10#${fields[0]} * 25 / 1000))

    printf "| %-2d\t\t%-20s\t\t|\n" "$id" "${PROJECT_NAME:-} / ${BOARD_NAME:-}"
    printf "| N/A\t%02dC\t%3dW / %3dW\t\t\t|\n" "${fields[3]}" "$power" "${CAP_POWER:-0}"
    printf "+-----------------------------------------------+\n"
  done
}

date '+%Y-%m-%d %H:%M:%S'
print_device_table "${controls[@]}"
#######################################
# Print the process table: a header followed by one row per device whose
# status file reports a non-zero PID in field [5].
# Arguments: device directories, each holding an xxx_status_monitor file
# Outputs:   the table on stdout; a warning on stderr for unreadable devices
#######################################
print_process_table() {
  local dev_dir monitor_file raw id pid proc_name
  local -a fields

  printf "+-----------------------------------------------+\n"
  printf "| Processes:\t\t\t\t\t|\n"
  printf "| xxx ID\tPID\tProcess Name\t\t|\n"
  printf "|===============================================|\n"

  for dev_dir in "$@"; do
    monitor_file="${dev_dir}/xxx_status_monitor"
    if ! raw=$(cat -- "$monitor_file"); then
      printf 'warning: cannot read %s; skipping device\n' "$monitor_file" >&2
      continue
    fi

    # Split the whole content on whitespace without glob expansion;
    # `read -d ''` returns non-zero at EOF, hence the `|| true`.
    fields=()
    IFS=$' \t\n' read -r -d '' -a fields <<<"$raw" || true

    # A missing, non-numeric or zero PID field means there is no row to print
    # for this device.
    pid=${fields[5]:-0}
    if [[ ! $pid =~ ^[0-9]+$ ]]; then
      continue
    fi
    pid=$((10#$pid))
    if ((pid == 0)); then
      continue
    fi

    # Device index = digits of the directory name only, so digits elsewhere in
    # the path cannot leak into the id; 10# avoids octal interpretation.
    id=${dev_dir##*/}
    id=${id//[!0-9]/}
    id=$((10#${id:-0}))

    # `comm=` suppresses the header line, so the whole output is the command
    # name (including any spaces). If the process has already exited the name
    # is left empty and the row is still printed.
    proc_name=$(ps -p "$pid" -o comm=) || proc_name=""

    printf "| %-2d\t\t%-5d\t%-10s\t\t|\n" "$id" "$pid" "$proc_name"
    printf "+-----------------------------------------------+\n"
  done
}

print_process_table "${controls[@]}"
(base) kade/npu-exporter/exporter$ go run cmd/npu-exporter/main.go
2024/07/11 13:58:42 Beginning to serve on port 9400
Collected metrics for NPU xxx0:
Power usage: 4.98 W
Temperature 0: 43.50 °C
Temperature 1: 48.00 °C
Process ID: 1036902
Collected metrics for NPU xxx0:
Power usage: 4.95 W
Temperature 0: 43.50 °C
Temperature 1: 48.00 °C
Process ID: 1036902