mirror of
https://github.com/vale981/phoebe
synced 2025-03-05 09:51:37 -05:00
New module: phoebe.services.monitoring
Basic monitoring and alerting with netdata.
This commit is contained in:
parent
3129ab3fa7
commit
4846f4ccfa
4 changed files with 226 additions and 0 deletions
|
@ -4,6 +4,7 @@
|
|||
imports = [
|
||||
./builder
|
||||
./databases
|
||||
./monitoring
|
||||
./web
|
||||
];
|
||||
}
|
||||
|
|
125
modules/services/monitoring/charts.d/services.chart.sh
Executable file
125
modules/services/monitoring/charts.d/services.chart.sh
Executable file
|
@ -0,0 +1,125 @@
|
|||
#!/bin/bash
|
||||
# Taken from: https://github.com/mo0nsniper/netdata/commit/157b6e04b1931f57f16433fae42e028c525bd5cb
|
||||
# no need for shebang - this file is loaded from charts.d.plugin
|
||||
|
||||
# if this chart is called X.chart.sh, then all functions and global variables
|
||||
# must start with X_
|
||||
|
||||
# services_update_every is special: it is the number of seconds between
# successive calls of services_update().
services_update_every=2

# Charts are sorted on the dashboard by priority; 1 would be the first chart.
services_priority=60000

# Module-wide state filled in by services_get().
# Reminder: every global must be prefixed with the module name (services_).
declare -a services_service services_status

services_running=
services_dead=
services_exited=
services_failed=
|
||||
|
||||
services_get() {
    # Collect the state of every loaded systemd service unit.
    #
    # Fills the module globals:
    #   services_service  unit names with the trailing suffix (".service")
    #                     stripped
    #   services_status   exactly one value per entry of services_service:
    #                     1=running, -2=dead, -3=exited, -4=failed,
    #                     0=any other sub-state
    #   services_running, services_dead, services_exited, services_failed
    #                     number of units seen in each of those four states
    #
    # Returns:
    #   0 to send the data to netdata (this function never reports failure)
    #
    # Remember:
    # 1. KEEP IT SIMPLE AND SHORT
    # 2. AVOID FORKS (avoid piping commands)
    # 3. AVOID CALLING TOO MANY EXTERNAL PROGRAMS
    # 4. USE LOCAL VARIABLES (global variables may overlap with other modules)

    local -a services_line=()
    local services_name

    services_service=()
    services_status=()

    services_running=0
    services_dead=0
    services_exited=0
    services_failed=0

    # Each line is split into words: [0]=unit, [1]=load, [2]=active, [3]=sub.
    while read -a services_line; do
        services_name=${services_line[0]%.*}

        # A blank line carries no unit; skipping it keeps both arrays aligned.
        [ -n "$services_name" ] || continue

        services_service+=("$services_name")

        case "${services_line[3]}" in
            running)
                services_status+=("1")
                services_running=$((services_running + 1))
                ;;
            dead)
                services_status+=("-2")
                services_dead=$((services_dead + 1))
                ;;
            exited)
                services_status+=("-3")
                services_exited=$((services_exited + 1))
                ;;
            failed)
                services_status+=("-4")
                services_failed=$((services_failed + 1))
                ;;
            *)
                # Any other sub-state (auto-restart, start, stop-sigterm, ...)
                # still needs a status slot. Without it every later service
                # would be reported with its neighbour's status.
                services_status+=("0")
                ;;
        esac
    done < <(systemctl --no-legend --no-pager --plain --state=loaded --all --type=service)

    return 0
}
|
||||
|
||||
# _check is called once, to find out if this chart should be enabled or not
|
||||
services_check() {
    # Decide whether this chart should be enabled.
    #
    # Returns:
    #   0 to enable the chart
    #   1 to disable the chart

    # systemctl has to be available ...
    if ! require_cmd systemctl; then
        return 1
    fi

    # ... and one trial collection has to succeed.
    if ! services_get; then
        return 1
    fi

    return 0
}
|
||||
|
||||
# _create is called once, to create the charts
|
||||
services_create() {
    # Print the CHART / DIMENSION definitions for this module on stdout.
    #
    # Reads the globals filled in by services_get(): the four state counters
    # (for the summary chart title) and services_service (one dimension per
    # service on the status chart).
    #
    # Returns:
    #   0 always

    # Locals only: a bare "i" would be a global visible to other modules.
    local services_idx services_total

    services_total=$((services_running + services_dead + services_exited + services_failed))

    # The third DIMENSION field is the algorithm, so it must be "absolute"
    # rather than the current counter value.
    cat <<EOF
CHART Services.summary '' "Summary: ${services_total} services" "Total" Summary summary stacked $((services_priority)) $services_update_every
DIMENSION running '' absolute 1 1
DIMENSION dead '' absolute 1 1
DIMENSION exited '' absolute 1 1
DIMENSION failed '' absolute 1 1
EOF

    echo "CHART Services.status 'System services' 'Status of systemd services: 1=running -2=dead -3=exited -4=failed' 'Status' Services services line $((services_priority + 1)) $services_update_every"
    for ((services_idx = 0; services_idx < ${#services_service[@]}; services_idx++)); do
        echo "DIMENSION ${services_service[$services_idx]} '' absolute 1 1"
    done

    return 0
}
|
||||
|
||||
# _update is called continuously, to collect the values
|
||||
services_update() {
    # Collect fresh values and print them on stdout for netdata.
    #
    # Arguments:
    #   $1  microseconds since the last update; passed through to the BEGIN
    #       statements below.
    #
    # Returns:
    #   0 after the values have been written
    #   1 if services_get reported a failure

    # Local index: a bare "i" would be a global visible to other modules.
    local services_idx

    services_get || return 1

    # Summary chart: one value per state counter.
    cat <<VALUESEOF
BEGIN Services.summary $1
SET running = $services_running
SET dead = $services_dead
SET exited = $services_exited
SET failed = $services_failed
END
VALUESEOF

    # Status chart: one value per service, matched by array index.
    echo "BEGIN Services.status $1"
    for ((services_idx = 0; services_idx < ${#services_service[@]}; services_idx++)); do
        echo "SET ${services_service[$services_idx]} = ${services_status[$services_idx]}"
    done
    echo "END"

    return 0
}
|
83
modules/services/monitoring/default.nix
Normal file
83
modules/services/monitoring/default.nix
Normal file
|
@ -0,0 +1,83 @@
|
|||
# Monitoring and alerting module (netdata based).
{ config, lib, pkgs, ... }:

let
  # Library functions used below.
  inherit (lib)
    concatStringsSep
    mkEnableOption
    mkIf
    mkOption
    optionalString
    types;

  cfg = config.phoebe.services.monitoring;

  # Package with our copy of charts.d.plugin and the extra charts.
  plugins = import ./plugins.nix { inherit (pkgs) stdenvNoCC netdata; };

  # Notification settings; the file is empty unless Pushover is enabled.
  # Note that the API key and user keys are embedded in the file's text.
  alarmNotifyConf = pkgs.writeText "health_alarm_notify.conf"
    (optionalString cfg.pushover.enable ''
      SEND_PUSHOVER=YES
      PUSHOVER_APP_TOKEN="${cfg.pushover.apiKey}"
      DEFAULT_RECIPIENT_PUSHOVER="${concatStringsSep "," cfg.pushover.userKeys}"
    '');

  # Health alarm evaluated against the Services.status chart.
  alarmConf = pkgs.writeText "alarms.conf" ''
    # Send alarms for systemd services.
    alarm: failed_service
    on: Services.status
    os: linux
    hosts: *
    lookup: min -1m unaligned
    every: 1m
    crit: $this < -3 && $this >= -4
    info: service is failed
    to: sysadmin
  '';
in
{
  #### Interface
  options.phoebe.services.monitoring = {
    enable = mkEnableOption "Monitoring and Reporting.";

    pushover = {
      enable = mkEnableOption "Alerts via Pushover.";

      apiKey = mkOption {
        type = types.str;
        example = "1234567890abcdefghijklmnopqrst";
        description = "Pushover API key for netdata";
      };

      userKeys = mkOption {
        type = types.listOf types.str;
        example = [ "1234567890abcdefghijklmnopqrst" ];
        description = "List of user keys.";
      };
    };
  };

  #### Implementation
  config = mkIf cfg.enable {
    # Enable systemd accounting:
    systemd.enableCgroupAccounting = true;

    # Use netdata to collect metrics:
    services.netdata = {
      enable = true;
      extraPluginPaths = [ "${plugins}/plugins.d" ];

      config = {
        # Send all three netdata logs to syslog.
        global = {
          "debug log" = "syslog";
          "access log" = "syslog";
          "error log" = "syslog";
        };

        # Turn on our renamed copy of charts.d.plugin.
        plugins = {
          "phoebe.charts.d.plugin" = "yes";
        };
      };
    };

    # Install the generated netdata configuration files under /etc.
    environment.etc = {
      "netdata/health_alarm_notify.conf" = {
        source = "${alarmNotifyConf}";
        mode = "0444";
      };

      "netdata/health.d/alarm.conf" = {
        source = "${alarmConf}";
        mode = "0444";
      };
    };
  };
}
|
17
modules/services/monitoring/plugins.nix
Normal file
17
modules/services/monitoring/plugins.nix
Normal file
|
@ -0,0 +1,17 @@
|
|||
# Builds a directory tree holding a renamed copy of netdata's
# charts.d.plugin together with our own chart scripts.
{ stdenvNoCC, netdata }:

stdenvNoCC.mkDerivation {
  name = "netdata-extra-scripts";

  # Nothing to unpack or build; only install and fix up.
  phases = [ "installPhase" "fixupPhase" ];

  installPhase = ''
    pluginDir=$out/plugins.d
    chartDir=$out/charts.d

    mkdir -p $pluginDir $chartDir
    install -m 0555 ${netdata}/libexec/netdata/plugins.d/charts.d.plugin $pluginDir/phoebe.charts.d.plugin
    install -m 0555 ${./charts.d/services.chart.sh} $chartDir/services.chart.sh

    # Point our copy of charts.d.plugin at the charts.d directory above:
    sed -i "s|^chartsd=.*|chartsd=$chartDir|" $pluginDir/phoebe.charts.d.plugin
  '';
}
|
Loading…
Add table
Reference in a new issue