diff --git a/README.md b/README.md new file mode 100644 index 0000000..26c3d26 --- /dev/null +++ b/README.md @@ -0,0 +1,19 @@ +# Super-frpc Watchdog for GNU/Linux `systemd` + +## Basic Configuration + +Create a `config.json` file in the project root: + +```json +{ + "debugMode": false +} +``` + +| Field | Description | +| :---: | :---: | +| `debugMode` | Debug mode status | + +## TODO + +- [x] Fix log trace eats too much disk io diff --git a/errorHandle/processor.go b/errorHandle/processor.go index b2bec39..b34ba3d 100644 --- a/errorHandle/processor.go +++ b/errorHandle/processor.go @@ -1,12 +1,13 @@ package errorHandle import ( + "errors" "fmt" "os/exec" "time" - "Watchdog_Linux-systemd/postLog" "Watchdog_Linux-systemd/global" + "Watchdog_Linux-systemd/postLog" ) type ServiceStatusChecker func(serviceName string) bool @@ -35,5 +36,5 @@ func HandleErrorProcess(serviceName string, isServiceRunning ServiceStatusChecke serviceControl.ErrorType = "restart" serviceControl.ErrorMsg = fmt.Sprintf("Failed to recover service '%s', retry count: %d", serviceName, serviceControl.RetryCount) serviceControl.ErrorTime = time.Now() - return fmt.Errorf(serviceControl.ErrorMsg) -} \ No newline at end of file + return errors.New(serviceControl.ErrorMsg) +} diff --git a/monitor/monitor.go b/monitor/monitor.go index 7a68d26..5082f6a 100644 --- a/monitor/monitor.go +++ b/monitor/monitor.go @@ -2,8 +2,8 @@ package monitor import ( "Watchdog_Linux-systemd/errorHandle" - "Watchdog_Linux-systemd/postLog" "Watchdog_Linux-systemd/global" + "Watchdog_Linux-systemd/postLog" "Watchdog_Linux-systemd/socket" "fmt" "os/exec" @@ -11,8 +11,6 @@ import ( "time" ) - - func AddServiceMonitor(serviceName string) error { global.MonitorsMutex.Lock() defer global.MonitorsMutex.Unlock() @@ -68,53 +66,74 @@ func runMonitor(m *global.ServiceMonitor) { postLog.Info(fmt.Sprintf("[Monitor] Stopped monitoring service: %s", m.ServiceName)) return case <-ticker.C: - serviceStatus, err := checkServiceStatus(m.ServiceName) + serviceStatus, statusDetail, err := checkServiceHealth(m.ServiceName) if err != nil { throwException(m.ServiceName, fmt.Sprintf("Failed to check service status: %v", err)) continue } - logStatus, err := checkServiceLogs(m.ServiceName) - if err != nil { - throwException(m.ServiceName, fmt.Sprintf("Failed to check service logs: %v", err)) - continue - } - - if !serviceStatus || !logStatus { - throwException(m.ServiceName, fmt.Sprintf("Service check failed - Status: %v, Logs: %v", serviceStatus, logStatus)) + if !serviceStatus { + throwException(m.ServiceName, fmt.Sprintf("Service check failed - %s", statusDetail)) } } } } -func checkServiceStatus(serviceName string) (bool, error) { - cmd := exec.Command("systemctl", "is-active", serviceName) +func checkServiceHealth(serviceName string) (bool, string, error) { + cmd := exec.Command( + "systemctl", + "show", + serviceName, + "--property=ActiveState", + "--property=SubState", + "--property=Result", + "--property=ExecMainStatus", + "--property=ExecMainCode", + "--property=NRestarts", + ) output, err := cmd.Output() if err != nil { - return false, fmt.Errorf("failed to execute systemctl: %w", err) + return false, "", fmt.Errorf("failed to execute systemctl show: %w", err) } - status := strings.TrimSpace(string(output)) - return status == "active", nil -} - -func checkServiceLogs(serviceName string) (bool, error) { - cmd := exec.Command("journalctl", "-u", serviceName, "-n", "50", "--no-pager") - output, err := cmd.Output() - if err != nil { - return false, fmt.Errorf("failed to execute journalctl: %w", err) - } - - logContent := string(output) - errorKeywords := []string{"error", "fatal", "failed", "critical", "exception"} - - for _, keyword := range errorKeywords { - if strings.Contains(strings.ToLower(logContent), keyword) { - return false, nil + properties := make(map[string]string) + for _, line := range strings.Split(strings.TrimSpace(string(output)), "\n") { + key, value, found := strings.Cut(line, "=") + if !found { + continue } + properties[strings.TrimSpace(key)] = strings.TrimSpace(value) } - return true, nil + if len(properties) == 0 { + return false, "", fmt.Errorf("unexpected systemctl show output: %q", strings.TrimSpace(string(output))) + } + + activeState := properties["ActiveState"] + subState := properties["SubState"] + result := properties["Result"] + execMainStatus := properties["ExecMainStatus"] + execMainCode := properties["ExecMainCode"] + nRestarts := properties["NRestarts"] + + detail := fmt.Sprintf("ActiveState=%s, SubState=%s, Result=%s, ExecMainStatus=%s, ExecMainCode=%s", activeState, subState, result, execMainStatus, execMainCode) + if nRestarts != "" { + detail = fmt.Sprintf("%s, NRestarts=%s", detail, nRestarts) + } + + if activeState != "active" { + return false, detail, nil + } + + if result != "" && result != "success" { + return false, detail, nil + } + + if execMainStatus != "" && execMainStatus != "0" { + return false, detail, nil + } + + return true, detail, nil } func IsServiceExist(serviceName string) bool { @@ -126,15 +145,10 @@ func IsServiceExist(serviceName string) bool { } func IsServiceRunning(serviceName string) bool { - status, err := checkServiceStatus(serviceName) + status, _, err := checkServiceHealth(serviceName) if err != nil || !status { return false } - - logStatus, err := checkServiceLogs(serviceName) - if err != nil || !logStatus { - return false - } return true }