diff --git a/errorHandle/processor.go b/errorHandle/processor.go new file mode 100644 index 0000000..72b4026 --- /dev/null +++ b/errorHandle/processor.go @@ -0,0 +1,36 @@ +package errorHandle + +import ( + "fmt" + "os/exec" + "time" + + "Watchdog_Linux-systemd/postLog" +) + +type ServiceStatusChecker func(serviceName string) bool + +func HandleErrorProcess(serviceName string, isServiceRunning ServiceStatusChecker) error { + postLog.Debug(fmt.Sprintf("[HandleErrorProcess] Start handle error process for service: %s", serviceName)) + serviceControl := &ServiceControl{ + ServiceName: serviceName, + RetryCount: 0, + } + + for i := 0; i < 5; i++ { + serviceControl.RetryCount++ + postLog.Debug(fmt.Sprintf("[HandleErrorProcess] Try to restart service '%s', retry count: %d", serviceName, serviceControl.RetryCount)) + cmd := exec.Command("systemctl", "restart", serviceName) + err := cmd.Run() + if err == nil { + if isServiceRunning != nil && isServiceRunning(serviceName) { + return nil + } + } + time.Sleep(time.Duration(i+1) * time.Second) + } + serviceControl.ErrorType = "restart" + serviceControl.ErrorMsg = fmt.Sprintf("Failed to recover service '%s', retry count: %d", serviceName, serviceControl.RetryCount) + serviceControl.ErrorTime = time.Now() + return fmt.Errorf(serviceControl.ErrorMsg) +} diff --git a/errorHandle/vars.go b/errorHandle/vars.go new file mode 100644 index 0000000..17d24f9 --- /dev/null +++ b/errorHandle/vars.go @@ -0,0 +1,13 @@ +package errorHandle + +import ( + "time" +) + +type ServiceControl struct { + ServiceName string + RetryCount int + ErrorType string + ErrorMsg string + ErrorTime time.Time +} \ No newline at end of file diff --git a/monitor/monitor.go b/monitor/monitor.go index b579755..c14a8cc 100644 --- a/monitor/monitor.go +++ b/monitor/monitor.go @@ -1,6 +1,7 @@ package monitor import ( + "Watchdog_Linux-systemd/errorHandle" "Watchdog_Linux-systemd/postLog" "Watchdog_Linux-systemd/socket" "fmt" @@ -122,12 +123,61 @@ func checkServiceLogs(serviceName string) (bool, error) { return true, nil } +func IsServiceExist(serviceName string) bool { + _, exists := monitors[serviceName] + if !exists { + return false + } + return true +} + +func IsServiceRunning(serviceName string) bool { + status, err := checkServiceStatus(serviceName) + if err != nil || !status { + return false + } + + logStatus, err := checkServiceLogs(serviceName) + if err != nil || !logStatus { + return false + } + return true +} + func throwException(serviceName, errorContent string) error { postLog.Error(fmt.Sprintf("[Monitor] Service: %s - Exception: %s", serviceName, errorContent)) - err := socket.SendMsg(fmt.Sprintf("[Exception] %s %s %s", "service", serviceName, errorContent)) - if err != nil { - return fmt.Errorf("failed to send exception message: %v", err) + + monitorsMutex.Lock() + monitor, exists := monitors[serviceName] + if exists { + close(monitor.StopChan) + monitor.Running = false + delete(monitors, serviceName) } + monitorsMutex.Unlock() + + err := errorHandle.HandleErrorProcess(serviceName, IsServiceRunning) + if err != nil { + err := socket.SendMsg(fmt.Sprintf("[Exception] %s %s %s", "service", serviceName, errorContent)) + if err != nil { + return fmt.Errorf("failed to send exception message: %v", err) + } + return nil + } + + monitorsMutex.Lock() + stopChan := make(chan struct{}) + newMonitor := &ServiceMonitor{ + ServiceName: serviceName, + StopChan: stopChan, + Running: true, + } + monitors[serviceName] = newMonitor + monitorsMutex.Unlock() + + go runMonitor(newMonitor) + postLog.Info(fmt.Sprintf("[Monitor] Service recovered and monitor restarted: %s", serviceName)) + return nil }