feat(errorHandle): add service error handling and recovery logic

Implement error handling mechanism with retry logic for failed services. Includes:
- ServiceControl struct to track service state
- HandleErrorProcess function to attempt service restarts
- Integration with monitor to automatically recover services
- Enhanced exception handling with status checks and monitoring restart
This commit is contained in:
2026-04-29 12:11:25 +08:00
parent db192f2209
commit 3ce076b8dc
3 changed files with 102 additions and 3 deletions

36
errorHandle/processor.go Normal file
View File

@@ -0,0 +1,36 @@
package errorHandle
import (
"fmt"
"os/exec"
"time"
"Watchdog_Linux-systemd/postLog"
)
// ServiceStatusChecker is a callback that reports whether the service with the
// given name is currently considered running. HandleErrorProcess calls it
// after a restart command succeeds to decide whether recovery is complete.
type ServiceStatusChecker func(serviceName string) bool
// HandleErrorProcess tries to recover a failed service by running
// "systemctl restart <serviceName>" up to five times.
//
// After each restart command that exits successfully, isServiceRunning is
// consulted; the first attempt for which it reports true ends the recovery and
// nil is returned. When isServiceRunning is nil there is nothing further to
// verify, so a successful restart command alone counts as recovery.
//
// Between attempts the function sleeps for a linearly growing delay
// (1s, 2s, ...). No delay is added after the final attempt. If every attempt
// fails, a non-nil error naming the service and the number of attempts is
// returned.
func HandleErrorProcess(serviceName string, isServiceRunning ServiceStatusChecker) error {
	// Upper bound on restart attempts before giving up.
	const maxRestartAttempts = 5

	postLog.Debug(fmt.Sprintf("[HandleErrorProcess] Start handle error process for service: %s", serviceName))

	serviceControl := &ServiceControl{
		ServiceName: serviceName,
		RetryCount:  0,
	}

	for attempt := 1; attempt <= maxRestartAttempts; attempt++ {
		serviceControl.RetryCount = attempt
		postLog.Debug(fmt.Sprintf("[HandleErrorProcess] Try to restart service '%s', retry count: %d", serviceName, serviceControl.RetryCount))

		err := exec.Command("systemctl", "restart", serviceName).Run()
		if err != nil {
			// Keep the cause visible instead of silently dropping it.
			postLog.Debug(fmt.Sprintf("[HandleErrorProcess] Restart command for service '%s' failed: %v", serviceName, err))
		} else if isServiceRunning == nil || isServiceRunning(serviceName) {
			return nil
		}

		// Back off before the next attempt; waiting after the last one would
		// only delay the failure report.
		if attempt < maxRestartAttempts {
			time.Sleep(time.Duration(attempt) * time.Second)
		}
	}

	serviceControl.ErrorType = "restart"
	serviceControl.ErrorMsg = fmt.Sprintf("Failed to recover service '%s', retry count: %d", serviceName, serviceControl.RetryCount)
	serviceControl.ErrorTime = time.Now()

	// The message is data, not a format string: a '%' in the service name must
	// not be interpreted as a formatting verb.
	return fmt.Errorf("%s", serviceControl.ErrorMsg)
}

13
errorHandle/vars.go Normal file
View File

@@ -0,0 +1,13 @@
package errorHandle
import (
"time"
)
// ServiceControl records the recovery state of a single service: which service
// is being handled, how many restart attempts were made, and details of the
// most recent failure.
type ServiceControl struct {
	// ServiceName is the name of the service being recovered.
	ServiceName string
	// RetryCount is the number of restart attempts made so far.
	RetryCount int

	// ErrorType is a short category for the failure; ErrorMsg is the
	// human-readable description.
	ErrorType, ErrorMsg string
	// ErrorTime is the moment the failure was recorded.
	ErrorTime time.Time
}

View File

@@ -1,6 +1,7 @@
package monitor
import (
"Watchdog_Linux-systemd/errorHandle"
"Watchdog_Linux-systemd/postLog"
"Watchdog_Linux-systemd/socket"
"fmt"
@@ -122,12 +123,61 @@ func checkServiceLogs(serviceName string) (bool, error) {
return true, nil
}
func IsServiceExist(serviceName string) bool {
_, exists := monitors[serviceName]
if !exists {
return false
}
return true
}
// IsServiceRunning reports whether serviceName passes both health checks:
// checkServiceStatus and then checkServiceLogs. Either check returning an
// error or a false result yields false; the log check is skipped when the
// status check already failed.
func IsServiceRunning(serviceName string) bool {
	if active, err := checkServiceStatus(serviceName); err != nil || !active {
		return false
	}

	logsOK, err := checkServiceLogs(serviceName)
	return err == nil && logsOK
}
func throwException(serviceName, errorContent string) error {
postLog.Error(fmt.Sprintf("[Monitor] Service: %s - Exception: %s", serviceName, errorContent))
err := socket.SendMsg(fmt.Sprintf("[Exception] <exceptionType>%s</exceptionType> <serviceName>%s</serviceName> <errorMsg>%s</errorMsg>", "service", serviceName, errorContent))
if err != nil {
return fmt.Errorf("failed to send exception message: %v", err)
monitorsMutex.Lock()
monitor, exists := monitors[serviceName]
if exists {
close(monitor.StopChan)
monitor.Running = false
delete(monitors, serviceName)
}
monitorsMutex.Unlock()
err := errorHandle.HandleErrorProcess(serviceName, IsServiceRunning)
if err != nil {
err := socket.SendMsg(fmt.Sprintf("[Exception] <exceptionType>%s</exceptionType> <serviceName>%s</serviceName> <errorMsg>%s</errorMsg>", "service", serviceName, errorContent))
if err != nil {
return fmt.Errorf("failed to send exception message: %v", err)
}
return nil
}
monitorsMutex.Lock()
stopChan := make(chan struct{})
newMonitor := &ServiceMonitor{
ServiceName: serviceName,
StopChan: stopChan,
Running: true,
}
monitors[serviceName] = newMonitor
monitorsMutex.Unlock()
go runMonitor(newMonitor)
postLog.Info(fmt.Sprintf("[Monitor] Service recovered and monitor restarted: %s", serviceName))
return nil
}