feat(errorHandle): add service error handling and recovery logic
Implement an error-handling mechanism with retry logic for failed services. Includes:
- ServiceControl struct to track service state
- HandleErrorProcess function to attempt service restarts
- Integration with the monitor to automatically recover services
- Enhanced exception handling with status checks and monitor restart
This commit is contained in:
36
errorHandle/processor.go
Normal file
36
errorHandle/processor.go
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
package errorHandle
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os/exec"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"Watchdog_Linux-systemd/postLog"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ServiceStatusChecker is a callback that receives a service name and
// returns a bool. HandleErrorProcess invokes it after a restart command has
// exited successfully and treats a true result as "the service is running".
type ServiceStatusChecker func(serviceName string) bool
|
||||||
|
|
||||||
|
func HandleErrorProcess(serviceName string, isServiceRunning ServiceStatusChecker) error {
|
||||||
|
postLog.Debug(fmt.Sprintf("[HandleErrorProcess] Start handle error process for service: %s", serviceName))
|
||||||
|
serviceControl := &ServiceControl{
|
||||||
|
ServiceName: serviceName,
|
||||||
|
RetryCount: 0,
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := 0; i < 5; i++ {
|
||||||
|
serviceControl.RetryCount++
|
||||||
|
postLog.Debug(fmt.Sprintf("[HandleErrorProcess] Try to restart service '%s', retry count: %d", serviceName, serviceControl.RetryCount))
|
||||||
|
cmd := exec.Command("systemctl", "restart", serviceName)
|
||||||
|
err := cmd.Run()
|
||||||
|
if err == nil {
|
||||||
|
if isServiceRunning != nil && isServiceRunning(serviceName) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
time.Sleep(time.Duration(i+1) * time.Second)
|
||||||
|
}
|
||||||
|
serviceControl.ErrorType = "restart"
|
||||||
|
serviceControl.ErrorMsg = fmt.Sprintf("Failed to recover service '%s', retry count: %d", serviceName, serviceControl.RetryCount)
|
||||||
|
serviceControl.ErrorTime = time.Now()
|
||||||
|
return fmt.Errorf(serviceControl.ErrorMsg)
|
||||||
|
}
|
||||||
13
errorHandle/vars.go
Normal file
13
errorHandle/vars.go
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
package errorHandle
|
||||||
|
|
||||||
|
import (
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ServiceControl holds the bookkeeping for one service recovery attempt:
// which service is being handled, how many restarts have been tried, and the
// details of the failure if recovery did not succeed.
type ServiceControl struct {
	// ServiceName is the name of the service being handled.
	ServiceName string
	// RetryCount is the number of restart attempts made so far.
	RetryCount int
	// ErrorType is a short category for the failure and ErrorMsg is its
	// human-readable description.
	ErrorType, ErrorMsg string
	// ErrorTime is the moment the failure was recorded.
	ErrorTime time.Time
}
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
package monitor
|
package monitor
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"Watchdog_Linux-systemd/errorHandle"
|
||||||
"Watchdog_Linux-systemd/postLog"
|
"Watchdog_Linux-systemd/postLog"
|
||||||
"Watchdog_Linux-systemd/socket"
|
"Watchdog_Linux-systemd/socket"
|
||||||
"fmt"
|
"fmt"
|
||||||
@@ -122,8 +123,41 @@ func checkServiceLogs(serviceName string) (bool, error) {
|
|||||||
return true, nil
|
return true, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func IsServiceExist(serviceName string) bool {
|
||||||
|
_, exists := monitors[serviceName]
|
||||||
|
if !exists {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
func IsServiceRunning(serviceName string) bool {
|
||||||
|
status, err := checkServiceStatus(serviceName)
|
||||||
|
if err != nil || !status {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
logStatus, err := checkServiceLogs(serviceName)
|
||||||
|
if err != nil || !logStatus {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
func throwException(serviceName, errorContent string) error {
|
func throwException(serviceName, errorContent string) error {
|
||||||
postLog.Error(fmt.Sprintf("[Monitor] Service: %s - Exception: %s", serviceName, errorContent))
|
postLog.Error(fmt.Sprintf("[Monitor] Service: %s - Exception: %s", serviceName, errorContent))
|
||||||
|
|
||||||
|
monitorsMutex.Lock()
|
||||||
|
monitor, exists := monitors[serviceName]
|
||||||
|
if exists {
|
||||||
|
close(monitor.StopChan)
|
||||||
|
monitor.Running = false
|
||||||
|
delete(monitors, serviceName)
|
||||||
|
}
|
||||||
|
monitorsMutex.Unlock()
|
||||||
|
|
||||||
|
err := errorHandle.HandleErrorProcess(serviceName, IsServiceRunning)
|
||||||
|
if err != nil {
|
||||||
err := socket.SendMsg(fmt.Sprintf("[Exception] <exceptionType>%s</exceptionType> <serviceName>%s</serviceName> <errorMsg>%s</errorMsg>", "service", serviceName, errorContent))
|
err := socket.SendMsg(fmt.Sprintf("[Exception] <exceptionType>%s</exceptionType> <serviceName>%s</serviceName> <errorMsg>%s</errorMsg>", "service", serviceName, errorContent))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("failed to send exception message: %v", err)
|
return fmt.Errorf("failed to send exception message: %v", err)
|
||||||
@@ -131,6 +165,22 @@ func throwException(serviceName, errorContent string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
monitorsMutex.Lock()
|
||||||
|
stopChan := make(chan struct{})
|
||||||
|
newMonitor := &ServiceMonitor{
|
||||||
|
ServiceName: serviceName,
|
||||||
|
StopChan: stopChan,
|
||||||
|
Running: true,
|
||||||
|
}
|
||||||
|
monitors[serviceName] = newMonitor
|
||||||
|
monitorsMutex.Unlock()
|
||||||
|
|
||||||
|
go runMonitor(newMonitor)
|
||||||
|
postLog.Info(fmt.Sprintf("[Monitor] Service recovered and monitor restarted: %s", serviceName))
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func GetActiveMonitors() []string {
|
func GetActiveMonitors() []string {
|
||||||
monitorsMutex.RLock()
|
monitorsMutex.RLock()
|
||||||
defer monitorsMutex.RUnlock()
|
defer monitorsMutex.RUnlock()
|
||||||
|
|||||||
Reference in New Issue
Block a user