fix(processor): recovery process can be ended in time when the watchdog received stop command.

This commit is contained in:
2026-05-08 17:09:04 +08:00
parent 3ce076b8dc
commit 65f31f28f6
3 changed files with 61 additions and 44 deletions

View File

@@ -6,6 +6,7 @@ import (
"time" "time"
"Watchdog_Linux-systemd/postLog" "Watchdog_Linux-systemd/postLog"
"Watchdog_Linux-systemd/global"
) )
type ServiceStatusChecker func(serviceName string) bool type ServiceStatusChecker func(serviceName string) bool
@@ -18,6 +19,7 @@ func HandleErrorProcess(serviceName string, isServiceRunning ServiceStatusChecke
} }
for i := 0; i < 5; i++ { for i := 0; i < 5; i++ {
if global.Monitors[serviceName].Recovery { // Only recovery process is started
serviceControl.RetryCount++ serviceControl.RetryCount++
postLog.Debug(fmt.Sprintf("[HandleErrorProcess] Try to restart service '%s', retry count: %d", serviceName, serviceControl.RetryCount)) postLog.Debug(fmt.Sprintf("[HandleErrorProcess] Try to restart service '%s', retry count: %d", serviceName, serviceControl.RetryCount))
cmd := exec.Command("systemctl", "restart", serviceName) cmd := exec.Command("systemctl", "restart", serviceName)
@@ -29,6 +31,7 @@ func HandleErrorProcess(serviceName string, isServiceRunning ServiceStatusChecke
} }
time.Sleep(time.Duration(i+1) * time.Second) time.Sleep(time.Duration(i+1) * time.Second)
} }
}
serviceControl.ErrorType = "restart" serviceControl.ErrorType = "restart"
serviceControl.ErrorMsg = fmt.Sprintf("Failed to recover service '%s', retry count: %d", serviceName, serviceControl.RetryCount) serviceControl.ErrorMsg = fmt.Sprintf("Failed to recover service '%s', retry count: %d", serviceName, serviceControl.RetryCount)
serviceControl.ErrorTime = time.Now() serviceControl.ErrorTime = time.Now()

17
global/vars.go Normal file
View File

@@ -0,0 +1,17 @@
package global
import (
"sync"
)
type ServiceMonitor struct {
ServiceName string
StopChan chan struct{}
Running bool
Recovery bool
}
var (
Monitors = make(map[string]*ServiceMonitor)
MonitorsMutex sync.RWMutex
)

View File

@@ -3,41 +3,34 @@ package monitor
import ( import (
"Watchdog_Linux-systemd/errorHandle" "Watchdog_Linux-systemd/errorHandle"
"Watchdog_Linux-systemd/postLog" "Watchdog_Linux-systemd/postLog"
"Watchdog_Linux-systemd/global"
"Watchdog_Linux-systemd/socket" "Watchdog_Linux-systemd/socket"
"fmt" "fmt"
"os/exec" "os/exec"
"strings" "strings"
"sync"
"time" "time"
) )
type ServiceMonitor struct {
ServiceName string
StopChan chan struct{}
Running bool
}
var (
monitors = make(map[string]*ServiceMonitor)
monitorsMutex sync.RWMutex
)
func AddServiceMonitor(serviceName string) error { func AddServiceMonitor(serviceName string) error {
monitorsMutex.Lock() global.MonitorsMutex.Lock()
defer monitorsMutex.Unlock() defer global.MonitorsMutex.Unlock()
if _, exists := monitors[serviceName]; exists { if _, exists := global.Monitors[serviceName]; exists {
if global.Monitors[serviceName].Running {
return fmt.Errorf("service monitor '%s' already exists", serviceName) return fmt.Errorf("service monitor '%s' already exists", serviceName)
} }
}
stopChan := make(chan struct{}) stopChan := make(chan struct{})
monitor := &ServiceMonitor{ monitor := &global.ServiceMonitor{
ServiceName: serviceName, ServiceName: serviceName,
StopChan: stopChan, StopChan: stopChan,
Running: true, Running: true,
} }
monitors[serviceName] = monitor global.Monitors[serviceName] = monitor
go runMonitor(monitor) go runMonitor(monitor)
@@ -46,23 +39,24 @@ func AddServiceMonitor(serviceName string) error {
} }
func RemoveServiceMonitor(serviceName string) error { func RemoveServiceMonitor(serviceName string) error {
monitorsMutex.Lock() global.MonitorsMutex.Lock()
defer monitorsMutex.Unlock() defer global.MonitorsMutex.Unlock()
monitor, exists := monitors[serviceName] monitor, exists := global.Monitors[serviceName]
if !exists { if !exists {
return fmt.Errorf("service monitor '%s' not found", serviceName) return fmt.Errorf("service monitor '%s' not found", serviceName)
} }
close(monitor.StopChan) // close(monitor.StopChan)
monitor.Running = false monitor.Running = false
delete(monitors, serviceName) monitor.Recovery = false
// delete(global.Monitors, serviceName)
postLog.Info(fmt.Sprintf("[Monitor] Removed service monitor for: %s", serviceName)) postLog.Info(fmt.Sprintf("[Monitor] Removed service monitor for: %s", serviceName))
return nil return nil
} }
func runMonitor(m *ServiceMonitor) { func runMonitor(m *global.ServiceMonitor) {
postLog.Info(fmt.Sprintf("[Monitor] Started monitoring service: %s", m.ServiceName)) postLog.Info(fmt.Sprintf("[Monitor] Started monitoring service: %s", m.ServiceName))
ticker := time.NewTicker(5 * time.Second) ticker := time.NewTicker(5 * time.Second)
@@ -124,7 +118,7 @@ func checkServiceLogs(serviceName string) (bool, error) {
} }
func IsServiceExist(serviceName string) bool { func IsServiceExist(serviceName string) bool {
_, exists := monitors[serviceName] _, exists := global.Monitors[serviceName]
if !exists { if !exists {
return false return false
} }
@@ -147,14 +141,15 @@ func IsServiceRunning(serviceName string) bool {
func throwException(serviceName, errorContent string) error { func throwException(serviceName, errorContent string) error {
postLog.Error(fmt.Sprintf("[Monitor] Service: %s - Exception: %s", serviceName, errorContent)) postLog.Error(fmt.Sprintf("[Monitor] Service: %s - Exception: %s", serviceName, errorContent))
monitorsMutex.Lock() global.MonitorsMutex.Lock()
monitor, exists := monitors[serviceName] monitor, exists := global.Monitors[serviceName]
if exists { if exists {
close(monitor.StopChan) close(monitor.StopChan)
monitor.Running = false monitor.Running = false
delete(monitors, serviceName) monitor.Recovery = true
// delete(global.Monitors, serviceName)
} }
monitorsMutex.Unlock() global.MonitorsMutex.Unlock()
err := errorHandle.HandleErrorProcess(serviceName, IsServiceRunning) err := errorHandle.HandleErrorProcess(serviceName, IsServiceRunning)
if err != nil { if err != nil {
@@ -165,15 +160,15 @@ func throwException(serviceName, errorContent string) error {
return nil return nil
} }
monitorsMutex.Lock() global.MonitorsMutex.Lock()
stopChan := make(chan struct{}) stopChan := make(chan struct{})
newMonitor := &ServiceMonitor{ newMonitor := &global.ServiceMonitor{
ServiceName: serviceName, ServiceName: serviceName,
StopChan: stopChan, StopChan: stopChan,
Running: true, Running: true,
} }
monitors[serviceName] = newMonitor global.Monitors[serviceName] = newMonitor
monitorsMutex.Unlock() global.MonitorsMutex.Unlock()
go runMonitor(newMonitor) go runMonitor(newMonitor)
postLog.Info(fmt.Sprintf("[Monitor] Service recovered and monitor restarted: %s", serviceName)) postLog.Info(fmt.Sprintf("[Monitor] Service recovered and monitor restarted: %s", serviceName))
@@ -182,12 +177,14 @@ func throwException(serviceName, errorContent string) error {
} }
func GetActiveMonitors() []string { func GetActiveMonitors() []string {
monitorsMutex.RLock() global.MonitorsMutex.RLock()
defer monitorsMutex.RUnlock() defer global.MonitorsMutex.RUnlock()
var activeServices []string var activeServices []string
for name := range monitors { for name, monitor := range global.Monitors {
if monitor.Running {
activeServices = append(activeServices, name) activeServices = append(activeServices, name)
} }
}
return activeServices return activeServices
} }