fix(processor): the recovery process can now be ended in time when the watchdog receives a stop command.

This commit is contained in:
2026-05-08 17:09:04 +08:00
parent 3ce076b8dc
commit 65f31f28f6
3 changed files with 61 additions and 44 deletions

View File

@@ -6,6 +6,7 @@ import (
"time"
"Watchdog_Linux-systemd/postLog"
"Watchdog_Linux-systemd/global"
)
// ServiceStatusChecker is a callback that takes a service name and returns a
// bool. HandleErrorProcess calls it after a successful `systemctl restart`
// and treats a true result as a completed recovery.
type ServiceStatusChecker func(serviceName string) bool
@@ -18,16 +19,18 @@ func HandleErrorProcess(serviceName string, isServiceRunning ServiceStatusChecke
}
for i := 0; i < 5; i++ {
serviceControl.RetryCount++
postLog.Debug(fmt.Sprintf("[HandleErrorProcess] Try to restart service '%s', retry count: %d", serviceName, serviceControl.RetryCount))
cmd := exec.Command("systemctl", "restart", serviceName)
err := cmd.Run()
if err == nil {
if isServiceRunning != nil && isServiceRunning(serviceName) {
return nil
if global.Monitors[serviceName].Recovery { // Only recovery process is started
serviceControl.RetryCount++
postLog.Debug(fmt.Sprintf("[HandleErrorProcess] Try to restart service '%s', retry count: %d", serviceName, serviceControl.RetryCount))
cmd := exec.Command("systemctl", "restart", serviceName)
err := cmd.Run()
if err == nil {
if isServiceRunning != nil && isServiceRunning(serviceName) {
return nil
}
}
time.Sleep(time.Duration(i+1) * time.Second)
}
time.Sleep(time.Duration(i+1) * time.Second)
}
serviceControl.ErrorType = "restart"
serviceControl.ErrorMsg = fmt.Sprintf("Failed to recover service '%s', retry count: %d", serviceName, serviceControl.RetryCount)

17
global/vars.go Normal file
View File

@@ -0,0 +1,17 @@
package global
import (
"sync"
)
// ServiceMonitor holds the per-service state shared between the monitor
// package (which creates and updates entries) and the errorHandle package
// (which reads the Recovery flag while retrying a restart).
type ServiceMonitor struct {
// ServiceName is the name of the monitored service; it is also the key
// under which the entry is stored in Monitors.
ServiceName string
// StopChan is a signalling channel created together with the monitor.
// NOTE(review): presumably the monitoring goroutine selects on it to stop;
// that loop is not visible here — confirm.
StopChan chan struct{}
// Running is set to true when the monitor is created and set to false when
// the monitor is removed or an exception is thrown for the service.
Running bool
// Recovery is set to true before the error-recovery process is started and
// set back to false when the monitor is removed; the recovery loop checks
// it before each restart attempt.
Recovery bool
}
// Package-level monitor registry shared across packages.
var (
// Monitors maps a service name to its ServiceMonitor entry.
Monitors = make(map[string]*ServiceMonitor)
// MonitorsMutex is the lock taken around accesses to Monitors.
// NOTE(review): every reader and writer of Monitors should hold it,
// including lookups made from other packages — verify all call sites.
MonitorsMutex sync.RWMutex
)

View File

@@ -3,41 +3,34 @@ package monitor
import (
"Watchdog_Linux-systemd/errorHandle"
"Watchdog_Linux-systemd/postLog"
"Watchdog_Linux-systemd/global"
"Watchdog_Linux-systemd/socket"
"fmt"
"os/exec"
"strings"
"sync"
"time"
)
type ServiceMonitor struct {
ServiceName string
StopChan chan struct{}
Running bool
}
var (
monitors = make(map[string]*ServiceMonitor)
monitorsMutex sync.RWMutex
)
func AddServiceMonitor(serviceName string) error {
monitorsMutex.Lock()
defer monitorsMutex.Unlock()
global.MonitorsMutex.Lock()
defer global.MonitorsMutex.Unlock()
if _, exists := monitors[serviceName]; exists {
return fmt.Errorf("service monitor '%s' already exists", serviceName)
if _, exists := global.Monitors[serviceName]; exists {
if global.Monitors[serviceName].Running {
return fmt.Errorf("service monitor '%s' already exists", serviceName)
}
}
stopChan := make(chan struct{})
monitor := &ServiceMonitor{
monitor := &global.ServiceMonitor{
ServiceName: serviceName,
StopChan: stopChan,
Running: true,
}
monitors[serviceName] = monitor
global.Monitors[serviceName] = monitor
go runMonitor(monitor)
@@ -46,23 +39,24 @@ func AddServiceMonitor(serviceName string) error {
}
func RemoveServiceMonitor(serviceName string) error {
monitorsMutex.Lock()
defer monitorsMutex.Unlock()
global.MonitorsMutex.Lock()
defer global.MonitorsMutex.Unlock()
monitor, exists := monitors[serviceName]
monitor, exists := global.Monitors[serviceName]
if !exists {
return fmt.Errorf("service monitor '%s' not found", serviceName)
}
close(monitor.StopChan)
// close(monitor.StopChan)
monitor.Running = false
delete(monitors, serviceName)
monitor.Recovery = false
// delete(global.Monitors, serviceName)
postLog.Info(fmt.Sprintf("[Monitor] Removed service monitor for: %s", serviceName))
return nil
}
func runMonitor(m *ServiceMonitor) {
func runMonitor(m *global.ServiceMonitor) {
postLog.Info(fmt.Sprintf("[Monitor] Started monitoring service: %s", m.ServiceName))
ticker := time.NewTicker(5 * time.Second)
@@ -124,7 +118,7 @@ func checkServiceLogs(serviceName string) (bool, error) {
}
func IsServiceExist(serviceName string) bool {
_, exists := monitors[serviceName]
_, exists := global.Monitors[serviceName]
if !exists {
return false
}
@@ -147,14 +141,15 @@ func IsServiceRunning(serviceName string) bool {
func throwException(serviceName, errorContent string) error {
postLog.Error(fmt.Sprintf("[Monitor] Service: %s - Exception: %s", serviceName, errorContent))
monitorsMutex.Lock()
monitor, exists := monitors[serviceName]
global.MonitorsMutex.Lock()
monitor, exists := global.Monitors[serviceName]
if exists {
close(monitor.StopChan)
monitor.Running = false
delete(monitors, serviceName)
monitor.Recovery = true
// delete(global.Monitors, serviceName)
}
monitorsMutex.Unlock()
global.MonitorsMutex.Unlock()
err := errorHandle.HandleErrorProcess(serviceName, IsServiceRunning)
if err != nil {
@@ -165,15 +160,15 @@ func throwException(serviceName, errorContent string) error {
return nil
}
monitorsMutex.Lock()
global.MonitorsMutex.Lock()
stopChan := make(chan struct{})
newMonitor := &ServiceMonitor{
newMonitor := &global.ServiceMonitor{
ServiceName: serviceName,
StopChan: stopChan,
Running: true,
}
monitors[serviceName] = newMonitor
monitorsMutex.Unlock()
global.Monitors[serviceName] = newMonitor
global.MonitorsMutex.Unlock()
go runMonitor(newMonitor)
postLog.Info(fmt.Sprintf("[Monitor] Service recovered and monitor restarted: %s", serviceName))
@@ -182,12 +177,14 @@ func throwException(serviceName, errorContent string) error {
}
func GetActiveMonitors() []string {
monitorsMutex.RLock()
defer monitorsMutex.RUnlock()
global.MonitorsMutex.RLock()
defer global.MonitorsMutex.RUnlock()
var activeServices []string
for name := range monitors {
activeServices = append(activeServices, name)
for name, monitor := range global.Monitors {
if monitor.Running {
activeServices = append(activeServices, name)
}
}
return activeServices
}