fix(processor): the recovery process now ends promptly when the watchdog receives a stop command.
This commit is contained in:
@@ -6,6 +6,7 @@ import (
|
||||
"time"
|
||||
|
||||
"Watchdog_Linux-systemd/postLog"
|
||||
"Watchdog_Linux-systemd/global"
|
||||
)
|
||||
|
||||
// ServiceStatusChecker is a callback that takes a service name and returns a
// bool. HandleErrorProcess calls it after a successful `systemctl restart` and
// treats a true result as the service being up again.
type ServiceStatusChecker func(serviceName string) bool
|
||||
@@ -18,16 +19,18 @@ func HandleErrorProcess(serviceName string, isServiceRunning ServiceStatusChecke
|
||||
}
|
||||
|
||||
for i := 0; i < 5; i++ {
|
||||
serviceControl.RetryCount++
|
||||
postLog.Debug(fmt.Sprintf("[HandleErrorProcess] Try to restart service '%s', retry count: %d", serviceName, serviceControl.RetryCount))
|
||||
cmd := exec.Command("systemctl", "restart", serviceName)
|
||||
err := cmd.Run()
|
||||
if err == nil {
|
||||
if isServiceRunning != nil && isServiceRunning(serviceName) {
|
||||
return nil
|
||||
if global.Monitors[serviceName].Recovery { // Only recovery process is started
|
||||
serviceControl.RetryCount++
|
||||
postLog.Debug(fmt.Sprintf("[HandleErrorProcess] Try to restart service '%s', retry count: %d", serviceName, serviceControl.RetryCount))
|
||||
cmd := exec.Command("systemctl", "restart", serviceName)
|
||||
err := cmd.Run()
|
||||
if err == nil {
|
||||
if isServiceRunning != nil && isServiceRunning(serviceName) {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
time.Sleep(time.Duration(i+1) * time.Second)
|
||||
}
|
||||
time.Sleep(time.Duration(i+1) * time.Second)
|
||||
}
|
||||
serviceControl.ErrorType = "restart"
|
||||
serviceControl.ErrorMsg = fmt.Sprintf("Failed to recover service '%s', retry count: %d", serviceName, serviceControl.RetryCount)
|
||||
|
||||
17
global/vars.go
Normal file
17
global/vars.go
Normal file
@@ -0,0 +1,17 @@
|
||||
package global
|
||||
|
||||
import (
|
||||
"sync"
|
||||
)
|
||||
|
||||
// ServiceMonitor holds the per-service monitoring state. It lives in package
// global so that both the monitor package and the error-handling package can
// read and update the same entry.
type ServiceMonitor struct {
|
||||
// ServiceName is the name of the monitored service; it is also the key
// under which the entry is stored in Monitors.
ServiceName string
|
||||
// StopChan is created with make(chan struct{}) when the monitor is
// registered and is closed by throwException when an exception is raised.
StopChan chan struct{}
|
||||
// Running is set to true when the monitor entry is created and set to
// false by RemoveServiceMonitor and throwException. GetActiveMonitors
// reports only entries whose Running flag is true.
Running bool
|
||||
// Recovery is set to true by throwException before it calls
// HandleErrorProcess and is cleared by RemoveServiceMonitor.
// HandleErrorProcess only issues a restart attempt while it is true.
Recovery bool
|
||||
}
|
||||
|
||||
var (
|
||||
// Monitors maps a service name to its ServiceMonitor entry.
Monitors = make(map[string]*ServiceMonitor)
|
||||
// MonitorsMutex is the lock the monitor package takes around its updates
// of Monitors (Lock in AddServiceMonitor, RemoveServiceMonitor and
// throwException; RLock in GetActiveMonitors).
// NOTE(review): IsServiceExist and HandleErrorProcess appear to index
// Monitors without holding this lock — confirm whether that is intended.
MonitorsMutex sync.RWMutex
|
||||
)
|
||||
@@ -3,41 +3,34 @@ package monitor
|
||||
import (
|
||||
"Watchdog_Linux-systemd/errorHandle"
|
||||
"Watchdog_Linux-systemd/postLog"
|
||||
"Watchdog_Linux-systemd/global"
|
||||
"Watchdog_Linux-systemd/socket"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
type ServiceMonitor struct {
|
||||
ServiceName string
|
||||
StopChan chan struct{}
|
||||
Running bool
|
||||
}
|
||||
|
||||
var (
|
||||
monitors = make(map[string]*ServiceMonitor)
|
||||
monitorsMutex sync.RWMutex
|
||||
)
|
||||
|
||||
func AddServiceMonitor(serviceName string) error {
|
||||
monitorsMutex.Lock()
|
||||
defer monitorsMutex.Unlock()
|
||||
global.MonitorsMutex.Lock()
|
||||
defer global.MonitorsMutex.Unlock()
|
||||
|
||||
if _, exists := monitors[serviceName]; exists {
|
||||
return fmt.Errorf("service monitor '%s' already exists", serviceName)
|
||||
if _, exists := global.Monitors[serviceName]; exists {
|
||||
if global.Monitors[serviceName].Running {
|
||||
return fmt.Errorf("service monitor '%s' already exists", serviceName)
|
||||
}
|
||||
}
|
||||
|
||||
stopChan := make(chan struct{})
|
||||
monitor := &ServiceMonitor{
|
||||
monitor := &global.ServiceMonitor{
|
||||
ServiceName: serviceName,
|
||||
StopChan: stopChan,
|
||||
Running: true,
|
||||
}
|
||||
|
||||
monitors[serviceName] = monitor
|
||||
global.Monitors[serviceName] = monitor
|
||||
|
||||
go runMonitor(monitor)
|
||||
|
||||
@@ -46,23 +39,24 @@ func AddServiceMonitor(serviceName string) error {
|
||||
}
|
||||
|
||||
func RemoveServiceMonitor(serviceName string) error {
|
||||
monitorsMutex.Lock()
|
||||
defer monitorsMutex.Unlock()
|
||||
global.MonitorsMutex.Lock()
|
||||
defer global.MonitorsMutex.Unlock()
|
||||
|
||||
monitor, exists := monitors[serviceName]
|
||||
monitor, exists := global.Monitors[serviceName]
|
||||
if !exists {
|
||||
return fmt.Errorf("service monitor '%s' not found", serviceName)
|
||||
}
|
||||
|
||||
close(monitor.StopChan)
|
||||
// close(monitor.StopChan)
|
||||
monitor.Running = false
|
||||
delete(monitors, serviceName)
|
||||
monitor.Recovery = false
|
||||
// delete(global.Monitors, serviceName)
|
||||
|
||||
postLog.Info(fmt.Sprintf("[Monitor] Removed service monitor for: %s", serviceName))
|
||||
return nil
|
||||
}
|
||||
|
||||
func runMonitor(m *ServiceMonitor) {
|
||||
func runMonitor(m *global.ServiceMonitor) {
|
||||
postLog.Info(fmt.Sprintf("[Monitor] Started monitoring service: %s", m.ServiceName))
|
||||
|
||||
ticker := time.NewTicker(5 * time.Second)
|
||||
@@ -124,7 +118,7 @@ func checkServiceLogs(serviceName string) (bool, error) {
|
||||
}
|
||||
|
||||
func IsServiceExist(serviceName string) bool {
|
||||
_, exists := monitors[serviceName]
|
||||
_, exists := global.Monitors[serviceName]
|
||||
if !exists {
|
||||
return false
|
||||
}
|
||||
@@ -147,14 +141,15 @@ func IsServiceRunning(serviceName string) bool {
|
||||
func throwException(serviceName, errorContent string) error {
|
||||
postLog.Error(fmt.Sprintf("[Monitor] Service: %s - Exception: %s", serviceName, errorContent))
|
||||
|
||||
monitorsMutex.Lock()
|
||||
monitor, exists := monitors[serviceName]
|
||||
global.MonitorsMutex.Lock()
|
||||
monitor, exists := global.Monitors[serviceName]
|
||||
if exists {
|
||||
close(monitor.StopChan)
|
||||
monitor.Running = false
|
||||
delete(monitors, serviceName)
|
||||
monitor.Recovery = true
|
||||
// delete(global.Monitors, serviceName)
|
||||
}
|
||||
monitorsMutex.Unlock()
|
||||
global.MonitorsMutex.Unlock()
|
||||
|
||||
err := errorHandle.HandleErrorProcess(serviceName, IsServiceRunning)
|
||||
if err != nil {
|
||||
@@ -165,15 +160,15 @@ func throwException(serviceName, errorContent string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
monitorsMutex.Lock()
|
||||
global.MonitorsMutex.Lock()
|
||||
stopChan := make(chan struct{})
|
||||
newMonitor := &ServiceMonitor{
|
||||
newMonitor := &global.ServiceMonitor{
|
||||
ServiceName: serviceName,
|
||||
StopChan: stopChan,
|
||||
Running: true,
|
||||
}
|
||||
monitors[serviceName] = newMonitor
|
||||
monitorsMutex.Unlock()
|
||||
global.Monitors[serviceName] = newMonitor
|
||||
global.MonitorsMutex.Unlock()
|
||||
|
||||
go runMonitor(newMonitor)
|
||||
postLog.Info(fmt.Sprintf("[Monitor] Service recovered and monitor restarted: %s", serviceName))
|
||||
@@ -182,12 +177,14 @@ func throwException(serviceName, errorContent string) error {
|
||||
}
|
||||
|
||||
func GetActiveMonitors() []string {
|
||||
monitorsMutex.RLock()
|
||||
defer monitorsMutex.RUnlock()
|
||||
global.MonitorsMutex.RLock()
|
||||
defer global.MonitorsMutex.RUnlock()
|
||||
|
||||
var activeServices []string
|
||||
for name := range monitors {
|
||||
activeServices = append(activeServices, name)
|
||||
for name, monitor := range global.Monitors {
|
||||
if monitor.Running {
|
||||
activeServices = append(activeServices, name)
|
||||
}
|
||||
}
|
||||
return activeServices
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user