fix(processor): recovery process can be ended in time when the watchdog received stop command.
This commit is contained in:
@@ -6,6 +6,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"Watchdog_Linux-systemd/postLog"
|
"Watchdog_Linux-systemd/postLog"
|
||||||
|
"Watchdog_Linux-systemd/global"
|
||||||
)
|
)
|
||||||
|
|
||||||
type ServiceStatusChecker func(serviceName string) bool
|
type ServiceStatusChecker func(serviceName string) bool
|
||||||
@@ -18,6 +19,7 @@ func HandleErrorProcess(serviceName string, isServiceRunning ServiceStatusChecke
|
|||||||
}
|
}
|
||||||
|
|
||||||
for i := 0; i < 5; i++ {
|
for i := 0; i < 5; i++ {
|
||||||
|
if global.Monitors[serviceName].Recovery { // Only recovery process is started
|
||||||
serviceControl.RetryCount++
|
serviceControl.RetryCount++
|
||||||
postLog.Debug(fmt.Sprintf("[HandleErrorProcess] Try to restart service '%s', retry count: %d", serviceName, serviceControl.RetryCount))
|
postLog.Debug(fmt.Sprintf("[HandleErrorProcess] Try to restart service '%s', retry count: %d", serviceName, serviceControl.RetryCount))
|
||||||
cmd := exec.Command("systemctl", "restart", serviceName)
|
cmd := exec.Command("systemctl", "restart", serviceName)
|
||||||
@@ -29,6 +31,7 @@ func HandleErrorProcess(serviceName string, isServiceRunning ServiceStatusChecke
|
|||||||
}
|
}
|
||||||
time.Sleep(time.Duration(i+1) * time.Second)
|
time.Sleep(time.Duration(i+1) * time.Second)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
serviceControl.ErrorType = "restart"
|
serviceControl.ErrorType = "restart"
|
||||||
serviceControl.ErrorMsg = fmt.Sprintf("Failed to recover service '%s', retry count: %d", serviceName, serviceControl.RetryCount)
|
serviceControl.ErrorMsg = fmt.Sprintf("Failed to recover service '%s', retry count: %d", serviceName, serviceControl.RetryCount)
|
||||||
serviceControl.ErrorTime = time.Now()
|
serviceControl.ErrorTime = time.Now()
|
||||||
|
|||||||
17
global/vars.go
Normal file
17
global/vars.go
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
package global
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sync"
|
||||||
|
)
|
||||||
|
|
||||||
|
type ServiceMonitor struct {
|
||||||
|
ServiceName string
|
||||||
|
StopChan chan struct{}
|
||||||
|
Running bool
|
||||||
|
Recovery bool
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
Monitors = make(map[string]*ServiceMonitor)
|
||||||
|
MonitorsMutex sync.RWMutex
|
||||||
|
)
|
||||||
@@ -3,41 +3,34 @@ package monitor
|
|||||||
import (
|
import (
|
||||||
"Watchdog_Linux-systemd/errorHandle"
|
"Watchdog_Linux-systemd/errorHandle"
|
||||||
"Watchdog_Linux-systemd/postLog"
|
"Watchdog_Linux-systemd/postLog"
|
||||||
|
"Watchdog_Linux-systemd/global"
|
||||||
"Watchdog_Linux-systemd/socket"
|
"Watchdog_Linux-systemd/socket"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
type ServiceMonitor struct {
|
|
||||||
ServiceName string
|
|
||||||
StopChan chan struct{}
|
|
||||||
Running bool
|
|
||||||
}
|
|
||||||
|
|
||||||
var (
|
|
||||||
monitors = make(map[string]*ServiceMonitor)
|
|
||||||
monitorsMutex sync.RWMutex
|
|
||||||
)
|
|
||||||
|
|
||||||
func AddServiceMonitor(serviceName string) error {
|
func AddServiceMonitor(serviceName string) error {
|
||||||
monitorsMutex.Lock()
|
global.MonitorsMutex.Lock()
|
||||||
defer monitorsMutex.Unlock()
|
defer global.MonitorsMutex.Unlock()
|
||||||
|
|
||||||
if _, exists := monitors[serviceName]; exists {
|
if _, exists := global.Monitors[serviceName]; exists {
|
||||||
|
if global.Monitors[serviceName].Running {
|
||||||
return fmt.Errorf("service monitor '%s' already exists", serviceName)
|
return fmt.Errorf("service monitor '%s' already exists", serviceName)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
stopChan := make(chan struct{})
|
stopChan := make(chan struct{})
|
||||||
monitor := &ServiceMonitor{
|
monitor := &global.ServiceMonitor{
|
||||||
ServiceName: serviceName,
|
ServiceName: serviceName,
|
||||||
StopChan: stopChan,
|
StopChan: stopChan,
|
||||||
Running: true,
|
Running: true,
|
||||||
}
|
}
|
||||||
|
|
||||||
monitors[serviceName] = monitor
|
global.Monitors[serviceName] = monitor
|
||||||
|
|
||||||
go runMonitor(monitor)
|
go runMonitor(monitor)
|
||||||
|
|
||||||
@@ -46,23 +39,24 @@ func AddServiceMonitor(serviceName string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func RemoveServiceMonitor(serviceName string) error {
|
func RemoveServiceMonitor(serviceName string) error {
|
||||||
monitorsMutex.Lock()
|
global.MonitorsMutex.Lock()
|
||||||
defer monitorsMutex.Unlock()
|
defer global.MonitorsMutex.Unlock()
|
||||||
|
|
||||||
monitor, exists := monitors[serviceName]
|
monitor, exists := global.Monitors[serviceName]
|
||||||
if !exists {
|
if !exists {
|
||||||
return fmt.Errorf("service monitor '%s' not found", serviceName)
|
return fmt.Errorf("service monitor '%s' not found", serviceName)
|
||||||
}
|
}
|
||||||
|
|
||||||
close(monitor.StopChan)
|
// close(monitor.StopChan)
|
||||||
monitor.Running = false
|
monitor.Running = false
|
||||||
delete(monitors, serviceName)
|
monitor.Recovery = false
|
||||||
|
// delete(global.Monitors, serviceName)
|
||||||
|
|
||||||
postLog.Info(fmt.Sprintf("[Monitor] Removed service monitor for: %s", serviceName))
|
postLog.Info(fmt.Sprintf("[Monitor] Removed service monitor for: %s", serviceName))
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func runMonitor(m *ServiceMonitor) {
|
func runMonitor(m *global.ServiceMonitor) {
|
||||||
postLog.Info(fmt.Sprintf("[Monitor] Started monitoring service: %s", m.ServiceName))
|
postLog.Info(fmt.Sprintf("[Monitor] Started monitoring service: %s", m.ServiceName))
|
||||||
|
|
||||||
ticker := time.NewTicker(5 * time.Second)
|
ticker := time.NewTicker(5 * time.Second)
|
||||||
@@ -124,7 +118,7 @@ func checkServiceLogs(serviceName string) (bool, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func IsServiceExist(serviceName string) bool {
|
func IsServiceExist(serviceName string) bool {
|
||||||
_, exists := monitors[serviceName]
|
_, exists := global.Monitors[serviceName]
|
||||||
if !exists {
|
if !exists {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
@@ -147,14 +141,15 @@ func IsServiceRunning(serviceName string) bool {
|
|||||||
func throwException(serviceName, errorContent string) error {
|
func throwException(serviceName, errorContent string) error {
|
||||||
postLog.Error(fmt.Sprintf("[Monitor] Service: %s - Exception: %s", serviceName, errorContent))
|
postLog.Error(fmt.Sprintf("[Monitor] Service: %s - Exception: %s", serviceName, errorContent))
|
||||||
|
|
||||||
monitorsMutex.Lock()
|
global.MonitorsMutex.Lock()
|
||||||
monitor, exists := monitors[serviceName]
|
monitor, exists := global.Monitors[serviceName]
|
||||||
if exists {
|
if exists {
|
||||||
close(monitor.StopChan)
|
close(monitor.StopChan)
|
||||||
monitor.Running = false
|
monitor.Running = false
|
||||||
delete(monitors, serviceName)
|
monitor.Recovery = true
|
||||||
|
// delete(global.Monitors, serviceName)
|
||||||
}
|
}
|
||||||
monitorsMutex.Unlock()
|
global.MonitorsMutex.Unlock()
|
||||||
|
|
||||||
err := errorHandle.HandleErrorProcess(serviceName, IsServiceRunning)
|
err := errorHandle.HandleErrorProcess(serviceName, IsServiceRunning)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -165,15 +160,15 @@ func throwException(serviceName, errorContent string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
monitorsMutex.Lock()
|
global.MonitorsMutex.Lock()
|
||||||
stopChan := make(chan struct{})
|
stopChan := make(chan struct{})
|
||||||
newMonitor := &ServiceMonitor{
|
newMonitor := &global.ServiceMonitor{
|
||||||
ServiceName: serviceName,
|
ServiceName: serviceName,
|
||||||
StopChan: stopChan,
|
StopChan: stopChan,
|
||||||
Running: true,
|
Running: true,
|
||||||
}
|
}
|
||||||
monitors[serviceName] = newMonitor
|
global.Monitors[serviceName] = newMonitor
|
||||||
monitorsMutex.Unlock()
|
global.MonitorsMutex.Unlock()
|
||||||
|
|
||||||
go runMonitor(newMonitor)
|
go runMonitor(newMonitor)
|
||||||
postLog.Info(fmt.Sprintf("[Monitor] Service recovered and monitor restarted: %s", serviceName))
|
postLog.Info(fmt.Sprintf("[Monitor] Service recovered and monitor restarted: %s", serviceName))
|
||||||
@@ -182,12 +177,14 @@ func throwException(serviceName, errorContent string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func GetActiveMonitors() []string {
|
func GetActiveMonitors() []string {
|
||||||
monitorsMutex.RLock()
|
global.MonitorsMutex.RLock()
|
||||||
defer monitorsMutex.RUnlock()
|
defer global.MonitorsMutex.RUnlock()
|
||||||
|
|
||||||
var activeServices []string
|
var activeServices []string
|
||||||
for name := range monitors {
|
for name, monitor := range global.Monitors {
|
||||||
|
if monitor.Running {
|
||||||
activeServices = append(activeServices, name)
|
activeServices = append(activeServices, name)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return activeServices
|
return activeServices
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user