Files
WatchDog_Linux-systemd/monitor/monitor.go
NanamiAdmin 3ce076b8dc feat(errorHandle): add service error handling and recovery logic
Implement error handling mechanism with retry logic for failed services. Includes:
- ServiceControl struct to track service state
- HandleErrorProcess function to attempt service restarts
- Integration with monitor to automatically recover services
- Enhanced exception handling with status checks and monitoring restart
2026-04-29 12:11:25 +08:00

194 lines
4.6 KiB
Go

package monitor
import (
"Watchdog_Linux-systemd/errorHandle"
"Watchdog_Linux-systemd/postLog"
"Watchdog_Linux-systemd/socket"
"fmt"
"os/exec"
"strings"
"sync"
"time"
)
// ServiceMonitor holds the state of one monitored systemd service.
type ServiceMonitor struct {
// ServiceName is the unit name handed to the systemctl and journalctl checks.
ServiceName string
// StopChan is closed to tell the monitor's polling goroutine to exit.
StopChan chan struct{}
// Running is set to true when the monitor is created and to false when it
// is torn down; nothing in this file reads it.
Running bool
}
// Package-level registry of service monitors.
var (
// monitors maps a service name to its registered monitor.
monitors = make(map[string]*ServiceMonitor)
// monitorsMutex is locked around the updates to monitors made by
// AddServiceMonitor, RemoveServiceMonitor and throwException, and
// read-locked by GetActiveMonitors.
monitorsMutex sync.RWMutex
)
// AddServiceMonitor registers a monitor for serviceName and launches its
// polling goroutine.
//
// It returns an error, and starts nothing, when a monitor with the same name
// is already registered.
func AddServiceMonitor(serviceName string) error {
	monitorsMutex.Lock()
	defer monitorsMutex.Unlock()

	_, registered := monitors[serviceName]
	if registered {
		return fmt.Errorf("service monitor '%s' already exists", serviceName)
	}

	entry := &ServiceMonitor{
		ServiceName: serviceName,
		StopChan:    make(chan struct{}),
		Running:     true,
	}
	monitors[serviceName] = entry

	// The goroutine is started while the registry lock is still held, so the
	// entry is already visible in the map by the time polling begins.
	go runMonitor(entry)

	addedMsg := fmt.Sprintf("[Monitor] Added service monitor for: %s", serviceName)
	postLog.Info(addedMsg)
	return nil
}
// RemoveServiceMonitor stops and unregisters the monitor for serviceName.
//
// It returns an error when no monitor is registered under that name.
func RemoveServiceMonitor(serviceName string) error {
	monitorsMutex.Lock()
	defer monitorsMutex.Unlock()

	if target, found := monitors[serviceName]; found {
		// Closing StopChan is the stop signal for the polling goroutine.
		close(target.StopChan)
		target.Running = false
		delete(monitors, serviceName)

		removedMsg := fmt.Sprintf("[Monitor] Removed service monitor for: %s", serviceName)
		postLog.Info(removedMsg)
		return nil
	}

	return fmt.Errorf("service monitor '%s' not found", serviceName)
}
// runMonitor polls m.ServiceName every 5 seconds until m.StopChan is closed.
//
// Each tick runs checkServiceStatus and then checkServiceLogs. A check that
// returns an error, or that reports the service as unhealthy, is passed to
// throwException. An error returned by throwException is logged, not
// propagated, because this function runs as a goroutine with no caller to
// return it to.
func runMonitor(m *ServiceMonitor) {
	postLog.Info(fmt.Sprintf("[Monitor] Started monitoring service: %s", m.ServiceName))
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

	logStopped := func() {
		postLog.Info(fmt.Sprintf("[Monitor] Stopped monitoring service: %s", m.ServiceName))
	}

	for {
		select {
		case <-m.StopChan:
			logStopped()
			return
		case <-ticker.C:
		}

		// select chooses at random among ready cases, so a buffered tick can
		// win even though StopChan is already closed. Re-check here so a
		// stopped monitor never runs another round of checks.
		select {
		case <-m.StopChan:
			logStopped()
			return
		default:
		}

		var problem string
		if serviceStatus, err := checkServiceStatus(m.ServiceName); err != nil {
			problem = fmt.Sprintf("Failed to check service status: %v", err)
		} else if logStatus, err := checkServiceLogs(m.ServiceName); err != nil {
			problem = fmt.Sprintf("Failed to check service logs: %v", err)
		} else if !serviceStatus || !logStatus {
			problem = fmt.Sprintf("Service check failed - Status: %v, Logs: %v", serviceStatus, logStatus)
		}
		if problem == "" {
			continue
		}

		if err := throwException(m.ServiceName, problem); err != nil {
			postLog.Error(fmt.Sprintf("[Monitor] Service: %s - %v", m.ServiceName, err))
		}
	}
}
// checkServiceStatus reports whether systemd considers serviceName active.
//
// It runs `systemctl is-active <serviceName>` and returns true only when the
// trimmed output is exactly "active". `systemctl is-active` exits with a
// non-zero status for any unit that is not active, so a normal non-zero exit
// is treated as a valid "not active" answer (false, nil) rather than as a
// failure. An error is returned only when systemctl could not be started or
// did not exit normally.
func checkServiceStatus(serviceName string) (bool, error) {
	output, err := exec.Command("systemctl", "is-active", serviceName).Output()
	if err != nil {
		// Output reports a non-zero exit as *exec.ExitError; stdout has still
		// been captured in that case. Anything else (binary missing, process
		// killed by a signal) is a real execution failure.
		exitErr, ok := err.(*exec.ExitError)
		if !ok || !exitErr.Exited() {
			return false, fmt.Errorf("failed to execute systemctl: %w", err)
		}
	}
	return strings.TrimSpace(string(output)) == "active", nil
}
// checkServiceLogs scans the last 50 journal lines of serviceName for error
// keywords.
//
// It runs `journalctl -u <serviceName> -n 50 --no-pager` and returns false
// when the output contains, case-insensitively, any of "error", "fatal",
// "failed", "critical" or "exception"; otherwise it returns true. An error is
// returned when journalctl cannot be run or exits with a failure status.
func checkServiceLogs(serviceName string) (bool, error) {
	output, err := exec.Command("journalctl", "-u", serviceName, "-n", "50", "--no-pager").Output()
	if err != nil {
		return false, fmt.Errorf("failed to execute journalctl: %w", err)
	}

	// Lower-case the log once instead of once per keyword.
	logContent := strings.ToLower(string(output))
	errorKeywords := []string{"error", "fatal", "failed", "critical", "exception"}
	for _, keyword := range errorKeywords {
		if strings.Contains(logContent, keyword) {
			return false, nil
		}
	}
	return true, nil
}
func IsServiceExist(serviceName string) bool {
_, exists := monitors[serviceName]
if !exists {
return false
}
return true
}
// IsServiceRunning reports whether serviceName passes both health checks.
//
// It returns true only when checkServiceStatus and checkServiceLogs both
// succeed and both return true. The log check is skipped when the status
// check already failed.
func IsServiceRunning(serviceName string) bool {
	if active, err := checkServiceStatus(serviceName); err != nil || !active {
		return false
	}
	clean, err := checkServiceLogs(serviceName)
	return err == nil && clean
}
// throwException reports a failed check for serviceName and tries to recover
// the service.
//
// Steps, in order:
//  1. Log errorContent through postLog.Error.
//  2. Stop and unregister the monitor registered under serviceName, if any.
//  3. Call errorHandle.HandleErrorProcess with IsServiceRunning as the probe.
//  4. If recovery fails, log the failure, send an "[Exception]" message over
//     the socket, and leave the service without a monitor.
//  5. If recovery succeeds, register and start a fresh monitor, unless one
//     was registered for the same name in the meantime.
//
// The returned error is non-nil only when the exception message could not be
// sent.
func throwException(serviceName, errorContent string) error {
	postLog.Error(fmt.Sprintf("[Monitor] Service: %s - Exception: %s", serviceName, errorContent))

	monitorsMutex.Lock()
	if monitor, exists := monitors[serviceName]; exists {
		close(monitor.StopChan)
		monitor.Running = false
		delete(monitors, serviceName)
	}
	monitorsMutex.Unlock()

	// The registry lock is not held across recovery, so monitors can be added
	// or removed while HandleErrorProcess runs.
	// NOTE(review): recovery also runs when no monitor was registered on entry
	// (for example one removed while its check was in flight) — confirm that
	// is intended.
	if recoverErr := errorHandle.HandleErrorProcess(serviceName, IsServiceRunning); recoverErr != nil {
		postLog.Error(fmt.Sprintf("[Monitor] Service: %s - Recovery failed: %v", serviceName, recoverErr))
		msg := fmt.Sprintf("[Exception] <exceptionType>%s</exceptionType> <serviceName>%s</serviceName> <errorMsg>%s</errorMsg>", "service", serviceName, errorContent)
		if sendErr := socket.SendMsg(msg); sendErr != nil {
			return fmt.Errorf("failed to send exception message: %w", sendErr)
		}
		return nil
	}

	monitorsMutex.Lock()
	if _, exists := monitors[serviceName]; exists {
		// A monitor was registered while the lock was released. Overwriting
		// it would orphan its goroutine: its StopChan would no longer be
		// reachable through the registry and could never be closed.
		monitorsMutex.Unlock()
		postLog.Info(fmt.Sprintf("[Monitor] Service recovered; monitor already registered: %s", serviceName))
		return nil
	}
	newMonitor := &ServiceMonitor{
		ServiceName: serviceName,
		StopChan:    make(chan struct{}),
		Running:     true,
	}
	monitors[serviceName] = newMonitor
	monitorsMutex.Unlock()

	go runMonitor(newMonitor)
	postLog.Info(fmt.Sprintf("[Monitor] Service recovered and monitor restarted: %s", serviceName))
	return nil
}
// GetActiveMonitors returns the names of all registered monitors.
//
// The order is unspecified because it follows map iteration. The result is
// nil when no monitor is registered.
func GetActiveMonitors() []string {
	monitorsMutex.RLock()
	defer monitorsMutex.RUnlock()

	if len(monitors) == 0 {
		return nil
	}

	names := make([]string, 0, len(monitors))
	for svc := range monitors {
		names = append(names, svc)
	}
	return names
}