Files
WatchDog_Linux-systemd/monitor/monitor.go
NanamiAdmin db192f2209 fix(monitor): format exception message with XML tags
Improve exception message formatting by adding XML tags for better parsing and processing downstream. The new format includes explicit tags for exception type, service name, and error message.
2026-04-28 20:49:11 +08:00

144 lines
3.6 KiB
Go

package monitor
import (
"Watchdog_Linux-systemd/postLog"
"Watchdog_Linux-systemd/socket"
"fmt"
"os/exec"
"strings"
"sync"
"time"
)
// ServiceMonitor is the bookkeeping record for one monitored systemd service.
// One instance is created per service by AddServiceMonitor and handed to the
// runMonitor goroutine.
type ServiceMonitor struct {
// ServiceName is the unit name passed to systemctl and journalctl.
ServiceName string
// StopChan is closed by RemoveServiceMonitor to make runMonitor return.
StopChan chan struct{}
// Running is set to true on creation and to false on removal.
// NOTE(review): nothing in this file reads it — confirm whether callers do.
Running bool
}
// Package-level registry of active service monitors.
var (
	// monitorsMutex guards every access to monitors.
	monitorsMutex sync.RWMutex

	// monitors maps a service name to the monitor watching that service.
	monitors = map[string]*ServiceMonitor{}
)
// AddServiceMonitor registers serviceName in the monitor registry and starts
// a background goroutine (runMonitor) that watches it.
//
// It returns an error if a monitor for serviceName is already registered;
// otherwise it returns nil.
func AddServiceMonitor(serviceName string) error {
	monitorsMutex.Lock()
	defer monitorsMutex.Unlock()

	if _, found := monitors[serviceName]; found {
		return fmt.Errorf("service monitor '%s' already exists", serviceName)
	}

	m := &ServiceMonitor{
		ServiceName: serviceName,
		StopChan:    make(chan struct{}),
		Running:     true,
	}
	monitors[serviceName] = m
	go runMonitor(m)

	postLog.Info("[Monitor] Added service monitor for: " + serviceName)
	return nil
}
// RemoveServiceMonitor stops the monitor for serviceName and drops it from
// the registry.
//
// Closing the monitor's StopChan is what signals its runMonitor goroutine to
// return. An error is returned if no monitor is registered under serviceName.
func RemoveServiceMonitor(serviceName string) error {
	monitorsMutex.Lock()
	defer monitorsMutex.Unlock()

	m, found := monitors[serviceName]
	if !found {
		return fmt.Errorf("service monitor '%s' not found", serviceName)
	}

	close(m.StopChan)
	m.Running = false
	delete(monitors, serviceName)

	postLog.Info("[Monitor] Removed service monitor for: " + serviceName)
	return nil
}
// runMonitor is the per-service monitoring loop. Every 5 seconds it checks
// the service's systemd status and its recent journal output, and raises an
// exception (via throwException) when either check errors out or reports a
// problem. The loop ends when m.StopChan is closed.
//
// Failures to deliver an exception are logged rather than silently dropped,
// so a broken notification channel is visible in the log.
func runMonitor(m *ServiceMonitor) {
	postLog.Info(fmt.Sprintf("[Monitor] Started monitoring service: %s", m.ServiceName))

	// report raises an exception for this service and logs a delivery
	// failure instead of discarding the returned error.
	report := func(errorContent string) {
		if err := throwException(m.ServiceName, errorContent); err != nil {
			postLog.Error(fmt.Sprintf("[Monitor] Service: %s - %v", m.ServiceName, err))
		}
	}

	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-m.StopChan:
			postLog.Info(fmt.Sprintf("[Monitor] Stopped monitoring service: %s", m.ServiceName))
			return
		case <-ticker.C:
			serviceStatus, err := checkServiceStatus(m.ServiceName)
			if err != nil {
				report(fmt.Sprintf("Failed to check service status: %v", err))
				continue
			}
			logStatus, err := checkServiceLogs(m.ServiceName)
			if err != nil {
				report(fmt.Sprintf("Failed to check service logs: %v", err))
				continue
			}
			if !serviceStatus || !logStatus {
				report(fmt.Sprintf("Service check failed - Status: %v, Logs: %v", serviceStatus, logStatus))
			}
		}
	}
}
// checkServiceStatus reports whether the systemd unit serviceName is
// currently active, as determined by `systemctl is-active`.
//
// `systemctl is-active` exits with a non-zero status whenever the unit is not
// active. That is a valid answer rather than an execution failure, so it is
// reported as (false, nil). An error is returned only when systemctl could
// not be run to completion at all (for example, the binary is missing).
func checkServiceStatus(serviceName string) (bool, error) {
	output, err := exec.Command("systemctl", "is-active", serviceName).Output()
	if err != nil {
		// Output yields *exec.ExitError when the command ran but exited
		// non-zero; any other error means it never ran successfully.
		if _, exited := err.(*exec.ExitError); exited {
			return false, nil
		}
		return false, fmt.Errorf("failed to execute systemctl: %w", err)
	}
	return strings.TrimSpace(string(output)) == "active", nil
}
// checkServiceLogs scans the last 50 journal lines of the unit serviceName
// for error keywords, case-insensitively.
//
// It returns (true, nil) when none of the keywords appear, (false, nil) when
// at least one does, and (false, err) when journalctl could not be executed
// successfully.
func checkServiceLogs(serviceName string) (bool, error) {
	output, err := exec.Command("journalctl", "-u", serviceName, "-n", "50", "--no-pager").Output()
	if err != nil {
		return false, fmt.Errorf("failed to execute journalctl: %w", err)
	}

	// Lower-case the log text once, not once per keyword.
	logContent := strings.ToLower(string(output))
	errorKeywords := []string{"error", "fatal", "failed", "critical", "exception"}
	for _, keyword := range errorKeywords {
		if strings.Contains(logContent, keyword) {
			return false, nil
		}
	}
	return true, nil
}
// throwException records a service exception in the log and forwards it
// through socket.SendMsg as a tagged message containing the exception type
// ("service"), the service name and the error text.
//
// It returns nil on success. If sending fails, the underlying error is
// wrapped with %w so callers can inspect it with errors.Is / errors.As,
// matching how the other helpers in this file wrap errors.
func throwException(serviceName, errorContent string) error {
	postLog.Error(fmt.Sprintf("[Monitor] Service: %s - Exception: %s", serviceName, errorContent))

	msg := fmt.Sprintf("[Exception] <exceptionType>%s</exceptionType> <serviceName>%s</serviceName> <errorMsg>%s</errorMsg>", "service", serviceName, errorContent)
	if err := socket.SendMsg(msg); err != nil {
		return fmt.Errorf("failed to send exception message: %w", err)
	}
	return nil
}
// GetActiveMonitors returns the names of all services that currently have a
// registered monitor.
//
// The names come from iterating a map, so their order is unspecified. The
// result is nil when no monitors are registered.
func GetActiveMonitors() []string {
	monitorsMutex.RLock()
	defer monitorsMutex.RUnlock()

	var names []string
	for svc := range monitors {
		names = append(names, svc)
	}
	return names
}