Files
junhong_cmp_fiber/internal/service/polling/alert_service.go
huang 931e140e8e
All checks were successful
构建并部署到测试环境(无 SSH) / build-and-deploy (push) Successful in 6m35s
feat: 实现 IoT 卡轮询系统(支持千万级卡规模)
实现功能:
- 实名状态检查轮询(可配置间隔)
- 卡流量检查轮询(支持跨月流量追踪)
- 套餐检查与超额自动停机
- 分布式并发控制(Redis 信号量)
- 手动触发轮询(单卡/批量/条件筛选)
- 数据清理配置与执行
- 告警规则与历史记录
- 实时监控统计(队列/性能/并发)

性能优化:
- Redis 缓存卡信息,减少 DB 查询
- Pipeline 批量写入 Redis
- 异步流量记录写入
- 渐进式初始化(10万卡/批)

压测工具(scripts/benchmark/):
- Mock Gateway 模拟上游服务
- 测试卡生成器
- 配置初始化脚本
- 实时监控脚本

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 17:32:44 +08:00

507 lines
14 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package polling
import (
"bytes"
"context"
"fmt"
"net/http"
"strings"
"sync"
"time"
"github.com/bytedance/sonic"
"github.com/redis/go-redis/v9"
"go.uber.org/zap"
"github.com/break/junhong_cmp_fiber/internal/model"
"github.com/break/junhong_cmp_fiber/internal/store/postgres"
"github.com/break/junhong_cmp_fiber/pkg/constants"
"github.com/break/junhong_cmp_fiber/pkg/errors"
)
// AlertService manages polling alert rules: CRUD on the rules, periodic
// evaluation of each enabled rule against metrics read from Redis, and
// recording/dispatching of the alerts that fire.
type AlertService struct {
// ruleStore persists alert rules (create/get/list/update/delete).
ruleStore *postgres.PollingAlertRuleStore
// historyStore persists fired alerts and their notification status.
historyStore *postgres.PollingAlertHistoryStore
// redis is the client used to read the polling metrics being evaluated.
redis *redis.Client
// logger receives rule-check warnings and notification results.
logger *zap.Logger
}
// NewAlertService returns an AlertService wired to the supplied rule store,
// history store, Redis client and logger. The arguments are stored as-is;
// no validation is performed.
func NewAlertService(
	ruleStore *postgres.PollingAlertRuleStore,
	historyStore *postgres.PollingAlertHistoryStore,
	redis *redis.Client,
	logger *zap.Logger,
) *AlertService {
	svc := new(AlertService)
	svc.ruleStore = ruleStore
	svc.historyStore = historyStore
	svc.redis = redis
	svc.logger = logger
	return svc
}
// CreateRule validates and persists a new alert rule.
//
// RuleName, MetricType and TaskType are required; a missing one yields a
// CodeInvalidParam error. The rule is always stored as enabled (Status = 1).
// A zero CooldownMinutes defaults to 5 and an empty Operator defaults to ">".
// The rule is mutated in place before being handed to the store.
func (s *AlertService) CreateRule(ctx context.Context, rule *model.PollingAlertRule) error {
	// Required fields, checked in a fixed order so the first missing one wins.
	switch {
	case rule.RuleName == "":
		return errors.New(errors.CodeInvalidParam, "规则名称不能为空")
	case rule.MetricType == "":
		return errors.New(errors.CodeInvalidParam, "指标类型不能为空")
	case rule.TaskType == "":
		return errors.New(errors.CodeInvalidParam, "任务类型不能为空")
	}

	// New rules start enabled regardless of what the caller passed.
	rule.Status = 1

	// Fill in defaults for optional fields.
	if rule.CooldownMinutes == 0 {
		rule.CooldownMinutes = 5
	}
	if rule.Operator == "" {
		rule.Operator = ">"
	}

	return s.ruleStore.Create(ctx, rule)
}
// GetRule loads a single alert rule by ID. Any store error is wrapped as
// CodeNotFound.
func (s *AlertService) GetRule(ctx context.Context, id uint) (*model.PollingAlertRule, error) {
	found, lookupErr := s.ruleStore.GetByID(ctx, id)
	if lookupErr == nil {
		return found, nil
	}
	return nil, errors.Wrap(errors.CodeNotFound, lookupErr, "告警规则不存在")
}
// ListRules returns the alert rules reported by the rule store, passing the
// store's result and error through unchanged.
func (s *AlertService) ListRules(ctx context.Context) ([]*model.PollingAlertRule, error) {
	all, err := s.ruleStore.List(ctx)
	return all, err
}
// UpdateRule applies a partial update to an existing alert rule.
//
// Recognised keys in updates: "rule_name" (non-empty string), "threshold"
// (number), "alert_level" (string), "status" (integer), "cooldown_minutes"
// (integer) and "notification_channels" (string). Unknown keys and values of
// an unexpected type are ignored.
//
// Numeric values are accepted in any of the common Go numeric forms. A map
// decoded from JSON carries every number as float64, so asserting a plain int
// would silently drop "status" and "cooldown_minutes" updates.
//
// Returns a CodeNotFound error when the rule cannot be loaded, otherwise the
// result of the store update.
func (s *AlertService) UpdateRule(ctx context.Context, id uint, updates map[string]interface{}) error {
	rule, err := s.ruleStore.GetByID(ctx, id)
	if err != nil {
		return errors.Wrap(errors.CodeNotFound, err, "告警规则不存在")
	}

	if name, ok := updates["rule_name"].(string); ok && name != "" {
		rule.RuleName = name
	}
	if threshold, ok := alertRuleUpdateFloat(updates["threshold"]); ok {
		rule.Threshold = threshold
	}
	if level, ok := updates["alert_level"].(string); ok {
		rule.AlertLevel = level
	}
	// Status is stored as int16; skip values that would wrap on conversion.
	if status, ok := alertRuleUpdateInt(updates["status"]); ok && status == int(int16(status)) {
		rule.Status = int16(status)
	}
	if cooldown, ok := alertRuleUpdateInt(updates["cooldown_minutes"]); ok {
		rule.CooldownMinutes = cooldown
	}
	if channels, ok := updates["notification_channels"].(string); ok {
		rule.NotificationChannels = channels
	}

	return s.ruleStore.Update(ctx, rule)
}

// alertRuleUpdateInt converts a loosely typed update value to int. It accepts
// signed integer types and floats that hold an exact integral value; anything
// else (including nil, NaN and fractional numbers) reports false.
func alertRuleUpdateInt(v any) (int, bool) {
	switch n := v.(type) {
	case int:
		return n, true
	case int16:
		return int(n), true
	case int32:
		return int(n), true
	case int64:
		return int(n), true
	case float64:
		// The round-trip comparison rejects fractions, NaN and out-of-range values.
		if i := int(n); float64(i) == n {
			return i, true
		}
	}
	return 0, false
}

// alertRuleUpdateFloat converts a loosely typed update value to float64. It
// accepts float and signed integer types; anything else reports false.
func alertRuleUpdateFloat(v any) (float64, bool) {
	switch n := v.(type) {
	case float64:
		return n, true
	case float32:
		return float64(n), true
	case int:
		return float64(n), true
	case int32:
		return float64(n), true
	case int64:
		return float64(n), true
	}
	return 0, false
}
// DeleteRule removes an alert rule by ID. The rule is looked up first; a
// failed lookup is reported as CodeNotFound and nothing is deleted.
func (s *AlertService) DeleteRule(ctx context.Context, id uint) error {
	if _, lookupErr := s.ruleStore.GetByID(ctx, id); lookupErr != nil {
		return errors.Wrap(errors.CodeNotFound, lookupErr, "告警规则不存在")
	}
	return s.ruleStore.Delete(ctx, id)
}
// ListHistory returns one page of alert history, optionally filtered by rule
// ID, together with the count reported by the store.
//
// page values below 1 are treated as 1. pageSize values outside 1..100 fall
// back to 20.
func (s *AlertService) ListHistory(ctx context.Context, page, pageSize int, ruleID *uint) ([]*model.PollingAlertHistory, int64, error) {
	const (
		firstPage       = 1
		defaultPageSize = 20
		maxPageSize     = 100
	)

	if page < firstPage {
		page = firstPage
	}
	if sizeOK := pageSize >= 1 && pageSize <= maxPageSize; !sizeOK {
		pageSize = defaultPageSize
	}

	return s.historyStore.List(ctx, page, pageSize, ruleID)
}
// CheckAlerts evaluates every enabled alert rule once. It is meant to be
// invoked periodically.
//
// A failure to list the enabled rules is returned to the caller. A failure
// while checking an individual rule is only logged, so one bad rule does not
// stop the others from being evaluated.
func (s *AlertService) CheckAlerts(ctx context.Context) error {
	enabled, listErr := s.ruleStore.ListEnabled(ctx)
	if listErr != nil {
		return listErr
	}

	for _, r := range enabled {
		checkErr := s.checkRule(ctx, r)
		if checkErr == nil {
			continue
		}
		s.logger.Warn("检查告警规则失败",
			zap.Uint("rule_id", r.ID),
			zap.String("rule_name", r.RuleName),
			zap.Error(checkErr))
	}

	return nil
}
// checkRule evaluates a single rule: it skips rules that are still in their
// cooldown window, reads the current metric value, compares it against the
// rule's threshold using the rule's operator, and triggers an alert when the
// comparison holds.
//
// Supported operators are ">", ">=", "<", "<=" and "==". Any other operator
// is treated as ">".
func (s *AlertService) checkRule(ctx context.Context, rule *model.PollingAlertRule) error {
	if s.isInCooldown(ctx, rule) {
		return nil
	}

	value, err := s.getMetricValue(ctx, rule.TaskType, rule.MetricType)
	if err != nil {
		return err
	}

	limit := rule.Threshold
	var fire bool
	switch rule.Operator {
	case ">=":
		fire = value >= limit
	case "<":
		fire = value < limit
	case "<=":
		fire = value <= limit
	case "==":
		fire = value == limit
	default:
		// Covers ">" as well as any unrecognised operator.
		fire = value > limit
	}

	if !fire {
		return nil
	}
	return s.triggerAlert(ctx, rule, value)
}
// isInCooldown reports whether the rule fired recently enough that it should
// not fire again yet.
//
// A rule with a non-positive CooldownMinutes is never in cooldown. If the
// latest history entry cannot be loaded (any store error), the rule is
// treated as not in cooldown.
func (s *AlertService) isInCooldown(ctx context.Context, rule *model.PollingAlertRule) bool {
	if rule.CooldownMinutes <= 0 {
		return false
	}

	latest, err := s.historyStore.GetLatestByRuleID(ctx, rule.ID)
	if err != nil {
		// No usable history record: treat as not in cooldown.
		return false
	}

	window := time.Duration(rule.CooldownMinutes) * time.Minute
	return latest.CreatedAt.Add(window).After(time.Now())
}
// getMetricValue reads the current value of metricType for taskType from
// Redis.
//
// Supported metric types:
//   - "queue_size":   cardinality of the task type's polling queue (sorted set).
//   - "success_rate": success / (success + failure) * 100 over the 1h counters;
//     100 when there is no data.
//   - "avg_duration": total_duration / (success + failure) over the 1h
//     counters; 0 when there is no data.
//   - "concurrency":  the current concurrency counter; 0 when the key is absent.
//
// Redis failures are returned to the caller rather than reported as a value
// of 0, so an outage cannot mask or fake an alert. An unknown metric type, or
// an unknown task type for "queue_size", yields a CodeInvalidParam error.
func (s *AlertService) getMetricValue(ctx context.Context, taskType, metricType string) (float64, error) {
	switch metricType {
	case "queue_size":
		var queueKey string
		switch taskType {
		case constants.TaskTypePollingRealname:
			queueKey = constants.RedisPollingQueueRealnameKey()
		case constants.TaskTypePollingCarddata:
			queueKey = constants.RedisPollingQueueCarddataKey()
		case constants.TaskTypePollingPackage:
			queueKey = constants.RedisPollingQueuePackageKey()
		default:
			// Without a known queue there is nothing meaningful to measure.
			return 0, errors.New(errors.CodeInvalidParam, "未知的任务类型")
		}
		size, err := s.redis.ZCard(ctx, queueKey).Result()
		if err != nil {
			return 0, err
		}
		return float64(size), nil

	case "success_rate", "avg_duration":
		// Only these metrics need the stats hash, so it is fetched here
		// rather than unconditionally.
		data, err := s.redis.HGetAll(ctx, constants.RedisPollingStatsKey(taskType)).Result()
		if err != nil {
			return 0, err
		}
		success := parseInt64(data["success_count_1h"])
		failure := parseInt64(data["failure_count_1h"])
		total := success + failure

		if metricType == "success_rate" {
			if total == 0 {
				return 100, nil // no data counts as a 100% success rate
			}
			return float64(success) / float64(total) * 100, nil
		}

		if total == 0 {
			return 0, nil
		}
		duration := parseInt64(data["total_duration_1h"])
		return float64(duration) / float64(total), nil

	case "concurrency":
		currentKey := constants.RedisPollingConcurrencyCurrentKey(taskType)
		current, err := s.redis.Get(ctx, currentKey).Int64()
		if err == redis.Nil {
			// A missing key means nothing is running right now.
			return 0, nil
		}
		if err != nil {
			return 0, err
		}
		return float64(current), nil

	default:
		return 0, errors.New(errors.CodeInvalidParam, "未知的指标类型")
	}
}
// triggerAlert records a fired alert and dispatches its notifications.
//
// It builds the alert message, stores a history row with notification status
// "pending", logs the alert at warn level, and then calls sendNotifications.
// Only a failure to store the history row is returned as an error.
func (s *AlertService) triggerAlert(ctx context.Context, rule *model.PollingAlertRule, currentValue float64) error {
	msg := s.buildAlertMessage(rule, currentValue)

	// Snapshot the rule's settings into the history row.
	var record model.PollingAlertHistory
	record.RuleID = rule.ID
	record.TaskType = rule.TaskType
	record.MetricType = rule.MetricType
	record.AlertLevel = rule.AlertLevel
	record.Threshold = rule.Threshold
	record.CurrentValue = currentValue
	record.AlertMessage = msg
	record.NotificationChannels = rule.NotificationChannels
	record.NotificationStatus = "pending"

	if err := s.historyStore.Create(ctx, &record); err != nil {
		return err
	}

	logFields := []zap.Field{
		zap.Uint("rule_id", rule.ID),
		zap.String("rule_name", rule.RuleName),
		zap.String("task_type", rule.TaskType),
		zap.String("metric_type", rule.MetricType),
		zap.String("level", rule.AlertLevel),
		zap.Float64("threshold", rule.Threshold),
		zap.Float64("current_value", currentValue),
	}
	s.logger.Warn("触发告警", logFields...)

	// Send notifications (email, SMS, webhook, ...).
	s.sendNotifications(ctx, rule, &record, msg)
	return nil
}
// sendNotifications delivers an alert to every channel configured on the
// rule and then stores the aggregate outcome on the history row.
//
// Channels are dispatched concurrently and the call blocks until all of them
// finish. Recognised channels are "email", "sms" and "webhook"; an unknown
// channel is logged and counted as neither success nor failure. The stored
// status is "sent" when every counted channel succeeded, "partial" when some
// succeeded and some failed, and "failed" when none succeeded.
func (s *AlertService) sendNotifications(ctx context.Context, rule *model.PollingAlertRule, history *model.PollingAlertHistory, message string) {
	targets := parseNotificationChannels(rule.NotificationChannels)
	if len(targets) == 0 {
		s.logger.Debug("未配置通知渠道,跳过通知发送", zap.Uint("rule_id", rule.ID))
		return
	}

	var (
		pending           sync.WaitGroup
		guard             sync.Mutex // protects okCount and errCount
		okCount, errCount int
	)

	// deliver sends to one channel and records the outcome.
	deliver := func(name string) {
		defer pending.Done()

		var sendErr error
		switch name {
		case "email":
			sendErr = s.sendEmailNotification(ctx, rule, message)
		case "sms":
			sendErr = s.sendSMSNotification(ctx, rule, message)
		case "webhook":
			sendErr = s.sendWebhookNotification(ctx, rule, history)
		default:
			s.logger.Warn("未知的通知渠道", zap.String("channel", name))
			return
		}

		guard.Lock()
		defer guard.Unlock()
		if sendErr != nil {
			errCount++
			s.logger.Error("发送通知失败",
				zap.String("channel", name),
				zap.Uint("rule_id", rule.ID),
				zap.Error(sendErr))
			return
		}
		okCount++
		s.logger.Info("发送通知成功",
			zap.String("channel", name),
			zap.Uint("rule_id", rule.ID))
	}

	pending.Add(len(targets))
	for _, name := range targets {
		go deliver(name)
	}
	pending.Wait()

	// Derive the aggregate notification status.
	status := "failed"
	switch {
	case okCount > 0 && errCount == 0:
		status = "sent"
	case okCount > 0:
		status = "partial"
	}

	if err := s.historyStore.UpdateNotificationStatus(ctx, history.ID, status); err != nil {
		s.logger.Warn("更新通知状态失败", zap.Uint("history_id", history.ID), zap.Error(err))
	}
}
// parseNotificationChannels parses a rule's notification channel setting.
//
// The input is either a JSON array of strings or a comma-separated list such
// as "email,sms,webhook". An empty input yields nil. A value that decodes as
// a JSON string array is returned exactly as decoded. Otherwise the value is
// split on commas, each piece is trimmed, and empty pieces are dropped.
func parseNotificationChannels(channels string) []string {
	if len(channels) == 0 {
		return nil
	}

	// Prefer the JSON array form when it decodes cleanly.
	var fromJSON []string
	if sonic.UnmarshalString(channels, &fromJSON) == nil {
		return fromJSON
	}

	// Fall back to the comma-separated form.
	pieces := strings.Split(channels, ",")
	names := make([]string, 0, len(pieces))
	for i := range pieces {
		name := strings.TrimSpace(pieces[i])
		if name == "" {
			continue
		}
		names = append(names, name)
	}
	return names
}
// getWebhookURLFromConfig extracts the webhook URL from a rule's
// notification config, a JSON object of the form
// {"webhook_url": "https://example.com/webhook"}.
//
// It returns "" when the config is empty, is not valid JSON, or has no
// string-valued "webhook_url" entry.
func getWebhookURLFromConfig(config string) string {
	if config == "" {
		return ""
	}

	var settings map[string]any
	if sonic.UnmarshalString(config, &settings) != nil {
		return ""
	}

	// A missing key or a non-string value leaves endpoint as "".
	endpoint, _ := settings["webhook_url"].(string)
	return endpoint
}
// sendEmailNotification is the email delivery hook for an alert.
//
// TODO: integrate a mail service. For now it only logs the message and always
// returns nil; real delivery needs an SMTP service to be configured.
func (s *AlertService) sendEmailNotification(_ context.Context, rule *model.PollingAlertRule, message string) error {
	logFields := []zap.Field{
		zap.Uint("rule_id", rule.ID),
		zap.String("message", message),
	}
	s.logger.Info("邮件通知(待实现)", logFields...)
	return nil
}
// sendSMSNotification is the SMS delivery hook for an alert.
//
// TODO: integrate an SMS service. For now it only logs the message and always
// returns nil; real delivery needs an SMS gateway to be configured.
func (s *AlertService) sendSMSNotification(_ context.Context, rule *model.PollingAlertRule, message string) error {
	logFields := []zap.Field{
		zap.Uint("rule_id", rule.ID),
		zap.String("message", message),
	}
	s.logger.Info("短信通知(待实现)", logFields...)
	return nil
}
// sendWebhookNotification POSTs the alert as JSON to the webhook URL found in
// the rule's notification config.
//
// When no webhook URL is configured the call is a no-op that returns nil. The
// request uses a 10 second client timeout and is bound to ctx. Any response
// status outside 2xx is reported as an error.
func (s *AlertService) sendWebhookNotification(ctx context.Context, rule *model.PollingAlertRule, history *model.PollingAlertHistory) error {
	target := getWebhookURLFromConfig(rule.NotificationConfig)
	if target == "" {
		s.logger.Debug("未配置 Webhook URL跳过发送", zap.Uint("rule_id", rule.ID))
		return nil
	}

	// Assemble the alert payload.
	payload := make(map[string]any, 9)
	payload["rule_id"] = rule.ID
	payload["rule_name"] = rule.RuleName
	payload["task_type"] = rule.TaskType
	payload["metric_type"] = rule.MetricType
	payload["alert_level"] = rule.AlertLevel
	payload["threshold"] = rule.Threshold
	payload["current_value"] = history.CurrentValue
	payload["message"] = history.AlertMessage
	payload["triggered_at"] = time.Now().Format(time.RFC3339)

	body, err := sonic.Marshal(payload)
	if err != nil {
		return fmt.Errorf("序列化告警数据失败: %w", err)
	}

	// Build and send the HTTP POST.
	request, err := http.NewRequestWithContext(ctx, http.MethodPost, target, bytes.NewReader(body))
	if err != nil {
		return fmt.Errorf("创建请求失败: %w", err)
	}
	request.Header.Set("Content-Type", "application/json")

	httpClient := http.Client{Timeout: 10 * time.Second}
	response, err := httpClient.Do(request)
	if err != nil {
		return fmt.Errorf("发送请求失败: %w", err)
	}
	defer response.Body.Close()

	code := response.StatusCode
	if is2xx := code >= 200 && code < 300; !is2xx {
		return fmt.Errorf("Webhook 返回错误状态码: %d", code)
	}

	s.logger.Info("Webhook 通知发送成功",
		zap.Uint("rule_id", rule.ID),
		zap.String("url", target),
		zap.Int("status_code", code))
	return nil
}
// buildAlertMessage composes the human-readable alert text from the display
// names of the rule's task type and metric type, followed by the current
// value and the threshold as rendered by formatFloat.
func (s *AlertService) buildAlertMessage(rule *model.PollingAlertRule, currentValue float64) string {
	segments := []string{
		s.getTaskTypeName(rule.TaskType),
		"的",
		s.getMetricTypeName(rule.MetricType),
		"已触发告警: ",
		"当前值 ",
		formatFloat(currentValue),
		", 阈值 ",
		formatFloat(rule.Threshold),
	}
	return strings.Join(segments, "")
}
// getTaskTypeName maps a polling task type constant to its display name.
// Unrecognised task types are returned unchanged.
func (s *AlertService) getTaskTypeName(taskType string) string {
	displayNames := map[string]string{
		constants.TaskTypePollingRealname: "实名检查",
		constants.TaskTypePollingCarddata: "流量检查",
		constants.TaskTypePollingPackage:  "套餐检查",
	}
	if label, known := displayNames[taskType]; known {
		return label
	}
	return taskType
}
// getMetricTypeName maps a metric type identifier to its display name.
// Unrecognised metric types are returned unchanged.
func (s *AlertService) getMetricTypeName(metricType string) string {
	displayNames := map[string]string{
		"queue_size":   "队列积压",
		"success_rate": "成功率",
		"avg_duration": "平均耗时",
		"concurrency":  "并发数",
	}
	if label, known := displayNames[metricType]; known {
		return label
	}
	return metricType
}
// formatFloat renders f as decimal text with exactly two digits after the
// decimal point (e.g. 65.5 -> "65.50").
//
// string(rune(n)) must not be used for this: it yields the Unicode character
// with code point n rather than the decimal digits of n.
func formatFloat(f float64) string {
	return fmt.Sprintf("%.2f", f)
}