feat: 实现 IoT 卡轮询系统(支持千万级卡规模)
All checks were successful
构建并部署到测试环境(无 SSH) / build-and-deploy (push) Successful in 6m35s

实现功能:
- 实名状态检查轮询(可配置间隔)
- 卡流量检查轮询(支持跨月流量追踪)
- 套餐检查与超额自动停机
- 分布式并发控制(Redis 信号量)
- 手动触发轮询(单卡/批量/条件筛选)
- 数据清理配置与执行
- 告警规则与历史记录
- 实时监控统计(队列/性能/并发)

性能优化:
- Redis 缓存卡信息,减少 DB 查询
- Pipeline 批量写入 Redis
- 异步流量记录写入
- 渐进式初始化(10万卡/批)

压测工具(scripts/benchmark/):
- Mock Gateway 模拟上游服务
- 测试卡生成器
- 配置初始化脚本
- 实时监控脚本

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-05 17:32:44 +08:00
parent b11edde720
commit 931e140e8e
104 changed files with 16883 additions and 87 deletions

View File

@@ -0,0 +1,506 @@
package polling
import (
"bytes"
"context"
"fmt"
"net/http"
"strings"
"sync"
"time"
"github.com/bytedance/sonic"
"github.com/redis/go-redis/v9"
"go.uber.org/zap"
"github.com/break/junhong_cmp_fiber/internal/model"
"github.com/break/junhong_cmp_fiber/internal/store/postgres"
"github.com/break/junhong_cmp_fiber/pkg/constants"
"github.com/break/junhong_cmp_fiber/pkg/errors"
)
// AlertService 告警服务
type AlertService struct {
ruleStore *postgres.PollingAlertRuleStore
historyStore *postgres.PollingAlertHistoryStore
redis *redis.Client
logger *zap.Logger
}
// NewAlertService 创建告警服务实例
func NewAlertService(
ruleStore *postgres.PollingAlertRuleStore,
historyStore *postgres.PollingAlertHistoryStore,
redis *redis.Client,
logger *zap.Logger,
) *AlertService {
return &AlertService{
ruleStore: ruleStore,
historyStore: historyStore,
redis: redis,
logger: logger,
}
}
// CreateRule 创建告警规则
func (s *AlertService) CreateRule(ctx context.Context, rule *model.PollingAlertRule) error {
// 验证参数
if rule.RuleName == "" {
return errors.New(errors.CodeInvalidParam, "规则名称不能为空")
}
if rule.MetricType == "" {
return errors.New(errors.CodeInvalidParam, "指标类型不能为空")
}
if rule.TaskType == "" {
return errors.New(errors.CodeInvalidParam, "任务类型不能为空")
}
rule.Status = 1 // 默认启用
if rule.CooldownMinutes == 0 {
rule.CooldownMinutes = 5 // 默认5分钟冷却期
}
if rule.Operator == "" {
rule.Operator = ">" // 默认大于
}
return s.ruleStore.Create(ctx, rule)
}
// GetRule 获取告警规则
func (s *AlertService) GetRule(ctx context.Context, id uint) (*model.PollingAlertRule, error) {
rule, err := s.ruleStore.GetByID(ctx, id)
if err != nil {
return nil, errors.Wrap(errors.CodeNotFound, err, "告警规则不存在")
}
return rule, nil
}
// ListRules 获取告警规则列表
func (s *AlertService) ListRules(ctx context.Context) ([]*model.PollingAlertRule, error) {
return s.ruleStore.List(ctx)
}
// UpdateRule 更新告警规则
func (s *AlertService) UpdateRule(ctx context.Context, id uint, updates map[string]interface{}) error {
rule, err := s.ruleStore.GetByID(ctx, id)
if err != nil {
return errors.Wrap(errors.CodeNotFound, err, "告警规则不存在")
}
if name, ok := updates["rule_name"].(string); ok && name != "" {
rule.RuleName = name
}
if threshold, ok := updates["threshold"].(float64); ok {
rule.Threshold = threshold
}
if level, ok := updates["alert_level"].(string); ok {
rule.AlertLevel = level
}
if status, ok := updates["status"].(int); ok {
rule.Status = int16(status)
}
if cooldown, ok := updates["cooldown_minutes"].(int); ok {
rule.CooldownMinutes = cooldown
}
if channels, ok := updates["notification_channels"].(string); ok {
rule.NotificationChannels = channels
}
return s.ruleStore.Update(ctx, rule)
}
// DeleteRule 删除告警规则
func (s *AlertService) DeleteRule(ctx context.Context, id uint) error {
_, err := s.ruleStore.GetByID(ctx, id)
if err != nil {
return errors.Wrap(errors.CodeNotFound, err, "告警规则不存在")
}
return s.ruleStore.Delete(ctx, id)
}
// ListHistory 获取告警历史
func (s *AlertService) ListHistory(ctx context.Context, page, pageSize int, ruleID *uint) ([]*model.PollingAlertHistory, int64, error) {
if page < 1 {
page = 1
}
if pageSize < 1 || pageSize > 100 {
pageSize = 20
}
return s.historyStore.List(ctx, page, pageSize, ruleID)
}
// CheckAlerts 检查告警(定时调用)
func (s *AlertService) CheckAlerts(ctx context.Context) error {
rules, err := s.ruleStore.ListEnabled(ctx)
if err != nil {
return err
}
for _, rule := range rules {
if err := s.checkRule(ctx, rule); err != nil {
s.logger.Warn("检查告警规则失败",
zap.Uint("rule_id", rule.ID),
zap.String("rule_name", rule.RuleName),
zap.Error(err))
}
}
return nil
}
// checkRule 检查单个规则
func (s *AlertService) checkRule(ctx context.Context, rule *model.PollingAlertRule) error {
// 检查冷却期
if s.isInCooldown(ctx, rule) {
return nil
}
// 获取当前指标值
currentValue, err := s.getMetricValue(ctx, rule.TaskType, rule.MetricType)
if err != nil {
return err
}
// 判断是否触发告警
triggered := false
switch rule.Operator {
case ">":
triggered = currentValue > rule.Threshold
case ">=":
triggered = currentValue >= rule.Threshold
case "<":
triggered = currentValue < rule.Threshold
case "<=":
triggered = currentValue <= rule.Threshold
case "==":
triggered = currentValue == rule.Threshold
default:
triggered = currentValue > rule.Threshold
}
if triggered {
return s.triggerAlert(ctx, rule, currentValue)
}
return nil
}
// isInCooldown 检查是否在冷却期
func (s *AlertService) isInCooldown(ctx context.Context, rule *model.PollingAlertRule) bool {
if rule.CooldownMinutes <= 0 {
return false
}
history, err := s.historyStore.GetLatestByRuleID(ctx, rule.ID)
if err != nil {
return false // 没有历史记录,不在冷却期
}
cooldownEnd := history.CreatedAt.Add(time.Duration(rule.CooldownMinutes) * time.Minute)
return time.Now().Before(cooldownEnd)
}
// getMetricValue 获取指标值
func (s *AlertService) getMetricValue(ctx context.Context, taskType, metricType string) (float64, error) {
statsKey := constants.RedisPollingStatsKey(taskType)
data, err := s.redis.HGetAll(ctx, statsKey).Result()
if err != nil {
return 0, err
}
switch metricType {
case "queue_size":
// 获取队列大小
var queueKey string
switch taskType {
case constants.TaskTypePollingRealname:
queueKey = constants.RedisPollingQueueRealnameKey()
case constants.TaskTypePollingCarddata:
queueKey = constants.RedisPollingQueueCarddataKey()
case constants.TaskTypePollingPackage:
queueKey = constants.RedisPollingQueuePackageKey()
}
size, _ := s.redis.ZCard(ctx, queueKey).Result()
return float64(size), nil
case "success_rate":
success := parseInt64(data["success_count_1h"])
failure := parseInt64(data["failure_count_1h"])
total := success + failure
if total == 0 {
return 100, nil // 无数据时认为成功率 100%
}
return float64(success) / float64(total) * 100, nil
case "avg_duration":
success := parseInt64(data["success_count_1h"])
failure := parseInt64(data["failure_count_1h"])
total := success + failure
duration := parseInt64(data["total_duration_1h"])
if total == 0 {
return 0, nil
}
return float64(duration) / float64(total), nil
case "concurrency":
currentKey := constants.RedisPollingConcurrencyCurrentKey(taskType)
current, _ := s.redis.Get(ctx, currentKey).Int64()
return float64(current), nil
default:
return 0, errors.New(errors.CodeInvalidParam, "未知的指标类型")
}
}
// triggerAlert 触发告警
func (s *AlertService) triggerAlert(ctx context.Context, rule *model.PollingAlertRule, currentValue float64) error {
// 创建告警历史记录
alertMessage := s.buildAlertMessage(rule, currentValue)
history := &model.PollingAlertHistory{
RuleID: rule.ID,
TaskType: rule.TaskType,
MetricType: rule.MetricType,
AlertLevel: rule.AlertLevel,
Threshold: rule.Threshold,
CurrentValue: currentValue,
AlertMessage: alertMessage,
NotificationChannels: rule.NotificationChannels,
NotificationStatus: "pending",
}
if err := s.historyStore.Create(ctx, history); err != nil {
return err
}
s.logger.Warn("触发告警",
zap.Uint("rule_id", rule.ID),
zap.String("rule_name", rule.RuleName),
zap.String("task_type", rule.TaskType),
zap.String("metric_type", rule.MetricType),
zap.String("level", rule.AlertLevel),
zap.Float64("threshold", rule.Threshold),
zap.Float64("current_value", currentValue))
// 发送通知邮件、短信、Webhook 等)
s.sendNotifications(ctx, rule, history, alertMessage)
return nil
}
// sendNotifications 发送告警通知到配置的渠道
func (s *AlertService) sendNotifications(ctx context.Context, rule *model.PollingAlertRule, history *model.PollingAlertHistory, message string) {
channels := parseNotificationChannels(rule.NotificationChannels)
if len(channels) == 0 {
s.logger.Debug("未配置通知渠道,跳过通知发送", zap.Uint("rule_id", rule.ID))
return
}
var wg sync.WaitGroup
var successCount, failCount int
var mu sync.Mutex
for _, channel := range channels {
wg.Add(1)
go func(ch string) {
defer wg.Done()
var err error
switch ch {
case "email":
err = s.sendEmailNotification(ctx, rule, message)
case "sms":
err = s.sendSMSNotification(ctx, rule, message)
case "webhook":
err = s.sendWebhookNotification(ctx, rule, history)
default:
s.logger.Warn("未知的通知渠道", zap.String("channel", ch))
return
}
mu.Lock()
if err != nil {
failCount++
s.logger.Error("发送通知失败",
zap.String("channel", ch),
zap.Uint("rule_id", rule.ID),
zap.Error(err))
} else {
successCount++
s.logger.Info("发送通知成功",
zap.String("channel", ch),
zap.Uint("rule_id", rule.ID))
}
mu.Unlock()
}(channel)
}
wg.Wait()
// 更新通知状态
var status string
if successCount > 0 && failCount == 0 {
status = "sent"
} else if successCount > 0 {
status = "partial"
} else {
status = "failed"
}
if err := s.historyStore.UpdateNotificationStatus(ctx, history.ID, status); err != nil {
s.logger.Warn("更新通知状态失败", zap.Uint("history_id", history.ID), zap.Error(err))
}
}
// parseNotificationChannels 解析通知渠道配置
// 格式: "email,sms,webhook" 或 JSON 数组
func parseNotificationChannels(channels string) []string {
if channels == "" {
return nil
}
// 尝试解析为 JSON 数组
var result []string
if err := sonic.UnmarshalString(channels, &result); err == nil {
return result
}
// 按逗号分割
parts := strings.Split(channels, ",")
result = make([]string, 0, len(parts))
for _, p := range parts {
p = strings.TrimSpace(p)
if p != "" {
result = append(result, p)
}
}
return result
}
// getWebhookURLFromConfig 从配置中解析 Webhook URL
// 配置格式: {"webhook_url": "https://example.com/webhook"}
func getWebhookURLFromConfig(config string) string {
if config == "" {
return ""
}
var cfg map[string]any
if err := sonic.UnmarshalString(config, &cfg); err != nil {
return ""
}
if url, ok := cfg["webhook_url"].(string); ok {
return url
}
return ""
}
// sendEmailNotification 发送邮件通知
func (s *AlertService) sendEmailNotification(_ context.Context, rule *model.PollingAlertRule, message string) error {
// TODO: 集成邮件服务
// 当前仅记录日志,实际发送需要配置 SMTP 服务
s.logger.Info("邮件通知(待实现)",
zap.Uint("rule_id", rule.ID),
zap.String("message", message))
return nil
}
// sendSMSNotification 发送短信通知
func (s *AlertService) sendSMSNotification(_ context.Context, rule *model.PollingAlertRule, message string) error {
// TODO: 集成短信服务
// 当前仅记录日志,实际发送需要配置短信网关
s.logger.Info("短信通知(待实现)",
zap.Uint("rule_id", rule.ID),
zap.String("message", message))
return nil
}
// sendWebhookNotification 发送 Webhook 通知
func (s *AlertService) sendWebhookNotification(ctx context.Context, rule *model.PollingAlertRule, history *model.PollingAlertHistory) error {
// 从规则配置中获取 Webhook URL
webhookURL := getWebhookURLFromConfig(rule.NotificationConfig)
if webhookURL == "" {
s.logger.Debug("未配置 Webhook URL跳过发送", zap.Uint("rule_id", rule.ID))
return nil
}
// 构建告警数据
payload := map[string]any{
"rule_id": rule.ID,
"rule_name": rule.RuleName,
"task_type": rule.TaskType,
"metric_type": rule.MetricType,
"alert_level": rule.AlertLevel,
"threshold": rule.Threshold,
"current_value": history.CurrentValue,
"message": history.AlertMessage,
"triggered_at": time.Now().Format(time.RFC3339),
}
jsonData, err := sonic.Marshal(payload)
if err != nil {
return fmt.Errorf("序列化告警数据失败: %w", err)
}
// 发送 HTTP POST 请求
req, err := http.NewRequestWithContext(ctx, http.MethodPost, webhookURL, bytes.NewReader(jsonData))
if err != nil {
return fmt.Errorf("创建请求失败: %w", err)
}
req.Header.Set("Content-Type", "application/json")
client := &http.Client{Timeout: 10 * time.Second}
resp, err := client.Do(req)
if err != nil {
return fmt.Errorf("发送请求失败: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return fmt.Errorf("Webhook 返回错误状态码: %d", resp.StatusCode)
}
s.logger.Info("Webhook 通知发送成功",
zap.Uint("rule_id", rule.ID),
zap.String("url", webhookURL),
zap.Int("status_code", resp.StatusCode))
return nil
}
// buildAlertMessage 构建告警消息
func (s *AlertService) buildAlertMessage(rule *model.PollingAlertRule, currentValue float64) string {
taskTypeName := s.getTaskTypeName(rule.TaskType)
metricTypeName := s.getMetricTypeName(rule.MetricType)
return taskTypeName + "的" + metricTypeName + "已触发告警: " +
"当前值 " + formatFloat(currentValue) + ", 阈值 " + formatFloat(rule.Threshold)
}
func (s *AlertService) getTaskTypeName(taskType string) string {
switch taskType {
case constants.TaskTypePollingRealname:
return "实名检查"
case constants.TaskTypePollingCarddata:
return "流量检查"
case constants.TaskTypePollingPackage:
return "套餐检查"
default:
return taskType
}
}
func (s *AlertService) getMetricTypeName(metricType string) string {
switch metricType {
case "queue_size":
return "队列积压"
case "success_rate":
return "成功率"
case "avg_duration":
return "平均耗时"
case "concurrency":
return "并发数"
default:
return metricType
}
}
func formatFloat(f float64) string {
// 简单格式化保留2位小数
return string(rune(int(f))) + "." + string(rune(int(f*100)%100))
}