背景

微服务架构下,服务间调用链路错综复杂。一旦出问题,没有可观测性支撑,排查起来就是噩梦。
可观测性三驾马车:日志(Logs)、指标(Metrics)、追踪(Traces)。
日志:结构化日志是基础

别再用 fmt.Printf 了,结构化日志才是正道:
import "github.com/rs/zerolog" func main() { log := zerolog.New(os.Stdout). With(). Timestamp(). Caller(). Logger() log.Info(). Str("service", "user-service"). Int("request_id", 12345). Msg("User login successful") } 输出:
{"level":"info","service":"user-service","request_id":12345,"time":"2026-04-11T10:00:00Z","caller":"main.go:25","message":"User login successful"}

指标:Prometheus + Grafana

import "github.com/prometheus/client_golang/prometheus"
import "github.com/prometheus/client_golang/prometheus/promhttp"

// HTTP request metrics updated by promMiddleware.
var (
	// httpRequests counts handled requests, labelled by method, path and
	// response status code.
	httpRequests = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "http_requests_total",
			Help: "Total HTTP requests",
		},
		[]string{"method", "path", "status"},
	)

	// httpDuration records request latency in seconds, labelled by method
	// and path, using prometheus.DefBuckets.
	httpDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "http_request_duration_seconds",
			Help:    "HTTP request duration in seconds",
			Buckets: prometheus.DefBuckets,
		},
		[]string{"method", "path"},
	)
)

// init registers both metric vectors via prometheus.MustRegister.
func init() {
	prometheus.MustRegister(httpRequests, httpDuration)
}

// Middleware example.
//
// promMiddleware wraps next so that every request increments httpRequests and
// records its wall-clock duration, in seconds, in httpDuration.
func promMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		start := time.Now()

		// responseWriter is declared elsewhere; presumably it captures the
		// status code written by next (200 unless overridden) — TODO confirm.
		rw := &responseWriter{ResponseWriter: w, statusCode: 200}
		next.ServeHTTP(rw, r)

		duration := time.Since(start).Seconds()

		// NOTE(review): the raw r.URL.Path is used as a label value, so every
		// distinct path creates a new series. Paths that embed IDs will grow
		// the label set without bound; consider a route template instead.
		httpRequests.WithLabelValues(r.Method, r.URL.Path, strconv.Itoa(rw.statusCode)).Inc()
		httpDuration.WithLabelValues(r.Method, r.URL.Path).Observe(duration)
	})
}

分布式追踪:OpenTelemetry

import "go.opentelemetry.io/otel"
import "go.opentelemetry.io/otel/exporters/jaeger"
import "go.opentelemetry.io/otel/sdk/trace"

// initTracer creates a Jaeger exporter, installs a batching, always-sampling
// tracer provider as the global provider, and returns a function that shuts
// the provider down with a 5-second timeout.
//
// On exporter construction failure it returns a nil function and the error.
//
// NOTE(review): the go.opentelemetry.io/otel/exporters/jaeger module appears
// to have been deprecated upstream in favour of the OTLP exporters — confirm
// against the OpenTelemetry-Go version this project pins.
func initTracer() (func(), error) {
	exp, err := jaeger.New(jaeger.WithAgentEndpoint())
	if err != nil {
		return nil, err
	}

	tp := trace.NewTracerProvider(
		trace.WithBatcher(exp),
		trace.WithSampler(trace.AlwaysSample()),
	)
	otel.SetTracerProvider(tp)

	return func() {
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()

		// The returned func has no error result, so report a failed flush or
		// shutdown through OpenTelemetry's error handler rather than dropping it.
		if err := tp.Shutdown(ctx); err != nil {
			otel.Handle(err)
		}
	}, nil
}

// Usage inside an HTTP handler.
//
// handleGetUser starts a "GetUser" span for the request, tags it with the
// requested user ID, loads the user, and hands the result to a background
// operation.
func handleGetUser(w http.ResponseWriter, r *http.Request) {
	ctx, span := otel.Tracer("user-service").Start(r.Context(), "GetUser")
	defer span.End()

	id := r.URL.Query().Get("id")
	span.SetAttributes(
		attribute.String("user.id", id),
	)

	user, err := getUserFromDB(ctx, id)
	if err != nil {
		span.RecordError(err)
		// ...
	}

	// Pass the context on to follow-up calls. The request context is cancelled
	// as soon as this handler returns, so detach cancellation while keeping the
	// context values, which include the span (context.WithoutCancel, Go 1.21+).
	go someAsyncOperation(context.WithoutCancel(ctx), user)
}

三者结合:一个完整示例

// UserService bundles a logger, a tracer, metrics and the user repository so
// that a single call can emit all three observability signals.
type UserService struct {
	logger   zerolog.Logger
	tracer   trace.Tracer
	metrics  *UserMetrics
	userRepo *UserRepository
}

// GetUser fetches the user with the given id from the repository while
// emitting a span, request/error/latency metrics, and structured logs that
// carry the trace ID.
//
// On failure the error is logged, recorded on the span, counted, and returned
// unchanged together with a nil user.
func (s *UserService) GetUser(ctx context.Context, id string) (*User, error) {
	// 1. Start tracing.
	ctx, span := s.tracer.Start(ctx, "UserService.GetUser")
	defer span.End()
	span.SetAttributes(attribute.String("user.id", id))
	traceID := span.SpanContext().TraceID().String()

	// 2. Record metrics.
	s.metrics.requests.Inc()
	// NOTE(review): UserMetrics is not visible here. If duration is a plain
	// prometheus.Histogram it has no NewTimer method and this should be
	// prometheus.NewTimer(s.metrics.duration) — confirm.
	timer := s.metrics.duration.NewTimer()
	// Deferred so that failed lookups are timed as well as successful ones.
	defer timer.ObserveDuration()

	// 3. Structured logging.
	s.logger.Info().
		Str("user_id", id).
		Str("trace_id", traceID).
		Msg("Fetching user")

	// 4. Business logic.
	user, err := s.userRepo.FindByID(ctx, id)
	if err != nil {
		// Record the error together with the trace context.
		s.logger.Error().
			Err(err).
			Str("user_id", id).
			Str("trace_id", traceID).
			Msg("Failed to fetch user")
		span.RecordError(err)
		s.metrics.errors.Inc()
		return nil, err
	}

	return user, nil
}

可视化:用 Grafana 大盘

常见 Dashboard 布局:
...