monitoring-observability

📁 rohitg00/awesome-claude-code-toolkit 📅 2 days ago

总安装量

周安装量

#51433

全站排名

安装命令

npx skills add https://github.com/rohitg00/awesome-claude-code-toolkit --skill monitoring-observability

Agent 安装分布

replit 1

trae 1

trae-cn 1

claude-code 1

Skill 文档

Monitoring & Observability

OpenTelemetry Setup

import { NodeSDK } from "@opentelemetry/sdk-node";
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
import { OTLPMetricExporter } from "@opentelemetry/exporter-metrics-otlp-http";
import { HttpInstrumentation } from "@opentelemetry/instrumentation-http";
import { PgInstrumentation } from "@opentelemetry/instrumentation-pg";
import { PeriodicExportingMetricReader } from "@opentelemetry/sdk-metrics";

// Configure the OpenTelemetry Node SDK for the order service: traces and
// metrics are exported over OTLP/HTTP to the collector, and HTTP and
// PostgreSQL client calls are instrumented automatically.
const sdk = new NodeSDK({
  serviceName: "order-service",
  traceExporter: new OTLPTraceExporter({
    url: "http://otel-collector:4318/v1/traces",
  }),
  metricReader: new PeriodicExportingMetricReader({
    exporter: new OTLPMetricExporter({
      url: "http://otel-collector:4318/v1/metrics",
    }),
    // Push metrics every 15 seconds.
    exportIntervalMillis: 15000,
  }),
  instrumentations: [
    new HttpInstrumentation(),
    new PgInstrumentation(),
  ],
});

sdk.start();

// Flush pending telemetry on SIGTERM. Registering a SIGTERM listener disables
// Node's default "exit on SIGTERM" behaviour, so the handler must terminate the
// process itself, and it must handle a failed shutdown instead of leaving an
// unhandled promise rejection behind.
process.on("SIGTERM", () => {
  sdk
    .shutdown()
    .catch((error: unknown) => {
      console.error("Error shutting down OpenTelemetry SDK", error);
    })
    .finally(() => process.exit(0));
});

Custom Spans and Metrics

import { trace, metrics, SpanStatusCode } from "@opentelemetry/api";

// Tracer and meter share one instrumentation scope name.
const INSTRUMENTATION_SCOPE = "order-service";

const tracer = trace.getTracer(INSTRUMENTATION_SCOPE);
const meter = metrics.getMeter(INSTRUMENTATION_SCOPE);

// Counter of created orders.
const orderCounter = meter.createCounter(
  "orders.created",
  { description: "Number of orders created" },
);

// Histogram of order processing time, recorded in milliseconds.
const orderDuration = meter.createHistogram(
  "orders.processing_duration_ms",
  {
    unit: "ms",
    description: "Order processing duration in milliseconds",
  },
);

/**
 * Creates an order inside a "createOrder" span and records order metrics.
 *
 * On success the order counter is incremented with `status: "success"`, the
 * time spent in `db.order.create` is recorded, and the span status is set to OK.
 * On failure the exception is recorded on the span, the span status is set to
 * ERROR, the counter is incremented with `status: "error"`, and the original
 * thrown value is rethrown unchanged.
 *
 * @param input - Order payload; `customerId` and `items.length` are attached
 *   to the span as attributes and the whole object is passed to the database.
 * @returns The value resolved by `db.order.create`.
 * @throws Whatever `db.order.create` (or attribute access on `input`) throws.
 */
async function createOrder(input: CreateOrderInput) {
  return tracer.startActiveSpan("createOrder", async (span) => {
    try {
      span.setAttributes({
        "order.customer_id": input.customerId,
        "order.item_count": input.items.length,
      });

      const start = performance.now();
      const order = await db.order.create({ data: input });

      orderCounter.add(1, { status: "success" });
      orderDuration.record(performance.now() - start);

      span.setStatus({ code: SpanStatusCode.OK });
      return order;
    } catch (error: unknown) {
      // A thrown value is not guaranteed to be an Error (it may even be
      // null), so narrow before reading `.message`; otherwise the handler
      // itself could throw and mask the original failure.
      const message = error instanceof Error ? error.message : String(error);

      // Attach the exception (with stack trace when available) to the span.
      span.recordException(error instanceof Error ? error : message);
      span.setStatus({ code: SpanStatusCode.ERROR, message });
      orderCounter.add(1, { status: "error" });
      throw error;
    } finally {
      // Always end the span, on both the success and the failure path.
      span.end();
    }
  });
}

Prometheus Metrics

# prometheus.yml
global:
  # Default scrape interval applied to the jobs below.
  scrape_interval: 15s

scrape_configs:
  # Application metrics exposed by the API servers.
  - job_name: "api-servers"
    metrics_path: /metrics
    static_configs:
      - targets:
          - "api-1:9090"
          - "api-2:9090"

  # Metrics from the node-exporter target.
  - job_name: "node-exporter"
    static_configs:
      - targets:
          - "node-exporter:9100"

import { collectDefaultMetrics, Counter, Histogram, Registry } from "prom-client";

// Dedicated registry that holds every metric defined in this module; it is
// passed to collectDefaultMetrics and to the histogram below.
const registry = new Registry();
collectDefaultMetrics({ register: registry });

// Histogram of HTTP request durations in seconds, labelled by method, route
// and status. Bucket upper bounds range from 10 ms to 5 s.
const httpRequestDuration = new Histogram({
  name: "http_request_duration_seconds",
  help: "HTTP request duration in seconds",
  labelNames: ["method", "route", "status"],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 5],
  registers: [registry],
});

// Times every request and records it in the httpRequestDuration histogram
// once the response has finished.
app.use((req, res, next) => {
  const end = httpRequestDuration.startTimer();
  res.on("finish", () => {
    end({
      method: req.method,
      // Use the matched route pattern only. Falling back to the raw request
      // path would create one label value per distinct URL (e.g. scanners
      // probing random 404 paths), i.e. unbounded label cardinality, so
      // requests that matched no route share a single constant value.
      route: req.route?.path ?? "unmatched",
      status: res.statusCode,
    });
  });
  next();
});

// Scrape endpoint: serialises every metric in the registry.
app.get("/metrics", async (req, res) => {
  try {
    // Collect first so that a failure cannot leave a partially written response.
    const body = await registry.metrics();
    res.set("Content-Type", registry.contentType);
    res.end(body);
  } catch (error: unknown) {
    // A rejection inside an async handler is not caught by the framework here,
    // so answer with a 500 instead of leaving the request hanging.
    res.status(500).end(error instanceof Error ? error.message : String(error));
  }
});

Structured Logging

import pino from "pino";

// Shared pino logger. The level comes from LOG_LEVEL (default "info"), the
// level formatter emits the level label under the "level" key, and the listed
// paths are redacted from log output.
const logger = pino({
  redact: ["req.headers.authorization", "password", "token"],
  formatters: {
    level(label) {
      return { level: label };
    },
  },
  level: process.env.LOG_LEVEL ?? "info",
});

/**
 * Middleware that emits one structured log entry per request once the
 * response has finished: method, URL, status code, elapsed milliseconds, and
 * the value of the incoming `x-trace-id` header.
 */
function requestLogger(req, res, next) {
  const startedAt = Date.now();

  const logCompletion = () => {
    const { method, url } = req;
    const status = res.statusCode;
    const duration_ms = Date.now() - startedAt;
    const trace_id = req.headers["x-trace-id"];
    logger.info({ method, url, status, duration_ms, trace_id });
  };

  res.on("finish", logCompletion);
  next();
}

Alerting Rules

groups:
  - name: api-alerts
    rules:
      # Share of 5xx responses per route. Both sides are aggregated by route
      # before dividing: without aggregation each 5xx series is matched only
      # against itself (identical label sets), so the ratio is always 1 and
      # the alert fires whenever any 5xx traffic exists.
      - alert: HighErrorRate
        expr: |
          sum by (route) (rate(http_request_duration_seconds_count{status=~"5.."}[5m]))
            /
          sum by (route) (rate(http_request_duration_seconds_count[5m]))
            > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Error rate above 5% for {{ $labels.route }}"

      # p99 latency per route. Buckets are summed across method and status
      # (keeping `le`, which histogram_quantile requires) so the quantile is
      # computed over all requests to a route.
      - alert: HighLatency
        expr: histogram_quantile(0.99, sum by (le, route) (rate(http_request_duration_seconds_bucket[5m]))) > 2
        for: 10m
        labels:
          severity: warning

Anti-Patterns

Logging sensitive data (passwords, tokens, PII) without redaction
Using string interpolation in log messages instead of structured fields
Creating unbounded cardinality in metric labels (e.g., user IDs as labels)
Not correlating logs and traces with a shared trace ID
Alerting on causes (e.g., high CPU) instead of user-facing symptoms (error rate, latency)
Missing SLO definitions before building dashboards

Checklist

OpenTelemetry SDK initialized with auto-instrumentation for HTTP, DB, and messaging
Custom spans added for business-critical operations
Metrics use bounded label cardinality
Structured logging with JSON output and secret redaction
Trace context propagated across service boundaries
Alerting rules based on SLOs (error rate, latency percentiles)
Dashboards show RED metrics (Rate, Errors, Duration) per service
Log retention and rotation policies configured

GitHub 仓库 ↗ ← 返回陌讯 Skills 聚合平台