SkyWalking链路追踪完整指南
一、SkyWalking架构与原理
1.1 SkyWalking全景架构
数据流:
[应用] → [Java Agent/其他语言Agent] → [OAP Server] → [存储] → [UI]
↓
[告警] [集群管理] [配置]
1.2 核心组件
| 组件 | 作用 | 部署方式 |
|---|---|---|
| Agent | 收集应用数据,字节码增强 | 与应用一起部署 |
| OAP Server | 接收、分析、聚合数据 | 独立部署,可集群 |
| Storage | 存储数据(ES/H2/MySQL等) | 独立部署 |
| UI | 可视化展示 | 独立部署 |
二、SkyWalking部署与配置
2.1 快速部署(Docker方式)
# docker-compose-skywalking.yml
# Single-node SkyWalking stack: Elasticsearch (storage) -> OAP Server -> UI.
version: '3.8'

services:
  # Elasticsearch storage backend
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.17.3
    container_name: skywalking-es
    restart: always
    ports:
      - "9200:9200"
    environment:
      - discovery.type=single-node
      - ES_JAVA_OPTS=-Xms2g -Xmx2g
      - xpack.security.enabled=false
    volumes:
      - es_data:/usr/share/elasticsearch/data
    # Reports healthy only once the cluster answers HTTP, so that OAP does not
    # start (and crash-loop) while Elasticsearch is still booting.
    healthcheck:
      test: ["CMD-SHELL", "curl --silent --fail http://localhost:9200/_cluster/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    networks:
      - skywalking-net

  # OAP Server
  oap:
    image: apache/skywalking-oap-server:9.4.0
    container_name: skywalking-oap
    # A plain `depends_on` list only orders container creation; the condition
    # makes Compose wait for the healthcheck above.
    depends_on:
      elasticsearch:
        condition: service_healthy
    restart: always
    ports:
      - "11800:11800"  # gRPC receiver for agent data
      - "12800:12800"  # HTTP receiver for agent data
    environment:
      - SW_STORAGE=elasticsearch
      - SW_STORAGE_ES_CLUSTER_NODES=elasticsearch:9200
      - SW_CLUSTER=standalone
      - JAVA_OPTS=-Xms2g -Xmx2g
    # Both host files must exist before `up`: Docker creates a directory for a
    # missing bind-mount source, which OAP then cannot read as a file.
    volumes:
      - ./config/alarm-settings.yml:/skywalking/config/alarm-settings.yml
      - ./config/oap-log4j2.xml:/skywalking/config/log4j2.xml
    networks:
      - skywalking-net

  # UI
  ui:
    image: apache/skywalking-ui:9.4.0
    container_name: skywalking-ui
    depends_on:
      - oap
    restart: always
    ports:
      - "8080:8080"
    environment:
      - SW_OAP_ADDRESS=http://oap:12800
    networks:
      - skywalking-net

volumes:
  es_data:

networks:
  skywalking-net:
    driver: bridge
启动命令:
docker-compose -f docker-compose-skywalking.yml up -d
2.2 手动部署(生产环境)
2.2.1 下载与解压
# Download the SkyWalking APM distribution (OAP Server + UI).
# Releases that are no longer current are served from the Apache archive;
# dlcdn.apache.org only mirrors the latest releases.
wget https://archive.apache.org/dist/skywalking/9.4.0/apache-skywalking-apm-9.4.0.tar.gz
tar -zxvf apache-skywalking-apm-9.4.0.tar.gz
cd apache-skywalking-apm-bin
# Directory layout
# ├── bin/       # startup scripts
# ├── config/    # configuration files
# ├── oap-libs/  # OAP dependencies
# └── webapp/    # UI application
# NOTE: since 8.8.0 the Java agent is released separately (skywalking-java) and
# is no longer bundled as an agent/ directory here; download it on its own.
2.2.2 OAP Server配置
# config/application.yml
# NOTE(review): the key set below mixes options from several SkyWalking
# generations; verify each key against the application.yml shipped with the
# release you deploy.
cluster:
  # Cluster coordinator: standalone, kubernetes, zookeeper
  selector: ${SW_CLUSTER:standalone}
  standalone:

core:
  selector: ${SW_CORE:default}
  default:
    # Storage selection
    storage: ${SW_STORAGE:elasticsearch}
    # Sampling rate with 1/10000 precision: 10000 = 100%
    # (kept consistent with receiver-trace.sampleRate below)
    sampleRate: ${SW_SAMPLE_RATE:10000}
    # Slow-service threshold (milliseconds)
    slowServiceThreshold: ${SW_SERVICE_SLOW_THRESHOLD:2000}
    # Maximum number of services
    serviceLimit: ${SW_SERVICE_LIMIT:1000}
    # Maximum number of endpoints
    endpointLimit: ${SW_ENDPOINT_LIMIT:1000}

storage:
  selector: ${SW_STORAGE:elasticsearch}
  elasticsearch:
    namespace: ${SW_NAMESPACE:"skywalking"}
    clusterNodes: ${SW_STORAGE_ES_CLUSTER_NODES:localhost:9200}
    user: ${SW_ES_USER:""}
    password: ${SW_ES_PASSWORD:""}
    indexShardsNumber: ${SW_STORAGE_ES_INDEX_SHARDS_NUMBER:2}
    indexReplicasNumber: ${SW_STORAGE_ES_INDEX_REPLICAS_NUMBER:0}
    # Index data retention
    # Record data (traces, logs): days
    recordDataTTL: ${SW_STORAGE_ES_RECORD_DATA_TTL:3}
    # Minute-precision metrics: days
    minuteMetricsDataTTL: ${SW_STORAGE_ES_MINUTE_METRIC_DATA_TTL:5}
    # Hour-precision metrics: days
    hourMetricsDataTTL: ${SW_STORAGE_ES_HOUR_METRIC_DATA_TTL:6}
    # Day-precision metrics: days
    dayMetricsDataTTL: ${SW_STORAGE_ES_DAY_METRIC_DATA_TTL:10}
    # Month-precision metrics: months
    monthMetricsDataTTL: ${SW_STORAGE_ES_MONTH_METRIC_DATA_TTL:1}

receiver-sharing-server:
  selector: ${SW_RECEIVER_SHARING_SERVER:default}
  default:
    # gRPC settings
    gRPCHost: ${SW_GRPC_HOST:0.0.0.0}
    gRPCPort: ${SW_GRPC_PORT:11800}
    maxConcurrentCallsPerConnection: ${SW_GRPC_MAX_CONCURRENT_CALL:10}
    maxMessageSize: ${SW_GRPC_MAX_MESSAGE_SIZE:10485760}
    # REST settings
    restHost: ${SW_REST_HOST:0.0.0.0}
    restPort: ${SW_REST_PORT:12800}
    restContextPath: ${SW_REST_CONTEXT_PATH:/}
    restMaxThreads: ${SW_REST_MAX_THREADS:200}
    restIdleTimeOut: ${SW_REST_IDLE_TIMEOUT:30000}
    restAcceptQueueSize: ${SW_REST_QUEUE_SIZE:0}

receiver-trace:
  selector: ${SW_RECEIVER_TRACE:default}
  default:
    # Sampling rate with 1/10000 precision: 10000 = 100%
    sampleRate: ${SW_TRACE_SAMPLE_RATE:10000}
    # Slow trace segment threshold (milliseconds)
    slowTraceSegmentThreshold: ${SW_SLOW_TRACE_SEGMENT_THRESHOLD:1000}

# JVM metrics receiver
receiver-jvm:
  selector: ${SW_RECEIVER_JVM:default}

# Profiling task receiver (not service-instance properties)
receiver-profile:
  selector: ${SW_RECEIVER_PROFILE:default}

# Log receiver
receiver-log:
  selector: ${SW_RECEIVER_LOG:default}
  default:
    # Log analyzer
    analyzer: ${SW_LOG_ENHANCE_ANALYZER:""}
    # Log sampling rate
    samplingRate: ${SW_LOG_SAMPLING_RATE:10000}
2.2.3 启动OAP Server
# Linux
cd apache-skywalking-apm-bin
# Optional, run once before the first start (e.g. ahead of a cluster rollout):
# init mode creates the storage structures and then exits. It is not a
# foreground start of the server.
./bin/oapServiceInit.sh
# Start the OAP Server (runs in the background)
./bin/oapService.sh
# Follow the server log
tail -f logs/skywalking-oap-server.log
2.2.4 启动UI
cd apache-skywalking-apm-bin
# Start the UI (runs in the background)
./bin/webappService.sh
# Alternatively, ./bin/startup.sh launches the OAP Server and the UI together.
# The distribution ships no webappServiceInit.sh script.
# Open: http://localhost:8080
三、应用集成(Java Agent)
3.1 无侵入集成方式
3.1.1 启动参数方式
# Attach the SkyWalking agent when starting the Java application.
# Each -Dskywalking.<key> system property sets the agent setting <key>.
java -javaagent:/path/to/skywalking-agent/skywalking-agent.jar \
-Dskywalking.agent.service_name=order-service \
-Dskywalking.collector.backend_service=192.168.1.100:11800 \
-jar order-service.jar
# Fuller example: also sets the instance name, the sampling limit
# (segments per 3 seconds) and the agent's own log file name and level.
java -javaagent:/opt/skywalking/agent/skywalking-agent.jar \
-Dskywalking.agent.service_name=order-service \
-Dskywalking.agent.instance_name=order-service-1 \
-Dskywalking.agent.sample_n_per_3_secs=10 \
-Dskywalking.collector.backend_service=192.168.1.100:11800 \
-Dskywalking.logging.file_name=skywalking-agent.log \
-Dskywalking.logging.level=INFO \
-jar /app/order-service.jar
3.1.2 Docker容器集成
# Dockerfile
FROM openjdk:11-jre-slim
# Install the SkyWalking agent
RUN mkdir -p /opt/skywalking/agent
COPY skywalking-agent /opt/skywalking/agent/
# Application JAR
COPY target/order-service.jar /app/order-service.jar
# Startup command.
# The exec (JSON-array) form of ENTRYPOINT performs no variable substitution, so
# "${HOSTNAME}" would reach the JVM literally. Run through `sh -c` so the shell
# expands the variables, and `exec` so java becomes PID 1 and receives signals.
# Shell default syntax is ${VAR:-default}; ${VAR:default} is the agent.config
# placeholder syntax and is not valid here.
ENTRYPOINT ["sh", "-c", "exec java \
-javaagent:/opt/skywalking/agent/skywalking-agent.jar \
-Dskywalking.agent.service_name=order-service \
-Dskywalking.agent.instance_name=${HOSTNAME} \
-Dskywalking.collector.backend_service=${SW_OAP_ADDRESS:-192.168.1.100:11800} \
-jar /app/order-service.jar"]
3.1.3 Kubernetes集成
# deployment.yaml
# Runs order-service with the SkyWalking agent delivered through a shared
# emptyDir volume that an init container fills.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: order-service
  labels:
    app: order-service
spec:
  replicas: 3
  selector:
    matchLabels:
      app: order-service
  template:
    metadata:
      labels:
        app: order-service
    spec:
      initContainers:
        # Init container: downloads the agent archive and unpacks it into the
        # shared volume.
        # NOTE(review): if the archive contains a top-level directory (e.g.
        # skywalking-agent/), the jar ends up one level below the path used in
        # -javaagent further down; confirm the archive layout.
        - name: skywalking-agent-init
          image: busybox:latest
          command: ['sh', '-c', 'wget -O /tmp/agent.tar.gz https://download-link && tar -zxvf /tmp/agent.tar.gz -C /agent']
          volumeMounts:
            - name: skywalking-agent
              mountPath: /agent
      containers:
        - name: order-service
          image: order-service:1.0.0
          env:
            - name: SW_AGENT_NAME
              value: "order-service"
            - name: SW_AGENT_COLLECTOR_BACKEND_SERVICES
              value: "skywalking-oap:11800"
          volumeMounts:
            - name: skywalking-agent
              mountPath: /opt/skywalking/agent
          command: ["java"]
          # $(VAR) references are expanded by Kubernetes from the env entries above.
          args:
            - "-javaagent:/opt/skywalking/agent/skywalking-agent.jar"
            - "-Dskywalking.agent.service_name=$(SW_AGENT_NAME)"
            - "-Dskywalking.collector.backend_service=$(SW_AGENT_COLLECTOR_BACKEND_SERVICES)"
            - "-jar"
            - "/app/order-service.jar"
      volumes:
        - name: skywalking-agent
          emptyDir: {}
3.2 Agent配置文件详解
# agent/config/agent.config
# Every value uses the ${ENV_VAR:default} placeholder form: the environment
# variable wins when set, otherwise the default after the colon is used.
# Service name
agent.service_name=${SW_AGENT_NAME:Your_Application_Name}
# Instance name (distinguishes instances of the same service)
agent.instance_name=${SW_AGENT_INSTANCE_NAME:Your_Application_Instance_Name}
# Sampling limit (number of samples per 3 seconds)
agent.sample_n_per_3_secs=${SW_AGENT_SAMPLE:10}
# Collector (OAP) address
collector.backend_service=${SW_AGENT_COLLECTOR_BACKEND_SERVICES:127.0.0.1:11800}
# Agent logging
logging.file_name=${SW_LOGGING_FILE_NAME:skywalking-agent.log}
logging.level=${SW_LOGGING_LEVEL:INFO}
logging.dir=${SW_LOGGING_DIR:logs}
# Ignored paths (URLs that are not traced)
# NOTE(review): the trace-ignore plugin may expect the key `trace.ignore_path`
# rather than `plugin.trace.ignore_path` - confirm against the agent docs.
plugin.trace.ignore_path=${SW_IGNORE_PATH:/actuator/**,/health,/metrics}
# Spring MVC endpoint naming and recorded HTTP parameter length
# (these two keys are not a slow-method threshold)
plugin.springmvc.use_qualified_name_as_endpoint_name=${SW_PLUGIN_SPRINGMVC_USE_QUALIFIED_NAME_AS_ENDPOINT_NAME:false}
plugin.http.http_params_length_threshold=${SW_PLUGIN_HTTP_PARAMS_LENGTH_THRESHOLD:1024}
# SQL parameter recording
plugin.jdbc.trace_sql_parameters=${SW_JDBC_TRACE_SQL_PARAMETERS:false}
plugin.jdbc.sql_parameters_max_length=${SW_JDBC_SQL_PARAMETERS_MAX_LENGTH:512}
# Thread pool monitoring
plugin.customize.enhance_thread_pool_executor=${SW_ENHANCE_THREAD_POOL_EXECUTOR:true}
plugin.customize.thread_pool_executor_class_prefix=${SW_THREAD_POOL_EXECUTOR_CLASS_PREFIX:org.springframework.scheduling.concurrent}
# Log reporting over gRPC
plugin.toolkit.log.grpc.reporter.server_host=${SW_GRPC_LOG_SERVER_HOST:127.0.0.1}
plugin.toolkit.log.grpc.reporter.server_port=${SW_GRPC_LOG_SERVER_PORT:11800}
plugin.toolkit.log.grpc.reporter.max_message_size=${SW_GRPC_LOG_MAX_MESSAGE_SIZE:1048576}
plugin.toolkit.log.grpc.reporter.upstream_timeout=${SW_GRPC_LOG_UPSTREAM_TIMEOUT:30}
# Configuration hot reload
agent.config.autofetch_period=${SW_AGENT_CONFIG_AUTOFETCH_PERIOD:60}
agent.dynamic_config.scheduler_check_interval=${SW_AGENT_DYNAMIC_CONFIG_SCHEDULER_CHECK_INTERVAL:60}
3.3 Spring Boot集成示例
3.3.1 Maven配置
<!-- pom.xml -->
<properties>
    <skywalking.version>9.0.0</skywalking.version>
    <!-- OAP address referenced by jvmArguments below. Without these definitions
         the placeholders would be passed to the JVM unresolved. Override on the
         command line with -Dskywalking.oap.host=... -Dskywalking.oap.port=... -->
    <skywalking.oap.host>127.0.0.1</skywalking.oap.host>
    <skywalking.oap.port>11800</skywalking.oap.port>
</properties>
<dependencies>
    <!-- SkyWalking toolkit (optional, for manual instrumentation) -->
    <dependency>
        <groupId>org.apache.skywalking</groupId>
        <artifactId>apm-toolkit-trace</artifactId>
        <version>${skywalking.version}</version>
    </dependency>
    <!-- Trace id in Logback output and log reporting to OAP -->
    <dependency>
        <groupId>org.apache.skywalking</groupId>
        <artifactId>apm-toolkit-logback-1.x</artifactId>
        <version>${skywalking.version}</version>
    </dependency>
    <!-- Micrometer meter registry -->
    <dependency>
        <groupId>org.apache.skywalking</groupId>
        <artifactId>apm-toolkit-micrometer-registry</artifactId>
        <version>${skywalking.version}</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <plugin>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-maven-plugin</artifactId>
            <configuration>
                <!-- Builds a fully executable jar with an embedded launch script -->
                <executable>true</executable>
                <!-- jvmArguments apply only to `mvn spring-boot:run`; the packaged
                     jar still needs -javaagent on its own command line. -->
                <jvmArguments>
                    -javaagent:/opt/skywalking/agent/skywalking-agent.jar
                    -Dskywalking.agent.service_name=${project.artifactId}
                    -Dskywalking.collector.backend_service=${skywalking.oap.host}:${skywalking.oap.port}
                </jvmArguments>
            </configuration>
        </plugin>
    </plugins>
</build>
3.3.2 应用配置
# application.yml
# NOTE(review): the SkyWalking Java agent is configured through agent.config,
# -Dskywalking.* system properties and SW_* environment variables. Confirm that
# something in the application actually binds this `skywalking:` section;
# otherwise it has no effect.
skywalking:
  agent:
    service-name: ${spring.application.name}
  collector:
    backend-service: ${SW_AGENT_COLLECTOR_BACKEND_SERVICES:localhost:11800}
  # Sampling limit
  sample:
    n-per-3-secs: 10
  # Resource suffixes excluded from tracing
  ignore-suffix: .jpg,.jpeg,.png,.gif,.css,.js,.html,.ico

spring:
  application:
    name: order-service
  # Sleuth integration: choose either Sleuth or SkyWalking, not both
  sleuth:
    enabled: false  # Sleuth disabled; SkyWalking does the tracing
3.3.3 Logback集成
<?xml version="1.0" encoding="UTF-8"?>
<!-- logback-spring.xml -->
<!-- The XML declaration must be the very first thing in the file; a comment in
     front of it makes the document ill-formed. -->
<configuration>
    <!-- Include the SkyWalking Logback defaults -->
    <!-- NOTE(review): confirm this resource exists in the toolkit jar you use;
         the appenders below do not depend on it. -->
    <include resource="org/apache/skywalking/apm/toolkit/log/logback-1.x/defaults.xml"/>

    <!-- Console output; %tid prints the SkyWalking trace id -->
    <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
        <encoder class="ch.qos.logback.core.encoder.LayoutWrappingEncoder">
            <layout class="org.apache.skywalking.apm.toolkit.log.logback.v1.x.TraceIdPatternLogbackLayout">
                <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%tid] [%thread] %-5level %logger{36} - %msg%n</pattern>
            </layout>
        </encoder>
    </appender>

    <!-- Ships log events to the SkyWalking OAP over gRPC -->
    <appender name="GRPC" class="org.apache.skywalking.apm.toolkit.log.logback.v1.x.log.GRPCLogClientAppender">
        <encoder class="ch.qos.logback.core.encoder.LayoutWrappingEncoder">
            <layout class="org.apache.skywalking.apm.toolkit.log.logback.v1.x.TraceIdPatternLogbackLayout">
                <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%tid] [%thread] %-5level %logger{36} - %msg%n</pattern>
            </layout>
        </encoder>
    </appender>

    <root level="INFO">
        <appender-ref ref="CONSOLE"/>
        <appender-ref ref="GRPC"/> <!-- send logs to the SkyWalking OAP -->
    </root>
</configuration>
四、手动埋点与自定义追踪
4.1 TraceContext API使用
import org.apache.skywalking.apm.toolkit.trace.ActiveSpan;
import org.apache.skywalking.apm.toolkit.trace.CarrierItemRef;
import org.apache.skywalking.apm.toolkit.trace.ContextCarrierRef;
import org.apache.skywalking.apm.toolkit.trace.RunnableWrapper;
import org.apache.skywalking.apm.toolkit.trace.SpanRef;
import org.apache.skywalking.apm.toolkit.trace.SupplierWrapper;
import org.apache.skywalking.apm.toolkit.trace.Tag;
import org.apache.skywalking.apm.toolkit.trace.Tags;
import org.apache.skywalking.apm.toolkit.trace.Trace;
import org.apache.skywalking.apm.toolkit.trace.TraceContext;
import org.apache.skywalking.apm.toolkit.trace.Tracer;
@Service
@Slf4j
public class OrderService {
/**
* 方法1:@Trace注解自动追踪
*/
@Trace
public Order createOrder(CreateOrderRequest request) {
// 自动记录方法执行时间和出入参
return processOrder(request);
}
/**
* 方法2:手动记录标签
*/
@Trace
@Tag(key = "order.id", value = "arg[0]") // 记录第一个参数
@Tag(key = "order.amount", value = "returnedObj.totalAmount") // 记录返回值属性
public Order getOrderById(Long orderId) {
// 业务逻辑
Order order = orderRepository.findById(orderId);
// 手动添加标签
ActiveSpan.tag("order.status", order.getStatus());
ActiveSpan.tag("user.id", String.valueOf(order.getUserId()));
return order;
}
/**
* 方法3:自定义操作名
*/
@Trace(operationName = "OrderService.createOrderAsync")
public CompletableFuture<Order> createOrderAsync(CreateOrderRequest request) {
return CompletableFuture.supplyAsync(() -> {
// 异步任务中获取TraceId
String traceId = TraceContext.traceId();
log.info("Async task traceId: {}", traceId);
return processOrder(request);
});
}
/**
* 方法4:记录错误信息
*/
@Trace
public void processPayment(Order order) {
try {
// 支付逻辑
paymentService.pay(order);
// 记录成功标签
ActiveSpan.tag("payment.result", "success");
} catch (Exception e) {
// 记录错误
ActiveSpan.error(e);
ActiveSpan.tag("payment.result", "failed");
ActiveSpan.tag("payment.error", e.getMessage());
throw new PaymentException("支付失败", e);
}
}
/**
* 方法5:手动创建本地跨度
*/
public void batchProcessOrders(List<Order> orders) {
// 获取TraceId
String traceId = TraceContext.traceId();
log.info("Batch process started, traceId: {}", traceId);
for (int i = 0; i < orders.size(); i++) {
Order order = orders.get(i);
// 创建本地跨度
ActiveSpan span = ActiveSpan.createLocalSpan("OrderService.processSingleOrder");
try {
// 记录订单信息
span.tag("order.index", String.valueOf(i));
span.tag("order.id", String.valueOf(order.getId()));
// 业务处理
processSingleOrder(order);
} finally {
// 必须结束跨度
span.close();
}
}
}
/**
* 方法6:跨线程传递Trace上下文
*/
public void asyncProcessWithTrace(Order order) {
// 获取当前Trace上下文
ContextCarrier carrier = new ContextCarrier();
// 创建异步任务
CompletableFuture.runAsync(() -> {
try {
// 在子线程中恢复Trace上下文
ContextManager.extract(carrier);
// 执行业务逻辑
processOrder(order);
} finally {
// 清理上下文
ContextManager.stopSpan();
}
});
}
/**
* 方法7:跨服务追踪
*/
@Trace
public Order createOrderWithCrossService(CreateOrderRequest request) {
// 创建订单
Order order = createOrder(request);
// 准备跨服务追踪的载体
ContextCarrier carrier = new ContextCarrier();
// 跨服务调用(HTTP示例)
HttpHeaders headers = new HttpHeaders();
// 注入Trace上下文到HTTP头
CarrierItem next = carrier.items();
while (next.hasNext()) {
next = next.next();
headers.add(next.getHeadKey(), next.getHeadValue());
}
// 调用库存服务
restTemplate.postForEntity(
"http://inventory-service/inventory/reduce",
new HttpEntity<>(request, headers),
Void.class
);
return order;
}
}
4.2 自定义拦截器
@Component
@Aspect
@Slf4j
public class SkyWalkingAspect {
/**
* 拦截所有Controller方法
*/
@Around("@within(org.springframework.web.bind.annotation.RestController) " +
"|| @within(org.springframework.stereotype.Controller)")
public Object traceController(ProceedingJoinPoint joinPoint) throws Throwable {
String className = joinPoint.getTarget().getClass().getSimpleName();
String methodName = joinPoint.getSignature().getName();
String operationName = className + "." + methodName;
// 创建本地跨度
ActiveSpan span = ActiveSpan.createLocalSpan(operationName);
try {
// 记录请求参数
Object[] args = joinPoint.getArgs();
if (args != null && args.length > 0) {
for (int i = 0; i < args.length; i++) {
if (args[i] != null) {
span.tag("param." + i, args[i].toString());
}
}
}
// 执行原方法
Object result = joinPoint.proceed();
// 记录返回值类型
span.tag("return.type", joinPoint.getSignature().getReturnType().getSimpleName());
return result;
} catch (Exception e) {
// 记录异常
span.error(e);
span.tag("exception.type", e.getClass().getName());
span.tag("exception.message", e.getMessage());
throw e;
} finally {
// 结束跨度
span.close();
}
}
/**
* 拦截数据库操作
*/
@Around("execution(* org.springframework.data.repository.Repository+.*(..))")
public Object traceRepository(ProceedingJoinPoint joinPoint) throws Throwable {
String methodName = joinPoint.getSignature().getName();
// 只追踪写操作
if (methodName.startsWith("save") || methodName.startsWith("delete") ||
methodName.startsWith("update") || methodName.startsWith("insert")) {
ActiveSpan span = ActiveSpan.createLocalSpan("Repository." + methodName);
try {
return joinPoint.proceed();
} finally {
span.close();
}
}
return joinPoint.proceed();
}
/**
* 拦截外部服务调用
*/
@Around("execution(* com.example.service.*.*(..))")
public Object traceService(ProceedingJoinPoint joinPoint) throws Throwable {
String className = joinPoint.getTarget().getClass().getSimpleName();
String methodName = joinPoint.getSignature().getName();
String operationName = className + "." + methodName;
ActiveSpan span = ActiveSpan.createLocalSpan(operationName);
try {
long start = System.currentTimeMillis();
Object result = joinPoint.proceed();
long duration = System.currentTimeMillis() - start;
// 记录执行时间
span.tag("duration.ms", String.valueOf(duration));
// 慢方法告警
if (duration > 1000) { // 超过1秒
span.tag("slow.method", "true");
log.warn("Slow method detected: {} took {}ms", operationName, duration);
}
return result;
} catch (Exception e) {
span.error(e);
throw e;
} finally {
span.close();
}
}
}
4.3 HTTP客户端拦截器
@Component
public class SkyWalkingHttpInterceptor implements ClientHttpRequestInterceptor {
@Override
public ClientHttpResponse intercept(HttpRequest request, byte[] body,
ClientHttpRequestExecution execution) throws IOException {
// 创建跨服务追踪的载体
ContextCarrier carrier = new ContextCarrier();
// 注入Trace上下文到HTTP头
AbstractSpan span = ContextManager.createExitSpan(
request.getURI().getPath(),
carrier,
request.getURI().getHost() + ":" + request.getURI().getPort()
);
try {
// 设置HTTP头
HttpHeaders headers = request.getHeaders();
CarrierItem next = carrier.items();
while (next.hasNext()) {
next = next.next();
headers.add(next.getHeadKey(), next.getHeadValue());
}
// 添加自定义头
headers.add("X-Trace-Id", TraceContext.traceId());
// 执行请求
ClientHttpResponse response = execution.execute(request, body);
// 记录响应状态
span.tag("http.status", String.valueOf(response.getRawStatusCode()));
span.tag("http.method", request.getMethodValue());
span.tag("http.url", request.getURI().toString());
return response;
} catch (Exception e) {
// 记录错误
span.error(e);
span.tag("http.error", e.getMessage());
throw e;
} finally {
// 结束跨度
ContextManager.stopSpan();
}
}
}
// RestTemplate configuration
@Configuration
public class RestTemplateConfig {

    /**
     * Provides a RestTemplate whose only interceptor is a
     * SkyWalkingHttpInterceptor instance created here.
     */
    @Bean
    public RestTemplate restTemplate() {
        RestTemplate template = new RestTemplate();
        // A fresh RestTemplate has no interceptors, so appending one yields the
        // same single-element chain as replacing the list.
        template.getInterceptors().add(new SkyWalkingHttpInterceptor());
        return template;
    }
}
五、告警配置
5.1 告警规则配置
# config/alarm-settings.yml
# A rule fires when its metric breaks the threshold in `count` of the last
# `period` minutes; `silence-period` suppresses repeats for that many minutes.
rules:
  # Service response time
  service_resp_time_rule:
    metrics-name: service_resp_time
    op: ">"
    threshold: 1000  # response time threshold (milliseconds)
    period: 10  # evaluation window (minutes)
    count: 3  # breaches required to fire
    silence-period: 5  # silence period (minutes)
    message: Response time of service {name} is more than 1000ms in 3 minutes of last 10 minutes.

  # Service success rate
  service_sla_rule:
    metrics-name: service_sla
    op: "<"
    threshold: 80  # success rate threshold (percent)
    period: 10
    count: 2
    silence-period: 3
    message: Successful rate of service {name} is lower than 80% in 2 minutes of last 10 minutes.

  # Service response time percentiles
  service_percentile_rule:
    metrics-name: service_percentile
    op: ">"
    threshold: 1000,1000,1000,1000,1000  # P50,P75,P90,P95,P99
    period: 10
    count: 3
    silence-period: 10
    message: Percentile response time of service {name} alarm.

  # Endpoint response time
  endpoint_resp_time_rule:
    metrics-name: endpoint_resp_time
    op: ">"
    threshold: 1000
    period: 10
    count: 2
    silence-period: 5
    message: Response time of endpoint {name} is more than 1000ms in 2 minutes of last 10 minutes.

  # Slow database access
  database_access_resp_time_rule:
    metrics-name: database_access_resp_time
    threshold: 1000
    op: ">"
    period: 10
    count: 2
    message: Response time of database access {name} is more than 1000ms.

  # JVM old-generation GC count (this rule does not watch heap usage)
  jvm_old_gc_count_rule:
    # Instance-level JVM metrics carry the instance_ prefix
    metrics-name: instance_jvm_old_gc_count
    op: ">"
    threshold: 10
    period: 5
    count: 2
    message: JVM old GC count of service instance {name} is more than 10 times in 2 minutes of last 5 minutes.

  # Custom business metric
  # NOTE(review): business_order_create_error must be a metric the OAP actually
  # produces (e.g. via a custom meter) - confirm it exists.
  business_order_create_rule:
    metrics-name: business_order_create_error
    op: ">"
    threshold: 10
    period: 5
    count: 1
    message: Order create error count is more than 10 in last 5 minutes.

  # Rule restricted to specific endpoints
  # NOTE(review): endpoint_custom is a placeholder metric name - confirm.
  endpoint_custom_rule:
    metrics-name: endpoint_custom
    include-names:
      - /order/create  # only these endpoints are watched
      - /payment/process
    op: ">"
    threshold: 2000
    period: 10
    count: 3
    message: Custom endpoint {name} response time is more than 2000ms.
# Alarm receivers: each URL gets the alarm list as an HTTP POST
webhooks:
  - http://192.168.1.200:8080/webhook  # custom webhook receiver

# NOTE(review): confirm how many %s placeholders the OAP fills in textTemplate;
# if it substitutes only the alarm message, templates with four %s will fail to
# format.

# DingTalk robot
dingtalkHooks:
  textTemplate: |-
    {
      "msgtype": "markdown",
      "markdown": {
        "title": "SkyWalking告警",
        "text": "### SkyWalking告警通知\n- **告警规则**: %s\n- **告警服务**: %s\n- **告警时间**: %s\n- **详细信息**: %s"
      },
      "at": {
        "atMobiles": [
          "13800138000"
        ],
        "isAtAll": false
      }
    }
  webhooks:
    - url: https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN
      secret: YOUR_SECRET

# WeChat Work robot
# NOTE(review): confirm whether wechatHooks.webhooks takes `url:` objects or
# plain URL strings in your SkyWalking version.
wechatHooks:
  textTemplate: |-
    {
      "msgtype": "markdown",
      "markdown": {
        "content": "**SkyWalking告警**\n> **告警规则**: %s\n> **告警服务**: %s\n> **告警时间**: %s\n> **详细信息**: %s"
      }
    }
  webhooks:
    - url: https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=YOUR_KEY

# Feishu robot
feishuHooks:
  textTemplate: |-
    {
      "msg_type": "post",
      "content": {
        "post": {
          "zh_cn": {
            "title": "SkyWalking告警",
            "content": [
              [
                {
                  "tag": "text",
                  "text": "告警规则: %s\n"
                },
                {
                  "tag": "text",
                  "text": "告警服务: %s\n"
                },
                {
                  "tag": "text",
                  "text": "告警时间: %s\n"
                },
                {
                  "tag": "text",
                  "text": "详细信息: %s"
                }
              ]
            ]
          }
        }
      }
    }
  webhooks:
    - url: https://open.feishu.cn/open-apis/bot/v2/hook/YOUR_TOKEN

# Custom webhook receiver with a templated body
# NOTE(review): confirm that `httpHooks` is a section your SkyWalking version
# recognises.
httpHooks:
  textTemplate: |-
    {
      "alarmType": "skywalking",
      "alarmRule": "%s",
      "alarmService": "%s",
      "alarmTime": "%s",
      "alarmDetail": "%s",
      "alarmLevel": "warning"
    }
  webhooks:
    - url: http://192.168.1.201:9090/alerts/receive
      secret: YOUR_SECRET_KEY
5.2 自定义告警接收器
/**
 * Receives SkyWalking alarm webhooks and fans each alarm out to the
 * notification channels.
 */
@RestController
@RequestMapping("/webhook")
@Slf4j
public class SkyWalkingAlertReceiver {

    /**
     * Handles the alarm list posted by the OAP.
     * Mapped on both "/webhook" and "/webhook/receive", so a webhook URL that
     * ends in /webhook reaches this handler as well.
     */
    @PostMapping({"", "/receive"})
    public ResponseEntity<Void> receiveAlert(@RequestBody List<SkyWalkingAlert> alerts) {
        for (SkyWalkingAlert alert : alerts) {
            log.info("Received SkyWalking alert: {}", alert);
            processAlert(alert);
            sendToDifferentChannels(alert);
        }
        return ResponseEntity.ok().build();
    }

    /**
     * Dispatches on the name of the rule that fired. The names are the rule keys
     * defined in alarm-settings.yml.
     */
    private void processAlert(SkyWalkingAlert alert) {
        String ruleName = alert.getRuleName();
        if (ruleName == null) {
            // switch on a null String would throw NullPointerException
            log.warn("Unknown alert type: {}", ruleName);
            return;
        }
        switch (ruleName) {
            case "service_resp_time_rule":
                handleResponseTimeAlert(alert);
                break;
            case "service_sla_rule":
                handleSlaAlert(alert);
                break;
            case "database_access_resp_time_rule":
                handleDatabaseAlert(alert);
                break;
            default:
                log.warn("Unknown alert type: {}", ruleName);
        }
    }

    /** Sends one alarm to every configured channel. */
    private void sendToDifferentChannels(SkyWalkingAlert alert) {
        // 1. e-mail
        sendEmailAlert(alert);
        // 2. SMS, critical alarms only
        if (isCriticalAlert(alert)) {
            sendSmsAlert(alert);
        }
        // 3. instant messaging
        sendImAlert(alert);
        // 4. persist to the database
        saveAlertToDatabase(alert);
    }

    /**
     * One element of the JSON array posted by the SkyWalking webhook.
     */
    @Data
    public static class SkyWalkingAlert {
        private int scopeId;
        // Scope name sent as a plain string (e.g. "SERVICE"), not as a list
        private String scope;
        private String name;
        private String id0;
        private String id1;
        // Key of the rule in alarm-settings.yml that produced this alarm
        private String ruleName;
        private String alarmMessage;
        private long startTime;
        private List<Tag> tags;

        /** Key/value tag attached to the alarm. */
        @Data
        public static class Tag {
            private String key;
            private String value;
        }
    }
}
六、性能优化与最佳实践
6.1 生产环境配置优化
# agent/config/agent.config (production)
# Comments must sit on their own line: in a properties file a '#' after a value
# is part of the value, so "...:100} # note" would not parse as a number.
# Sampling limit (samples per 3 seconds); tune to the traffic volume
agent.sample_n_per_3_secs=${SW_AGENT_SAMPLE:100}
# Buffer settings
buffer.channel_size=${SW_BUFFER_CHANNEL_SIZE:5}
buffer.max_buffer_size=${SW_BUFFER_MAX_SIZE:300}
# Network settings
collector.grpc_channel_check_interval=${SW_GRPC_CHANNEL_CHECK_INTERVAL:30}
collector.get_profile_task_interval=${SW_GET_PROFILE_TASK_INTERVAL:20}
collector.get_agent_dynamic_config_interval=${SW_GET_AGENT_DYNAMIC_CONFIG_INTERVAL:20}
# MongoDB parameter and Elasticsearch DSL recording (off in production)
plugin.mongodb.trace_param=${SW_MONGODB_TRACE_PARAM:false}
plugin.elasticsearch.trace_dsl=${SW_ELASTICSEARCH_TRACE_DSL:false}
# Paths excluded from tracing
plugin.trace.ignore_path=${SW_IGNORE_PATH:/actuator/**,/health,/metrics,/swagger-ui/**,/v2/api-docs}
# MySQL statement recording: parameters off, lengths capped
plugin.mysql.trace_sql_parameters=${SW_MYSQL_TRACE_SQL_PARAMETERS:false}
plugin.mysql.sql_parameters_max_length=${SW_MYSQL_SQL_PARAMETERS_MAX_LENGTH:512}
plugin.mysql.sql_body_max_length=${SW_MYSQL_SQL_BODY_MAX_LENGTH:2048}
6.2 监控指标说明
# SkyWalking监控的关键指标
1. 服务指标:
- service_resp_time: 服务响应时间
- service_sla: 服务成功率
- service_cpm: 每分钟调用次数
- service_apdex: 服务满意度指数
2. 端点指标:
- endpoint_resp_time: 端点响应时间
- endpoint_cpm: 端点每分钟调用次数
- endpoint_sla: 端点成功率
3. 实例指标:
- instance_jvm_cpu: JVM CPU使用率
- instance_jvm_memory: JVM内存使用
- instance_jvm_gc_time: GC时间
- instance_jvm_gc_count: GC次数
4. 拓扑指标:
- service_relation_client_cpm: 服务作为客户端的调用次数
- service_relation_server_cpm: 服务作为服务端的调用次数
- service_relation_client_call_sla: 客户端调用成功率
- service_relation_server_call_sla: 服务端调用成功率
6.3 故障排查
# 1. Check the agent log
tail -f /path/to/logs/skywalking-agent.log
# Common errors:
# - Connection refused: check that the OAP Server is running
# - Unauthorized: check the authentication settings
# - Buffer overflow: increase the buffer sizes
# 2. Check the OAP Server log
tail -f /path/to/logs/skywalking-oap-server.log
# 3. Check the storage connection
# For Elasticsearch:
curl http://localhost:9200/_cluster/health
# 4. Check the UI
curl http://localhost:8080
# 5. Verify the agent loads: runs `java -version` with the agent attached
java -javaagent:/path/to/agent/skywalking-agent.jar -Dskywalking.agent.service_name=test -version
# 6. Inspect the data being sent
# Watch traffic on port 11800 with tcpdump or wireshark
6.4 集群部署配置
# Cluster mode (config/application.yml)
# `selector` picks exactly one of the coordinator blocks below.
cluster:
  selector: ${SW_CLUSTER:kubernetes}
  kubernetes:
    # Uses its own variable: SW_NAMESPACE is already the Elasticsearch index
    # namespace in the storage section, and sharing it would couple the two.
    namespace: ${SW_CLUSTER_K8S_NAMESPACE:default}
    labelSelector: ${SW_LABEL:app=skywalking}
    uidEnvName: ${SW_UID:SKYWALKING_COLLECTOR_UID}
  # Alternative coordinator: Zookeeper
  zookeeper:
    hostPort: ${SW_CLUSTER_ZK_HOST_PORT:localhost:2181}
    sessionTimeout: ${SW_CLUSTER_ZK_SESSION_TIMEOUT:100000}
# OAP Server memory tuning
# Edit bin/oapService.sh: fixed 4 GB heap, G1 collector, 100 ms pause-time goal
JAVA_OPTS="-Xms4g -Xmx4g -XX:+UseG1GC -XX:MaxGCPauseMillis=100"
# Load balancing across several OAP instances
# Port 11800 speaks gRPC (HTTP/2). `proxy_pass http://` forwards HTTP/1.x and
# cannot carry it, so use the gRPC proxy module with an HTTP/2 listener.
upstream skywalking_oap {
    server 192.168.1.100:11800;
    server 192.168.1.101:11800;
    server 192.168.1.102:11800;
}
server {
    listen 11800 http2;
    location / {
        grpc_pass grpc://skywalking_oap;
    }
}
七、与其他系统集成
7.1 与Prometheus集成
# Add to config/application.yml.
# prometheus-fetcher pulls metrics FROM Prometheus-format endpoints INTO
# SkyWalking; it is not an exporter and does not serve /metrics.
# NOTE(review): confirm this module still exists in your SkyWalking version.
prometheus-fetcher:
  selector: ${SW_PROMETHEUS_FETCHER:default}
  default:
    active: ${SW_PROMETHEUS_FETCHER_ACTIVE:true}
    duration: ${SW_PROMETHEUS_FETCHER_DURATION:60}

# To let Prometheus scrape the OAP Server's own metrics, enable the telemetry
# module's Prometheus provider:
telemetry:
  selector: ${SW_TELEMETRY:prometheus}
  prometheus:
    host: ${SW_TELEMETRY_PROMETHEUS_HOST:0.0.0.0}
    port: ${SW_TELEMETRY_PROMETHEUS_PORT:1234}
# Metrics endpoint: http://localhost:1234/metrics
7.2 与Grafana集成
1. 添加SkyWalking数据源:
   - Type: SkyWalking
   - URL: http://skywalking-oap:12800
   - Version: v9
2. 导入SkyWalking Dashboard:
   - Dashboard ID: 13199 (官方模板)
7.3 与Spring Cloud集成
// Spring Cloud Gateway integration
/**
 * Global gateway filter that continues an incoming trace and records an entry
 * span for the routed request.
 *
 * <p>Uses the application toolkit ({@code Tracer}, {@code SpanRef},
 * {@code ContextCarrierRef}); the agent-core ContextManager/ContextCarrier
 * classes are not intended for application code.
 * NOTE(review): the agent ships an optional Spring Cloud Gateway plugin; if it
 * is enabled this filter duplicates its work - confirm which one you want.
 */
@Component
public class SkyWalkingGatewayFilter implements GlobalFilter, Ordered {

    @Override
    public Mono<Void> filter(ServerWebExchange exchange, GatewayFilterChain chain) {
        ServerHttpRequest request = exchange.getRequest();
        HttpHeaders headers = request.getHeaders();

        // Copy the propagation headers of the incoming request into the carrier
        ContextCarrierRef carrier = new ContextCarrierRef();
        CarrierItemRef next = carrier.items();
        while (next.hasNext()) {
            next = next.next();
            String value = headers.getFirst(next.getHeadKey());
            if (value != null) {
                next.setHeadValue(value);
            }
        }

        // Entry span continuing the upstream trace
        SpanRef span = Tracer.createEntrySpan(request.getPath().value(), carrier);
        ActiveSpan.tag("url", request.getURI().toString());
        ActiveSpan.tag("http.method", request.getMethodValue());

        // The reactive chain completes on another thread, where stopSpan() would
        // not find this span. Keep the span open for asynchronous completion and
        // release it from the current thread now.
        span.prepareForAsync();
        Tracer.stopSpan();

        Mono<Void> downstream;
        try {
            downstream = chain.filter(exchange);
        } catch (RuntimeException e) {
            // Assembly failed synchronously: finish the span before propagating
            span.asyncFinish();
            throw e;
        }
        // Finish the span when the exchange terminates, whatever the signal
        return downstream.doFinally(signalType -> span.asyncFinish());
    }

    /** Runs before every other filter so the span covers the whole exchange. */
    @Override
    public int getOrder() {
        return Ordered.HIGHEST_PRECEDENCE;
    }
}
八、总结
SkyWalking作为国产优秀的APM系统,具有以下优势:
- 无侵入性:通过Java Agent实现字节码增强
- 性能损耗低:平均性能损耗约3-5%
- 功能全面:支持链路追踪、性能监控、日志收集、告警等
- 扩展性强:支持多语言、多存储、多部署方式
- 社区活跃:Apache顶级项目,持续更新
通过合理的配置和集成,SkyWalking可以成为微服务架构中不可或缺的可观测性组件。