34.3 企業級監控與維護
學習如何建立企業級監控和維護體系,確保 Claude Code 在生產環境中的穩定執行和持續最佳化。
34.3.1 監控體系概述
監控的重要性
企業級監控對於 Claude Code 部署至關重要,它可以幫助:
- 確保可用性:及時發現和解決服務中斷
- 最佳化效能:識別效能瓶頸並最佳化資源使用
- 安全防護:檢測異常行為和安全威脅
- 成本控制:監控使用情況和資源消耗
- 合規審計:滿足企業合規要求
監控維度
python
# 企业级监控维度
MONITORING_DIMENSIONS = {
"可用性监控": {
"指标": ["服务状态", "响应时间", "错误率"],
"目标": "99.9% 可用性"
},
"性能监控": {
"指标": ["API 延迟", "令牌使用", "并发连接"],
"目标": "P95 延迟 < 2s"
},
"资源监控": {
"指标": ["CPU 使用率", "内存使用", "磁盘 I/O", "网络带宽"],
"目标": "资源利用率 < 80%"
},
"安全监控": {
"指标": ["异常访问", "权限违规", "数据泄露"],
"目标": "零安全事件"
},
"成本监控": {
"指标": ["API 调用成本", "令牌成本", "基础设施成本"],
"目标": "成本控制在预算内"
}
}
34.3.2 指標收集
Prometheus 配置
yaml
# prometheus.yml
global:
  scrape_interval: 15s      # default interval between scrapes of each target
  evaluation_interval: 15s  # how often alerting/recording rules are evaluated

# Load the alerting rules. Prometheus only evaluates rules from files listed
# here; without this entry no alert is ever sent to Alertmanager.
# NOTE(review): the path is resolved relative to this file — adjust it if the
# rules file lives elsewhere.
rule_files:
  - 'alert_rules.yml'
scrape_configs:
# Claude Code API 监控
- job_name: 'claude-code-api'
static_configs:
- targets: ['localhost:8080']
metrics_path: '/metrics'
scrape_interval: 10s
# LLM 网关监控
- job_name: 'llm-gateway'
static_configs:
- targets: ['localhost:4000']
metrics_path: '/metrics'
scrape_interval: 10s
# 开发容器监控
- job_name: 'dev-containers'
static_configs:
- targets: ['localhost:9323']
metrics_path: '/metrics'
scrape_interval: 30s
# 沙箱监控
- job_name: 'sandbox'
static_configs:
- targets: ['localhost:9100']
metrics_path: '/metrics'
scrape_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets: ['localhost:9093']
自定義指標匯出器
python
# claude_code_exporter.py
from prometheus_client import start_http_server, Gauge, Counter, Histogram
import time
import json
import requests
from datetime import datetime
# Metric definitions published by this exporter.

# Request count, labelled by endpoint and HTTP status.
api_requests_total = Counter(
    name='claude_code_api_requests_total',
    documentation='Total API requests',
    labelnames=['endpoint', 'status'],
)

# Request latency distribution in seconds, labelled by endpoint.
api_latency = Histogram(
    name='claude_code_api_latency_seconds',
    documentation='API request latency',
    labelnames=['endpoint'],
)

# Current number of active sessions.
active_sessions = Gauge(
    name='claude_code_active_sessions',
    documentation='Number of active sessions',
)

# Token count, labelled by model and token type.
tokens_used = Counter(
    name='claude_code_tokens_used_total',
    documentation='Total tokens used',
    labelnames=['model', 'type'],
)

# Cost in USD as last reported.
cost_incurred = Gauge(
    name='claude_code_cost_usd',
    documentation='Total cost incurred in USD',
)
class ClaudeCodeMetricsCollector:
    """Poll the Claude Code API and publish what it reports as Prometheus metrics.

    The metric objects updated here (``active_sessions``, ``tokens_used``,
    ``cost_incurred``, ``api_latency``, ``api_requests_total``) are the
    module-level metrics defined next to this class.
    """

    def __init__(self, api_base_url='http://localhost:8080', timeout=5):
        """Create a collector.

        Args:
            api_base_url: Base URL of the API to poll.
            timeout: Seconds to wait for each HTTP request. Without a timeout
                a single hung endpoint would block the collection loop forever.
        """
        self.api_base_url = api_base_url
        self.timeout = timeout
        self.start_time = datetime.now()

    def collect_api_metrics(self):
        """Read ``/health`` and update the session, token and cost metrics.

        Non-200 responses are ignored. Any error is printed and swallowed so
        that one failed poll does not stop the collector loop.
        """
        try:
            response = requests.get(
                f'{self.api_base_url}/health', timeout=self.timeout
            )
            if response.status_code != 200:
                return
            data = response.json()

            active_sessions.set(data.get('active_sessions', 0))

            # NOTE(review): this adds the reported numbers to the counters on
            # every poll. If the endpoint reports cumulative totals rather
            # than usage since the previous poll, the counters will be
            # inflated — confirm the endpoint's semantics.
            for model, count in data.get('tokens_used', {}).items():
                for token_type in ('input', 'output'):
                    tokens_used.labels(model=model, type=token_type).inc(
                        count.get(token_type, 0)
                    )

            cost_incurred.set(data.get('total_cost', 0.0))
        except Exception as e:
            # Best-effort boundary of a long-running loop: report and carry on.
            print(f"Error collecting API metrics: {e}")

    def collect_performance_metrics(self):
        """Time one ``/health`` request and record its latency and status."""
        try:
            start_time = time.time()
            response = requests.get(
                f'{self.api_base_url}/health', timeout=self.timeout
            )
            latency = time.time() - start_time

            api_latency.labels(endpoint='/health').observe(latency)
            # Label values are strings; convert explicitly so the alert
            # expression's status=~"5.." matcher sees the expected text.
            api_requests_total.labels(
                endpoint='/health',
                status=str(response.status_code),
            ).inc()
        except Exception as e:
            print(f"Error collecting performance metrics: {e}")

    def collect_sandbox_metrics(self):
        """Read ``/sandbox/status``.

        The violation count is fetched but not exported as a metric yet.
        """
        try:
            response = requests.get(
                f'{self.api_base_url}/sandbox/status', timeout=self.timeout
            )
            if response.status_code == 200:
                data = response.json()
                # TODO: export this value. The alert rules and the dashboard
                # query `claude_sandbox_violations_total`, which nothing in
                # this exporter publishes.
                violations = data.get('violations', 0)  # noqa: F841
        except Exception as e:
            print(f"Error collecting sandbox metrics: {e}")

    def run(self, interval=10, port=9100):
        """Start the metrics HTTP server and collect forever.

        Args:
            interval: Seconds to sleep between collection rounds.
            port: Port the metrics endpoint listens on.
        """
        start_http_server(port)
        print(f"Metrics server started on port {port}")
        while True:
            self.collect_api_metrics()
            self.collect_performance_metrics()
            self.collect_sandbox_metrics()
            time.sleep(interval)
if __name__ == '__main__':
collector = ClaudeCodeMetricsCollector()
collector.run()
日誌收集配置
yaml
# filebeat.yml
filebeat.inputs:
- type: log
enabled: true
paths:
- /var/log/claude-code/*.log
fields:
service: claude-code
environment: production
fields_under_root: true
- type: log
enabled: true
paths:
- /var/log/llm-gateway/*.log
fields:
service: llm-gateway
environment: production
fields_under_root: true
- type: log
enabled: true
paths:
- /var/log/claude-sandbox/*.log
fields:
service: claude-sandbox
environment: production
fields_under_root: true
output.elasticsearch:
  hosts: ["elasticsearch:9200"]
  index: "claude-code-%{+yyyy.MM.dd}"

# A custom index name must be accompanied by a matching template name and
# pattern; Filebeat rejects the configuration when `index` is overridden
# without them.
setup.template.name: "claude-code"
setup.template.pattern: "claude-code-*"
# NOTE(review): where index lifecycle management is enabled by default, the
# custom index above may be ignored unless ILM is disabled — confirm for the
# Filebeat version in use.
setup.kibana:
host: "kibana:5601"
processors:
- add_host_metadata: ~
- add_cloud_metadata: ~
34.3.3 告警配置
Prometheus 告警規則
yaml
# alert_rules.yml
groups:
- name: claude_code_alerts
interval: 30s
rules:
# 服务可用性告警
- alert: ClaudeCodeServiceDown
expr: up{job="claude-code-api"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Claude Code 服务不可用"
description: "Claude Code API 服务已宕机超过 1 分钟"
# API error-rate alert.
# Both sides are aggregated with sum() so the ratio is "5xx requests / all
# requests". Without aggregation the division matches series label-for-label,
# so each 5xx series is only divided by itself and the result is always 1.
- alert: HighAPIErrorRate
  expr: |
    sum(rate(claude_code_api_requests_total{status=~"5.."}[5m])) /
    sum(rate(claude_code_api_requests_total[5m])) > 0.05
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "API 错误率过高"
    description: "API 错误率超过 5% (当前: {{ $value }})"
# API 延迟告警
- alert: HighAPILatency
expr: |
histogram_quantile(0.95,
rate(claude_code_api_latency_seconds_bucket[5m])
) > 2
for: 5m
labels:
severity: warning
annotations:
summary: "API 延迟过高"
description: "API P95 延迟超过 2 秒 (当前: {{ $value }}s)"
# Token usage alert.
# increase() yields the growth over the whole 1h window, which is what the
# 100,000-per-hour threshold in the description refers to. rate() would give a
# per-second value and the threshold would effectively never be reached.
# The comparison is evaluated per series (per model and token type).
- alert: HighTokenUsage
  expr: |
    increase(claude_code_tokens_used_total[1h]) > 100000
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "令牌使用率过高"
    description: "令牌使用率超过 100,000/小时 (当前: {{ $value }})"
# 成本告警
- alert: HighCostIncurred
expr: claude_code_cost_usd > 1000
for: 1h
labels:
severity: warning
annotations:
summary: "成本超过阈值"
description: "累计成本超过 $1000 (当前: ${{ $value }})"
# Sandbox violation alert.
# rate() is per second; multiply by 60 so the value matches the per-minute
# threshold stated in the description.
# NOTE(review): `claude_sandbox_violations_total` is not published by the
# custom exporter shown in this chapter — confirm which component exports it.
- alert: SandboxViolations
  expr: |
    rate(claude_sandbox_violations_total[5m]) * 60 > 10
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "沙箱违规频繁"
    description: "沙箱违规率超过 10/分钟 (当前: {{ $value }})"
# 资源使用告警
- alert: HighCPUUsage
expr: |
rate(process_cpu_seconds_total{job="claude-code-api"}[5m]) > 0.8
for: 10m
labels:
severity: warning
annotations:
summary: "CPU 使用率过高"
description: "CPU 使用率超过 80% (当前: {{ $value }})"
- alert: HighMemoryUsage
expr: |
process_resident_memory_bytes{job="claude-code-api"} /
node_memory_MemTotal_bytes > 0.8
for: 10m
labels:
severity: warning
annotations:
summary: "内存使用率过高"
description: "内存使用率超过 80% (当前: {{ $value }})"
Alertmanager 配置
yaml
# alertmanager.yml
global:
resolve_timeout: 5m
route:
group_by: ['alertname', 'severity']
group_wait: 10s
group_interval: 10s
repeat_interval: 12h
receiver: 'default'
routes:
- match:
severity: critical
receiver: 'critical-alerts'
continue: false
- match:
severity: warning
receiver: 'warning-alerts'
continue: false
receivers:
- name: 'default'
email_configs:
- to: 'team@company.com'
from: 'alerts@company.com'
smarthost: 'smtp.company.com:587'
auth_username: 'alerts@company.com'
auth_password: 'password'
- name: 'critical-alerts'
email_configs:
- to: 'oncall@company.com'
from: 'alerts@company.com'
smarthost: 'smtp.company.com:587'
auth_username: 'alerts@company.com'
auth_password: 'password'
slack_configs:
- api_url: 'https://hooks.slack.com/services/XXX/YYY/ZZZ'
channel: '#critical-alerts'
title: 'Claude Code Critical Alert'
text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
- name: 'warning-alerts'
email_configs:
- to: 'dev-team@company.com'
from: 'alerts@company.com'
smarthost: 'smtp.company.com:587'
auth_username: 'alerts@company.com'
auth_password: 'password'
slack_configs:
- api_url: 'https://hooks.slack.com/services/XXX/YYY/ZZZ'
channel: '#warnings'
title: 'Claude Code Warning'
text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname']
34.3.4 視覺化儀表板
Grafana 儀表板配置
json
{
"dashboard": {
"title": "Claude Code Enterprise Dashboard",
"panels": [
{
"title": "API 请求速率",
"targets": [
{
"expr": "rate(claude_code_api_requests_total[5m])",
"legendFormat": "{{ endpoint }}"
}
],
"type": "graph"
},
{
"title": "API 延迟 (P95)",
"targets": [
{
"expr": "histogram_quantile(0.95, rate(claude_code_api_latency_seconds_bucket[5m]))",
"legendFormat": "P95"
}
],
"type": "graph"
},
{
"title": "活跃会话数",
"targets": [
{
"expr": "claude_code_active_sessions",
"legendFormat": "Sessions"
}
],
"type": "stat"
},
{
"title": "令牌使用率",
"targets": [
{
"expr": "rate(claude_code_tokens_used_total[1h])",
"legendFormat": "{{ model }} - {{ type }}"
}
],
"type": "graph"
},
{
"title": "累计成本",
"targets": [
{
"expr": "claude_code_cost_usd",
"legendFormat": "Cost (USD)"
}
],
"type": "stat"
},
{
  "title": "API 错误率",
  "targets": [
    {
      "expr": "sum(rate(claude_code_api_requests_total{status=~\"5..\"}[5m])) / sum(rate(claude_code_api_requests_total[5m]))",
      "legendFormat": "Error Rate"
    }
  ],
  "type": "graph"
},
{
"title": "沙箱违规",
"targets": [
{
"expr": "rate(claude_sandbox_violations_total[5m])",
"legendFormat": "Violations/min"
}
],
"type": "graph"
},
{
"title": "资源使用",
"targets": [
{
"expr": "rate(process_cpu_seconds_total{job=\"claude-code-api\"}[5m])",
"legendFormat": "CPU"
},
{
"expr": "process_resident_memory_bytes{job=\"claude-code-api\"} / 1024 / 1024 / 1024",
"legendFormat": "Memory (GB)"
}
],
"type": "graph"
}
]
}
}
34.3.5 日誌分析
ELK Stack 配置
python
# log_analyzer.py
import elasticsearch
from elasticsearch import Elasticsearch
from datetime import datetime, timedelta
import json
class ClaudeCodeLogAnalyzer:
    """Run a few canned queries against the Claude Code log indices."""

    def __init__(self, es_host='http://localhost:9200'):
        """Connect to Elasticsearch at ``es_host``."""
        self.es = Elasticsearch([es_host])
        self.index_pattern = 'claude-code-*'

    @staticmethod
    def _time_ago(**delta):
        """Return the instant ``timedelta(**delta)`` before now as ISO-8601 UTC.

        The value carries an explicit UTC offset. A naive local timestamp
        would be read by Elasticsearch as UTC and shift the query window by
        the machine's UTC offset.
        """
        # Local import: the file header only imports datetime and timedelta.
        from datetime import timezone
        return (datetime.now(timezone.utc) - timedelta(**delta)).isoformat()

    def _search(self, query):
        """Execute ``query`` against the configured index pattern."""
        return self.es.search(index=self.index_pattern, body=query)

    def search_errors(self, hours=24):
        """Return hits whose ``level`` is ERROR within the last ``hours`` hours.

        NOTE(review): no ``size`` is set, so only the server's default page of
        hits is returned — the length of the result is not a total count.
        """
        query = {
            "query": {
                "bool": {
                    "must": [
                        {"match": {"level": "ERROR"}},
                        {"range": {
                            "@timestamp": {"gte": self._time_ago(hours=hours)}
                        }},
                    ]
                }
            }
        }
        return self._search(query)['hits']['hits']

    def search_slow_requests(self, threshold_seconds=2, hours=24):
        """Return hits with ``latency >= threshold_seconds`` in the last ``hours`` hours."""
        query = {
            "query": {
                "bool": {
                    "must": [
                        {"range": {"latency": {"gte": threshold_seconds}}},
                        {"range": {
                            "@timestamp": {"gte": self._time_ago(hours=hours)}
                        }},
                    ]
                }
            }
        }
        return self._search(query)['hits']['hits']

    def analyze_user_activity(self, user_id, days=7):
        """Return the raw response of a per-day histogram for one user.

        The histogram buckets documents by day and sums ``tokens_used`` in
        each bucket.
        """
        query = {
            "query": {
                "bool": {
                    "must": [
                        {"match": {"user_id": user_id}},
                        {"range": {
                            "@timestamp": {"gte": self._time_ago(days=days)}
                        }},
                    ]
                }
            },
            "aggs": {
                "daily_requests": {
                    "date_histogram": {
                        "field": "@timestamp",
                        "calendar_interval": "day",
                    },
                    "aggs": {
                        "total_tokens": {"sum": {"field": "tokens_used"}}
                    },
                }
            },
        }
        return self._search(query)

    def _average_request_count(self, time_range):
        """Return the average ``request_count`` in ``time_range``, or None.

        A field-based average is used instead of a script so documents that
        lack the field are skipped rather than failing the request.
        """
        query = {
            "query": {"range": {"@timestamp": time_range}},
            "aggs": {"avg_rate": {"avg": {"field": "request_count"}}},
        }
        return self._search(query)['aggregations']['avg_rate']['value']

    def detect_anomalies(self, hours=1):
        """Compare the last ``hours`` hours with the ``hours`` before them.

        Returns:
            ``{"anomaly": False}`` when the current average is not more than
            twice the baseline, or when either window has no data. Otherwise
            a dict with ``anomaly`` set to True plus ``avg_rate``,
            ``current_rate`` and ``threshold``.
        """
        baseline = self._average_request_count({
            "gte": self._time_ago(hours=hours * 2),
            "lt": self._time_ago(hours=hours),
        })
        current = self._average_request_count({
            "gte": self._time_ago(hours=hours),
        })

        # The average is None when no document matched; comparing None with a
        # number would raise TypeError, so treat "no data" as "no anomaly".
        if baseline is None or current is None:
            return {"anomaly": False}

        threshold = baseline * 2
        if current > threshold:
            return {
                "anomaly": True,
                "avg_rate": baseline,
                "current_rate": current,
                "threshold": threshold,
            }
        return {"anomaly": False}
# 使用示例
analyzer = ClaudeCodeLogAnalyzer()
# 搜索错误
errors = analyzer.search_errors(hours=24)
print(f"发现 {len(errors)} 个错误")
# 搜索慢请求
slow_requests = analyzer.search_slow_requests(threshold_seconds=2, hours=24)
print(f"发现 {len(slow_requests)} 个慢请求")
# 分析用户活动
user_activity = analyzer.analyze_user_activity(user_id="user123", days=7)
# 检测异常
anomalies = analyzer.detect_anomalies(hours=1)
if anomalies['anomaly']:
print(f"检测到异常!当前速率: {anomalies['current_rate']}, 阈值: {anomalies['threshold']}")
34.3.6 維護策略
定期維護任務
bash
#!/bin/bash
# maintenance.sh
# Routine maintenance: rotate and prune logs, back up the configuration,
# clear the cache and write a short report next to the configuration backup.
set -e
LOG_DIR="/var/log/claude-code"
BACKUP_DIR="/backup/claude-code"
DATE=$(date +%Y-%m-%d)
echo "=== Claude Code 维护脚本 - $DATE ==="
# 1. Rotate logs
echo "执行日志轮转..."
logrotate -f /etc/logrotate.d/claude-code
# 2. Delete log files older than 30 days (regular files only)
echo "清理 30 天前的日志..."
find "$LOG_DIR" -type f -name "*.log" -mtime +30 -delete
# 3. Back up the configuration
echo "备份配置文件..."
mkdir -p "$BACKUP_DIR/$DATE"
cp -r /etc/claude-code "$BACKUP_DIR/$DATE/"
# 4. Clear the cache
echo "清理缓存..."
rm -rf /tmp/claude-code-cache/*
# 5. Database maintenance (if a database is used)
echo "执行数据库维护..."
# psql -U claude -d claude_code -c "VACUUM ANALYZE;"
# 6. Write the maintenance report
# NOTE(review): the report states that database maintenance completed even
# while the command above is commented out.
echo "生成维护报告..."
cat > "$BACKUP_DIR/$DATE/maintenance-report.txt" << EOF
Claude Code 维护报告
日期: $DATE
日志轮转: 完成
旧日志清理: 完成
配置备份: 完成
缓存清理: 完成
数据库维护: 完成
磁盘使用情况:
$(df -h "$LOG_DIR")
服务状态:
$(systemctl status claude-code --no-pager)
EOF
echo "维护完成!报告已保存到 $BACKUP_DIR/$DATE/maintenance-report.txt"
健康檢查指令碼
python
# health_check.py
import requests
import json
import sys
from datetime import datetime
class ClaudeCodeHealthChecker:
    """Run a set of health checks and summarise them in a report.

    Each check appends a dict with ``name``, ``status`` (``"healthy"`` or
    ``"unhealthy"``) and ``details`` to ``self.checks`` and returns True when
    the check passed.
    """

    def __init__(self, api_base_url='http://localhost:8080',
                 gateway_url='http://localhost:4000'):
        """Create a checker.

        Args:
            api_base_url: Base URL of the Claude Code API.
            gateway_url: Base URL of the LLM gateway.
        """
        self.api_base_url = api_base_url
        self.gateway_url = gateway_url
        self.checks = []

    def _record(self, name, healthy, details):
        """Append one check result and return ``healthy``."""
        self.checks.append({
            "name": name,
            "status": "healthy" if healthy else "unhealthy",
            "details": details,
        })
        return healthy

    def _check_http(self, name, url, timeout=5):
        """GET ``url``; healthy means HTTP 200 with a JSON body."""
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code != 200:
                return self._record(
                    name, False, f"Status code: {response.status_code}"
                )
            return self._record(name, True, response.json())
        except Exception as e:
            # Any failure (connection error, timeout, invalid JSON) means the
            # service is not healthy; record the reason instead of raising.
            return self._record(name, False, str(e))

    def _check_usage(self, name, percent, threshold):
        """Record a usage check; healthy when ``percent`` is below ``threshold``."""
        if percent < threshold:
            return self._record(name, True, f"Usage: {percent:.1f}%")
        return self._record(
            name, False, f"Usage: {percent:.1f}% (Threshold: {threshold}%)"
        )

    def check_api_health(self):
        """Check the API ``/health`` endpoint."""
        return self._check_http("API Health", f'{self.api_base_url}/health')

    def check_llm_gateway(self):
        """Check the LLM gateway ``/health`` endpoint."""
        return self._check_http("LLM Gateway", f'{self.gateway_url}/health')

    def check_sandbox(self):
        """Check the API ``/sandbox/status`` endpoint."""
        return self._check_http(
            "Sandbox", f'{self.api_base_url}/sandbox/status'
        )

    def check_disk_space(self, threshold=90, path='/'):
        """Check that disk usage of ``path`` is below ``threshold`` percent."""
        import shutil
        usage = shutil.disk_usage(path)
        percent = (usage.used / usage.total) * 100
        return self._check_usage("Disk Space", percent, threshold)

    def check_memory(self, threshold=90):
        """Check that memory usage is below ``threshold`` percent (needs psutil)."""
        import psutil
        percent = psutil.virtual_memory().percent
        return self._check_usage("Memory", percent, threshold)

    def run_all_checks(self):
        """Run every check and return the list of results.

        Results from a previous run are discarded first, so calling this
        repeatedly on one instance does not accumulate duplicate entries.
        """
        self.checks = []
        self.check_api_health()
        self.check_llm_gateway()
        self.check_sandbox()
        self.check_disk_space()
        self.check_memory()
        return self.checks

    def generate_report(self):
        """Return a dict with timestamp, overall status and individual results.

        The overall status is ``"unhealthy"`` if any recorded check is.
        """
        any_unhealthy = any(
            check['status'] == 'unhealthy' for check in self.checks
        )
        return {
            "timestamp": datetime.now().isoformat(),
            "overall_status": "unhealthy" if any_unhealthy else "healthy",
            "checks": self.checks,
        }

    def print_report(self):
        """Print the report and return True when the overall status is healthy."""
        report = self.generate_report()
        print("=" * 50)
        print("Claude Code 健康检查报告")
        print(f"时间: {report['timestamp']}")
        print(f"整体状态: {report['overall_status'].upper()}")
        print("=" * 50)
        for check in report['checks']:
            status_icon = "✓" if check['status'] == 'healthy' else "✗"
            print(f"{status_icon} {check['name']}: {check['status']}")
            print(f" 详情: {check['details']}")
        print()
        return report['overall_status'] == 'healthy'
if __name__ == '__main__':
checker = ClaudeCodeHealthChecker()
checker.run_all_checks()
is_healthy = checker.print_report()
sys.exit(0 if is_healthy else 1)
34.3.7 災難恢復
備份策略
bash
#!/bin/bash
# backup.sh
# Create a timestamped backup of configuration, logs and user data, write a
# manifest, and remove backups older than 30 days.
set -e
BACKUP_DIR="/backup/claude-code"
DATE=$(date +%Y-%m-%d_%H-%M-%S)
BACKUP_PATH="$BACKUP_DIR/$DATE"
echo "=== Claude Code 备份脚本 - $DATE ==="
# Create the backup directory
mkdir -p "$BACKUP_PATH"
# 1. Back up configuration files
echo "备份配置文件..."
tar -czf "$BACKUP_PATH/config.tar.gz" /etc/claude-code
# 2. Back up the database
echo "备份数据库..."
# pg_dump -U claude claude_code > $BACKUP_PATH/database.sql
# 3. Back up logs
echo "备份日志..."
tar -czf "$BACKUP_PATH/logs.tar.gz" /var/log/claude-code
# 4. Back up user data
echo "备份用户数据..."
tar -czf "$BACKUP_PATH/user-data.tar.gz" /var/lib/claude-code
# 5. Write the backup manifest
echo "生成备份清单..."
cat > "$BACKUP_PATH/manifest.txt" << EOF
备份清单
日期: $DATE
配置文件: config.tar.gz
数据库: database.sql
日志: logs.tar.gz
用户数据: user-data.tar.gz
文件大小:
$(du -sh "$BACKUP_PATH"/*)
EOF
# 6. Upload to remote storage (optional)
echo "上传到远程存储..."
# aws s3 cp $BACKUP_PATH s3://company-backups/claude-code/$DATE --recursive
# 7. Remove old backups (keep the last 30 days)
# Only the direct children of BACKUP_DIR are considered. An unrestricted
# `find ... -exec rm -rf {} \;` tries to descend into directories it has just
# deleted, exits non-zero, and `set -e` then aborts the script.
echo "清理旧备份..."
find "$BACKUP_DIR" -mindepth 1 -maxdepth 1 -type d -mtime +30 -exec rm -rf {} +
echo "备份完成!备份位置: $BACKUP_PATH"
恢復指令碼
bash
#!/bin/bash
# restore.sh
# Restore configuration and user data from a backup directory, then restart
# the service and verify that it is running.
set -e
if [ -z "$1" ]; then
    echo "用法: $0 <备份目录>"
    exit 1
fi
BACKUP_PATH="$1"
echo "=== Claude Code 恢复脚本 ==="
echo "备份目录: $BACKUP_PATH"
# Check that the archives exist before touching the running service.
# Otherwise a wrong path would stop the service and then abort under
# `set -e`, leaving it down.
for archive in config.tar.gz user-data.tar.gz; do
    if [ ! -f "$BACKUP_PATH/$archive" ]; then
        echo "缺少备份文件: $BACKUP_PATH/$archive" >&2
        exit 1
    fi
done
# 1. Stop the service
echo "停止服务..."
systemctl stop claude-code
# 2. Restore configuration files
echo "恢复配置文件..."
tar -xzf "$BACKUP_PATH/config.tar.gz" -C /
# 3. Restore the database
echo "恢复数据库..."
# psql -U claude -d claude_code < $BACKUP_PATH/database.sql
# 4. Restore user data
echo "恢复用户数据..."
tar -xzf "$BACKUP_PATH/user-data.tar.gz" -C /
# 5. Start the service
echo "启动服务..."
systemctl start claude-code
# 6. Verify the restore
echo "验证恢复..."
sleep 5
if systemctl is-active --quiet claude-code; then
    echo "服务启动成功!"
else
    echo "服务启动失败!"
    exit 1
fi
echo "恢复完成!"
34.3.8 效能最佳化
快取策略
python
# cache_manager.py
import redis
import json
from datetime import datetime, timedelta
class CacheManager:
    """Redis-backed cache for API responses, model responses and token counts.

    Cached payloads are stored as JSON strings; token counts are kept under a
    per-user, per-day key.
    """

    def __init__(self, redis_host='localhost', redis_port=6379):
        """Open a Redis client that decodes responses to ``str``."""
        self.redis = redis.Redis(
            host=redis_host, port=redis_port, decode_responses=True
        )

    def _put_json(self, key, payload, ttl):
        """Serialise ``payload`` and store it under ``key`` for ``ttl`` seconds."""
        self.redis.setex(key, ttl, json.dumps(payload))

    def _get_json(self, key):
        """Return the decoded payload at ``key``, or None when absent or empty."""
        raw = self.redis.get(key)
        if not raw:
            return None
        return json.loads(raw)

    @staticmethod
    def _token_key(user_id):
        """Build today's token-count key for ``user_id``."""
        today = datetime.now().strftime('%Y-%m-%d')
        return f"tokens:{user_id}:{today}"

    @staticmethod
    def _model_key(model, prompt_hash):
        """Build the cache key for a model/prompt pair."""
        return f"model:{model}:{prompt_hash}"

    def cache_api_response(self, key, response, ttl=3600):
        """Cache an API response under ``key`` for ``ttl`` seconds."""
        self._put_json(key, response, ttl)

    def get_cached_response(self, key):
        """Return the cached API response for ``key``, or None."""
        return self._get_json(key)

    def cache_token_count(self, user_id, count, ttl=86400):
        """Add ``count`` to today's total for ``user_id`` and set the key's TTL."""
        token_key = self._token_key(user_id)
        self.redis.incrby(token_key, count)
        self.redis.expire(token_key, ttl)

    def get_token_count(self, user_id):
        """Return today's token total for ``user_id`` (0 when nothing is stored)."""
        stored = self.redis.get(self._token_key(user_id))
        if not stored:
            return 0
        return int(stored)

    def cache_model_response(self, model, prompt_hash, response, ttl=7200):
        """Cache a model response for the given model and prompt hash."""
        self._put_json(self._model_key(model, prompt_hash), response, ttl)

    def get_cached_model_response(self, model, prompt_hash):
        """Return the cached model response, or None."""
        return self._get_json(self._model_key(model, prompt_hash))
# 使用示例
cache = CacheManager()
# 缓存 API 响应
cache.cache_api_response("api:user:123:profile", {"name": "John"}, ttl=3600)
# 获取缓存的响应
cached = cache.get_cached_response("api:user:123:profile")
負載均衡配置
nginx
# nginx.conf
upstream claude_code_backend {
least_conn;
server claude-code-1:8080 weight=3;
server claude-code-2:8080 weight=2;
server claude-code-3:8080 weight=1;
keepalive 32;
}
server {
listen 80;
server_name claude-code.company.com;
# 重定向到 HTTPS
return 301 https://$server_name$request_uri;
}
server {
listen 443 ssl http2;
server_name claude-code.company.com;
ssl_certificate /etc/nginx/ssl/claude-code.crt;
ssl_certificate_key /etc/nginx/ssl/claude-code.key;
# SSL 配置
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers HIGH:!aNULL:!MD5;
ssl_prefer_server_ciphers on;
# 日志
access_log /var/log/nginx/claude-code-access.log;
error_log /var/log/nginx/claude-code-error.log;
# Proxy configuration
location / {
    proxy_pass http://claude_code_backend;

    # Upstream keepalive connections are only reused when the proxied request
    # uses HTTP/1.1 and the Connection header is cleared; without these two
    # lines the `keepalive 32` setting on the upstream has no effect.
    proxy_http_version 1.1;
    proxy_set_header Connection "";

    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;

    # Timeouts
    proxy_connect_timeout 60s;
    proxy_send_timeout 60s;
    proxy_read_timeout 60s;

    # Buffering
    proxy_buffering on;
    proxy_buffer_size 4k;
    proxy_buffers 8 4k;
    proxy_busy_buffers_size 8k;

    # Active health checks
    # NOTE(review): `health_check` is provided by NGINX Plus and also needs a
    # shared-memory `zone` in the upstream block. On open-source nginx this
    # line makes the configuration fail to load — remove it there and rely on
    # passive checks (`max_fails` / `fail_timeout` on the upstream servers).
    health_check interval=10s fails=3 passes=2;
}
# 健康检查端点
location /health {
proxy_pass http://claude_code_backend/health;
access_log off;
}
}
34.3.9 小結
本節介紹了企業級監控和維護的各個方面,包括:
- 監控體系概述和監控維度
- 指標收集(Prometheus、自定義匯出器)
- 告警配置(Prometheus、Alertmanager)
- 視覺化儀表板(Grafana)
- 日誌分析(ELK Stack)
- 維護策略(定期維護、健康檢查)
- 災難恢復(備份和恢復)
- 效能最佳化(快取、負載均衡)
透過建立完善的監控和維護體系,企業可以確保 Claude Code 在生產環境中的穩定執行,及時發現和解決問題,最佳化效能和成本控制。