Chat LangChain故障排查指南
·
Chat LangChain故障排查指南
【免费下载链接】chat-langchain 项目地址: https://gitcode.com/GitHub_Trending/ch/chat-langchain
常见问题与解决方案
1. API密钥验证失败
症状: 模型调用返回认证错误
解决方案:
- 检查环境变量是否设置正确
- 验证API密钥是否有足够配额
- 检查网络连接和代理设置
2. 数据库连接问题
症状: 应用启动失败,数据库连接错误
解决方案:
- 检查数据库服务是否运行
- 验证连接字符串格式
- 检查防火墙和网络策略
3. 模型响应超时
症状: 请求长时间无响应
解决方案:
- 检查模型服务状态
- 调整超时设置
- 启用故障转移机制
4. 内存泄漏
症状: 内存使用持续增长
解决方案:
- 检查是否有未关闭的连接
- 分析内存使用模式
- 考虑增加内存限制或优化代码
诊断命令
# 检查服务状态
docker-compose ps
# 查看应用日志
docker logs chat-langchain-prod
# 检查数据库连接
docker exec chat-postgres psql -U chatuser -d chatlangchain -c "\l"
# 检查Redis状态
docker exec chat-redis redis-cli ping
# 监控资源使用
docker stats chat-langchain-prod
紧急恢复步骤
1. 备份当前状态
   docker-compose exec chat-postgres pg_dump -U chatuser chatlangchain > backup.sql
2. 回滚到上一个版本
   docker-compose down
   git checkout previous-tag
   docker-compose up -d
3. 临时禁用功能
- 修改配置临时禁用故障模型
- 降低请求频率限制
- 启用维护模式
### 验证步骤:日志分析与监控告警
配置日志分析脚本 `scripts/log_analyzer.py`:
```python
import json
import re
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from typing import Dict, List, Optional, Tuple
class LogAnalyzer:
    """Analyze an application log stored as one JSON object per line.

    The whole file is parsed once at construction time and kept in
    ``self.logs``; every analysis method works on that in-memory list.

    Entries are plain dicts. The keys read by this class are ``timestamp``
    (ISO-8601 string), ``level``, ``error``, ``endpoint``,
    ``processing_time``, ``user_id``, ``operation`` and ``model``; all of
    them are optional.
    """

    # A request slower than this many seconds is reported as slow.
    SLOW_REQUEST_SECONDS = 5.0
    # Maximum number of slow requests returned by detect_performance_issues().
    MAX_SLOW_REQUESTS = 10
    # Overall error ratio above which check_anomalies() raises a flag.
    ERROR_RATE_THRESHOLD = 0.1
    # Average processing time (seconds) above which check_anomalies() flags.
    AVG_RESPONSE_TIME_THRESHOLD = 3.0
    # An error seen more often than this is flagged as frequent.
    FREQUENT_ERROR_COUNT = 10

    def __init__(self, log_file: str):
        """Load and parse ``log_file``.

        Args:
            log_file: Path to a JSON-lines log file.

        Raises:
            OSError: If the file cannot be opened.
        """
        self.log_file = log_file
        self.logs = self._load_logs()

    def _load_logs(self) -> List[Dict]:
        """Read the log file and return its JSON-object lines.

        Lines that are not valid JSON, and valid JSON values that are not
        objects (numbers, strings, arrays), are skipped so that the analysis
        methods can rely on every entry being a dict.
        """
        logs = []
        # Explicit UTF-8 keeps parsing independent of the platform locale;
        # undecodable bytes are replaced instead of aborting the whole load.
        with open(self.log_file, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                try:
                    log_entry = json.loads(line.strip())
                except json.JSONDecodeError:
                    continue
                if isinstance(log_entry, dict):
                    logs.append(log_entry)
        return logs

    @staticmethod
    def _parse_timestamp(log: Dict) -> Optional[datetime]:
        """Return the entry's timestamp as an aware UTC datetime, or None.

        None is returned when the ``timestamp`` key is missing, is not a
        string, or cannot be parsed. Timestamps without an offset are
        interpreted as UTC.
        """
        raw = log.get('timestamp')
        if not isinstance(raw, str) or not raw:
            return None
        # datetime.fromisoformat() only accepts a trailing "Z" from
        # Python 3.11 on, so spell the offset out explicitly.
        if raw.endswith(('Z', 'z')):
            raw = raw[:-1] + '+00:00'
        try:
            parsed = datetime.fromisoformat(raw)
        except ValueError:
            return None
        if parsed.tzinfo is None:
            return parsed.replace(tzinfo=timezone.utc)
        return parsed.astimezone(timezone.utc)

    @staticmethod
    def _processing_time(log: Dict) -> Optional[float]:
        """Return the entry's numeric ``processing_time``, or None.

        Missing and non-numeric values (including booleans) yield None so
        that callers never compare or sum incompatible types.
        """
        value = log.get('processing_time')
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return None
        return value

    def analyze_error_patterns(self, time_window: int = 3600) -> Dict:
        """Count recent ERROR entries by error type and by endpoint.

        Args:
            time_window: Size of the look-back window in seconds, measured
                from the current UTC time.

        Returns:
            A dict with ``error_types`` and ``error_endpoints`` (name ->
            count), ``total_errors`` and a human-readable ``time_window``.
            Entries without a usable timestamp are ignored.
        """
        window_start = datetime.now(timezone.utc) - timedelta(seconds=time_window)
        errors_by_type = defaultdict(int)
        errors_by_endpoint = defaultdict(int)
        for log in self.logs:
            if log.get('level') != 'ERROR':
                continue
            log_time = self._parse_timestamp(log)
            if log_time is None or log_time < window_start:
                continue
            errors_by_type[log.get('error', 'unknown')] += 1
            errors_by_endpoint[log.get('endpoint', 'unknown')] += 1
        return {
            'error_types': dict(errors_by_type),
            'error_endpoints': dict(errors_by_endpoint),
            'total_errors': sum(errors_by_type.values()),
            'time_window': f'{time_window} seconds'
        }

    def detect_performance_issues(self) -> List[Dict]:
        """Return the slowest requests, slowest first.

        Only entries whose ``processing_time`` exceeds
        ``SLOW_REQUEST_SECONDS`` are considered, and at most
        ``MAX_SLOW_REQUESTS`` of them are returned.
        """
        slow_requests = []
        for log in self.logs:
            processing_time = self._processing_time(log)
            if processing_time is None or processing_time <= self.SLOW_REQUEST_SECONDS:
                continue
            slow_requests.append({
                'timestamp': log.get('timestamp'),
                'endpoint': log.get('endpoint', 'unknown'),
                'processing_time': processing_time,
                'user_id': log.get('user_id'),
                'operation': log.get('operation')
            })
        slow_requests.sort(key=lambda x: x['processing_time'], reverse=True)
        return slow_requests[:self.MAX_SLOW_REQUESTS]

    def generate_daily_report(self) -> Dict:
        """Summarize the entries whose timestamp falls on today's UTC date.

        Returns:
            A dict with request totals, success/failure counts, the mean
            ``processing_time`` (0 when no entry carries one), per-model
            counts in ``top_models`` and per-error counts in
            ``error_summary``. Entries without a usable timestamp are
            ignored.
        """
        today = datetime.now(timezone.utc).date()
        daily_stats = {
            'total_requests': 0,
            'successful_requests': 0,
            'failed_requests': 0,
            'average_response_time': 0,
            'top_models': defaultdict(int),
            'error_summary': defaultdict(int)
        }
        response_times = []
        for log in self.logs:
            log_time = self._parse_timestamp(log)
            if log_time is None or log_time.date() != today:
                continue
            daily_stats['total_requests'] += 1
            if log.get('level') == 'ERROR':
                daily_stats['failed_requests'] += 1
                daily_stats['error_summary'][log.get('error', 'unknown')] += 1
            else:
                daily_stats['successful_requests'] += 1
            processing_time = self._processing_time(log)
            if processing_time is not None:
                response_times.append(processing_time)
            if 'model' in log:
                daily_stats['top_models'][log['model']] += 1
        if response_times:
            daily_stats['average_response_time'] = sum(response_times) / len(response_times)
        return daily_stats

    def check_anomalies(self) -> List[str]:
        """Return human-readable warnings for suspicious patterns.

        Checks, over all loaded entries: the overall error rate, the average
        processing time, and any single error that repeats more than
        ``FREQUENT_ERROR_COUNT`` times.
        """
        anomalies = []
        error_rate = self._calculate_error_rate()
        if error_rate > self.ERROR_RATE_THRESHOLD:
            anomalies.append(f"高错误率: {error_rate:.2%}")
        avg_response_time = self._calculate_avg_response_time()
        if avg_response_time > self.AVG_RESPONSE_TIME_THRESHOLD:
            anomalies.append(f"响应时间异常: {avg_response_time:.2f}秒")
        common_errors = self._find_common_errors()
        for error, count in common_errors.items():
            if count > self.FREQUENT_ERROR_COUNT:
                anomalies.append(f"频繁错误: {error} (出现{count}次)")
        return anomalies

    def _calculate_error_rate(self) -> float:
        """Return the fraction of loaded entries at level ERROR (0.0 if empty)."""
        total = len(self.logs)
        if total == 0:
            return 0.0
        errors = sum(1 for log in self.logs if log.get('level') == 'ERROR')
        return errors / total

    def _calculate_avg_response_time(self) -> float:
        """Return the mean numeric ``processing_time`` (0.0 if none present)."""
        response_times = [
            value
            for value in map(self._processing_time, self.logs)
            if value is not None
        ]
        if not response_times:
            return 0.0
        return sum(response_times) / len(response_times)

    def _find_common_errors(self) -> Dict[str, int]:
        """Return a mapping of error name to occurrence count for ERROR entries."""
        error_counts = defaultdict(int)
        for log in self.logs:
            if log.get('level') == 'ERROR':
                error_counts[log.get('error', 'unknown')] += 1
        return dict(error_counts)
# Usage example: run every analysis against the application log and print
# the results.
if __name__ == "__main__":
    analyzer = LogAnalyzer("logs/application.log")

    # Each report is computed immediately before it is printed, in this
    # order: error patterns, slow requests, daily summary.
    for heading, run_analysis in (
        ("错误分析:", analyzer.analyze_error_patterns),
        ("性能问题:", analyzer.detect_performance_issues),
        ("每日报告:", analyzer.generate_daily_report),
    ):
        print(heading, json.dumps(run_analysis(), indent=2, ensure_ascii=False))

    # Anomaly warnings are only printed when at least one was found.
    anomalies = analyzer.check_anomalies()
    if anomalies:
        print("检测到异常:")
        for anomaly in anomalies:
            print(f" - {anomaly}")
【免费下载链接】chat-langchain 项目地址: https://gitcode.com/GitHub_Trending/ch/chat-langchain
更多推荐



所有评论(0)