512 lines
14 KiB
Bash
512 lines
14 KiB
Bash
#!/bin/bash
|
||
|
||
# ---------------------------------------------------------------------------
# OOM monitoring settings
# ---------------------------------------------------------------------------
MONITOR_INTERVAL=30        # seconds between two monitoring passes
OOM_KEYWORDS=(             # strings searched for in catalina.out
    "java.lang.OutOfMemoryError"
    "OutOfMemoryError"
    "java.lang.OutOfMemory"
)
MAX_RESTART_ATTEMPTS=3     # maximum number of restart attempts
RESTART_COOLDOWN=300       # restart cool-down (seconds)

# Memory threshold (tune to the actual deployment). RSS above this value is
# considered excessive memory usage.
MEMORY_THRESHOLD_GB=6
MEMORY_THRESHOLD_KB=$((MEMORY_THRESHOLD_GB * 1024 * 1024))   # same value in KB

# Log and status files
LOG_FILE="/home/chenwujian2/script/logs/oom_monitor.log"
STATUS_FILE="/home/chenwujian2/script/logs/oom_status.json"

readonly MONITOR_INTERVAL OOM_KEYWORDS MAX_RESTART_ATTEMPTS RESTART_COOLDOWN
readonly MEMORY_THRESHOLD_GB MEMORY_THRESHOLD_KB LOG_FILE STATUS_FILE

# Tomcat instances: key -> base directory. The instance lives in
# "<base directory>/tomcat<key>".
# NOTE(review): the keys are called "ports", but values above 65535 cannot be
# TCP ports; they presumably act as instance ids only. Confirm, because the
# netstat fallback in get_tomcat_pid matches on this number.
declare -A TOMCAT_INSTANCES
TOMCAT_INSTANCES[90105]="/home/chenwujian"
TOMCAT_INSTANCES[90106]="/home/chenwujian"
TOMCAT_INSTANCES[90107]="/home/chenwujian2"
TOMCAT_INSTANCES[90108]="/home/chenwujian2"

# ANSI colour sequences (stored with a literal backslash; expanded on output)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'
readonly RED GREEN YELLOW BLUE CYAN NC
|
||
|
||
# Print the installation directory of one Tomcat instance.
# Globals:   TOMCAT_INSTANCES (read)
# Arguments: $1 - instance key as used in TOMCAT_INSTANCES
# Outputs:   "<base directory>/tomcat<key>" on stdout
get_tomcat_path() {
    local instance=$1
    printf '%s/tomcat%s\n' "${TOMCAT_INSTANCES[$instance]}" "$instance"
}
|
||
|
||
# Print every monitored instance key on a single, space-separated line.
# Globals: TOMCAT_INSTANCES (read)
# Outputs: the keys of TOMCAT_INSTANCES on stdout
get_monitor_ports() {
    local -a keys=("${!TOMCAT_INSTANCES[@]}")
    echo "${keys[@]}"
}
|
||
|
||
# Logging helpers
#
# log: print a message in green on stdout and append the uncoloured line to
# the log file.
# Globals:   GREEN, NC, LOG_FILE (read)
# Arguments: $1 - message text, printed verbatim
log() {
    local message="$1"
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    # %b expands the escape sequences stored in the colour constants only.
    # The message goes through %s, so backslashes coming from quoted log
    # lines are shown as-is instead of being interpreted as escapes.
    printf '%b[%s] %s%b\n' "$GREEN" "$timestamp" "$message" "$NC"
    printf '[%s] %s\n' "$timestamp" "$message" >> "$LOG_FILE"
}
|
||
|
||
# warn: print a warning in yellow on stdout and append the uncoloured line to
# the log file.
# Globals:   YELLOW, NC, LOG_FILE (read)
# Arguments: $1 - message text, printed verbatim
warn() {
    local message="$1"
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    # Colours are expanded with %b; the message is passed through %s so that
    # backslashes in it are never interpreted as escape sequences.
    printf '%b[%s] 警告: %s%b\n' "$YELLOW" "$timestamp" "$message" "$NC"
    printf '[%s] 警告: %s\n' "$timestamp" "$message" >> "$LOG_FILE"
}
|
||
|
||
# error: print an error in red on stdout and append the uncoloured line to
# the log file.
# Globals:   RED, NC, LOG_FILE (read)
# Arguments: $1 - message text, printed verbatim
error() {
    local message="$1"
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    # Colours are expanded with %b; the message is passed through %s so that
    # backslashes in it are never interpreted as escape sequences.
    printf '%b[%s] 错误: %s%b\n' "$RED" "$timestamp" "$message" "$NC"
    printf '[%s] 错误: %s\n' "$timestamp" "$message" >> "$LOG_FILE"
}
|
||
|
||
# info: print a message in blue on stdout and append the uncoloured line to
# the log file.
# Globals:   BLUE, NC, LOG_FILE (read)
# Arguments: $1 - message text, printed verbatim
info() {
    local message="$1"
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    # Colours are expanded with %b; the message is passed through %s so that
    # backslashes in it are never interpreted as escape sequences.
    printf '%b[%s] %s%b\n' "$BLUE" "$timestamp" "$message" "$NC"
    printf '[%s] %s\n' "$timestamp" "$message" >> "$LOG_FILE"
}
|
||
|
||
# Prepare the files the monitor writes to.
# Creates the directories of LOG_FILE and STATUS_FILE and, when the status
# file does not exist yet, seeds it with the start time and an empty
# restart_records object. An existing status file is left untouched.
# Globals: LOG_FILE, STATUS_FILE (read)
init_status_file() {
    local log_dir status_dir
    log_dir=$(dirname "$LOG_FILE")
    status_dir=$(dirname "$STATUS_FILE")

    # The directories must exist before anything is written into them.
    mkdir -p "$log_dir" "$status_dir"

    if [ ! -f "$STATUS_FILE" ]; then
        printf '{\n    "monitor_start_time": "%s",\n    "restart_records": {}\n}\n' \
            "$(date '+%Y-%m-%d %H:%M:%S')" > "$STATUS_FILE"
    fi
}
|
||
|
||
# Print the PID of the JVM that runs a Tomcat instance.
# Arguments: $1 - instance key / port
# Outputs:   at most one PID on stdout; an empty line when nothing is found
get_tomcat_pid() {
    local port=$1
    local tomcat_path ps_out pid
    tomcat_path=$(get_tomcat_path "$port")

    # Method 1: find the java process by its catalina.base argument.
    # The process list is captured first so the filter never sees itself.
    # The character after the path must be a space, a slash or the end of the
    # line; otherwise ".../tomcat8080" would also match ".../tomcat80801".
    ps_out=$(ps -ef)
    pid=$(awk -v want="catalina.base=${tomcat_path}" '
        /java/ {
            pos = index($0, want)
            if (pos == 0) next
            tail_char = substr($0, pos + length(want), 1)
            if (tail_char == "" || tail_char == " " || tail_char == "/") {
                print $2
                exit
            }
        }' <<< "$ps_out")

    # Method 2: fall back to the owner of the listening socket. Only the
    # first numeric owner is reported, so an IPv4 and an IPv6 listener of the
    # same process do not yield the PID twice.
    if [ -z "$pid" ]; then
        pid=$(netstat -tlnp 2>/dev/null | awk -v suffix=":${port}" '
            {
                start = length($4) - length(suffix) + 1
                if (start < 1 || substr($4, start) != suffix) next
                split($7, owner, "/")
                if (owner[1] ~ /^[0-9]+$/) {
                    print owner[1]
                    exit
                }
            }')
    fi

    echo "$pid"
}
|
||
|
||
# Tell whether a Tomcat instance currently has a live process.
# Arguments: $1 - instance key / port
# Returns:   0 when get_tomcat_pid reports a PID that ps can see, 1 otherwise
is_tomcat_running() {
    local port=$1
    local found_pid
    found_pid=$(get_tomcat_pid "$port")

    [ -n "$found_pid" ] || return 1
    ps -p "$found_pid" > /dev/null 2>&1 || return 1
    return 0
}
|
||
|
||
# Look for an application-level OutOfMemoryError in catalina.out.
# Only the last 1000 lines of the log are inspected.
# Globals:   OOM_KEYWORDS (read)
# Arguments: $1 - instance key / port
# Returns:   0 when a keyword is found (the last matching line is logged),
#            1 when the log is missing or contains no keyword
check_oom_error() {
    local port=$1
    local tomcat_path log_file recent keyword oom_line
    tomcat_path=$(get_tomcat_path "$port")
    log_file="${tomcat_path}/logs/catalina.out"

    [ -f "$log_file" ] || return 1

    # Read the tail once, so detection and the reported line come from the
    # same snapshot even while Tomcat keeps writing to the file.
    recent=$(tail -n 1000 "$log_file" 2>/dev/null) || return 1

    for keyword in "${OOM_KEYWORDS[@]}"; do
        # -F: keywords are literal strings, not regular expressions.
        oom_line=$(grep -iF -- "$keyword" <<< "$recent" | tail -n 1)
        if [ -n "$oom_line" ]; then
            log "检测到Tomcat $port OOM错误: $oom_line"
            return 0
        fi
    done

    return 1
}
|
||
|
||
# Look in the kernel log for an OOM-killer record about this instance's PID.
# Arguments: $1 - instance key / port
# Returns:   0 when a matching record exists (the last one is logged),
#            1 otherwise
check_dmesg_oom() {
    local port=$1
    local pid oom_info
    pid=$(get_tomcat_pid "$port")

    # Only a single numeric PID can be matched reliably.
    [[ "$pid" =~ ^[0-9]+$ ]] || return 1

    # The PID must directly follow "Killed process" and end there; matching
    # its digits anywhere on the line would also hit the memory figures
    # (total-vm, anon-rss, ...) of unrelated kills.
    # NOTE(review): assumes the kernel prints "Killed process <pid> (<comm>)";
    # confirm against the kernels in use.
    # dmesg may be restricted to root; that is treated as "no record".
    oom_info=$(dmesg 2>/dev/null \
        | grep -iE "killed process ${pid}[^0-9].*java" \
        | tail -n 1)

    if [ -n "$oom_info" ]; then
        log "检测到系统OOM Killer杀死了Tomcat $port (PID: $pid): $oom_info"
        return 0
    fi

    return 1
}
|
||
|
||
# Report the memory usage of an instance and compare its RSS with the
# configured threshold.
# Globals:   MEMORY_THRESHOLD_KB, MEMORY_THRESHOLD_GB (read)
# Arguments: $1 - instance key / port
# Returns:   0 when RSS exceeds MEMORY_THRESHOLD_KB, 1 otherwise (including
#            when no PID or no java row is found)
check_memory_usage() {
    local port=$1
    local java_pid
    java_pid=$(get_tomcat_pid "$port")
    [ -n "$java_pid" ] || return 1

    # Row layout requested from ps: pid rss vsz pmem comm
    local ps_row
    ps_row=$(ps -o pid,rss,vsz,pmem,comm -p "$java_pid" 2>/dev/null | grep java)
    [ -n "$ps_row" ] || return 1

    local resident_kb virtual_kb
    read -r _ resident_kb virtual_kb _ <<< "$ps_row"

    # Integer division: the GB figures are rounded down.
    local resident_gb=$((resident_kb / 1024 / 1024))
    local virtual_gb=$((virtual_kb / 1024 / 1024))

    # Always record the current usage; this alone never triggers a restart.
    info "Tomcat $port 内存使用: RSS=${resident_gb}GB (${resident_kb}KB) VSZ=${virtual_gb}GB (${virtual_kb}KB)"

    if [ "$resident_kb" -gt "$MEMORY_THRESHOLD_KB" ]; then
        warn "Tomcat $port 内存使用过高: RSS=${resident_gb}GB (超过${MEMORY_THRESHOLD_GB}GB阈值)"
        return 0
    fi

    return 1
}
|
||
|
||
# Restart one Tomcat instance.
# Steps: run shutdown.sh, wait, kill -9 whatever is left, back up and clear
# catalina.out for genuine OOM reasons, run startup.sh, wait, verify.
# Arguments: $1 - instance key / port
#            $2 - reason text; "应用层OOM错误" and "系统OOM Killer" trigger
#                 the catalina.out backup
# Returns:   0 when the instance is running after the restart, 1 when a
#            script is missing, startup.sh fails or the instance is not
#            running afterwards
restart_tomcat() {
    local port=$1
    local reason="$2"
    local tomcat_path pid log_file backup_name
    tomcat_path=$(get_tomcat_path "$port")

    log "准备重启Tomcat $port,原因: $reason"
    log "Tomcat路径: $tomcat_path"

    # Stop Tomcat
    if [ ! -f "${tomcat_path}/bin/shutdown.sh" ]; then
        error "Tomcat $port 停止脚本不存在: ${tomcat_path}/bin/shutdown.sh"
        return 1
    fi
    info "停止Tomcat $port..."
    if sh "${tomcat_path}/bin/shutdown.sh"; then
        log "Tomcat $port 停止命令已发送"
    else
        warn "Tomcat $port 停止过程中可能出现问题"
    fi

    # Give the shutdown some time
    sleep 5

    # Force-kill whatever survived the shutdown request
    pid=$(get_tomcat_pid "$port")
    if [ -n "$pid" ]; then
        warn "强制杀死Tomcat $port 残留进程: $pid"
        kill -9 "$pid" 2>/dev/null
        sleep 2
    fi

    # Back up the log only for genuine OOM reasons
    if [ "$reason" = "应用层OOM错误" ] || [ "$reason" = "系统OOM Killer" ]; then
        log_file="${tomcat_path}/logs/catalina.out"
        if [ -f "$log_file" ]; then
            backup_name="${log_file}.oom.$(date +%Y%m%d_%H%M%S)"
            if cp "$log_file" "$backup_name"; then
                log "OOM日志已备份: $backup_name"
                # Clear the original only once the copy exists; otherwise
                # the OOM evidence would be lost.
                : > "$log_file"
            else
                warn "OOM日志备份失败,保留原日志: $log_file"
            fi
        fi
    fi

    # Start Tomcat
    if [ ! -f "${tomcat_path}/bin/startup.sh" ]; then
        error "Tomcat $port 启动脚本不存在"
        return 1
    fi
    info "启动Tomcat $port..."
    if ! sh "${tomcat_path}/bin/startup.sh"; then
        error "Tomcat $port 启动失败"
        return 1
    fi
    log "Tomcat $port 启动命令已执行"

    # Give the JVM time to come up, then verify
    sleep 10
    if is_tomcat_running "$port"; then
        log "Tomcat $port 重启成功"
        return 0
    fi

    error "Tomcat $port 启动后未运行"
    return 1
}
|
||
|
||
# Record one restart attempt.
# With jq available the entry is appended to .restart_records[<port>] in the
# status file; without jq, or when the update fails, a plain line is appended
# to the log file instead so the record is never lost silently.
# Globals:   STATUS_FILE, LOG_FILE (read)
# Arguments: $1 - instance key / port
#            $2 - restart reason
#            $3 - outcome, stored as the string given (e.g. "true"/"false")
update_restart_record() {
    local port=$1
    local reason="$2"
    local success="$3"

    local timestamp temp_file
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    temp_file="${STATUS_FILE}.tmp"

    if command -v jq >/dev/null 2>&1; then
        # Write to a temporary file first; the status file is only replaced
        # when jq produced a complete result.
        if jq --arg port "$port" \
              --arg timestamp "$timestamp" \
              --arg reason "$reason" \
              --arg success "$success" \
              '.restart_records[$port] += [{"time": $timestamp, "reason": $reason, "success": $success}]' \
              "$STATUS_FILE" > "$temp_file" && mv "$temp_file" "$STATUS_FILE"; then
            return 0
        fi
        # jq or mv failed (e.g. corrupted status file): drop the partial
        # output and fall through to the text record.
        rm -f "$temp_file"
    fi

    # Plain-text record
    echo "重启记录 - 端口: $port, 时间: $timestamp, 原因: $reason, 成功: $success" >> "$LOG_FILE"
}
|
||
|
||
# Decide whether an instance may be restarted right now.
# State is kept in /tmp/tomcat_<port>_restart_{time,count,date}.
# Globals:   RESTART_COOLDOWN, MAX_RESTART_ATTEMPTS (read)
# Arguments: $1 - instance key / port
# Returns:   0 when a restart is allowed; 1 while the cool-down is running
#            or when today's restart limit has been reached
can_restart() {
    local port=$1
    local current_time
    current_time=$(date +%s)

    # Cool-down check
    local restart_time_file="/tmp/tomcat_${port}_restart_time"
    local last_restart time_diff remaining
    if [ -f "$restart_time_file" ]; then
        last_restart=$(cat "$restart_time_file")
        # An empty or corrupted state file counts as "never restarted"
        # instead of breaking the arithmetic below.
        [[ "$last_restart" =~ ^[0-9]+$ ]] || last_restart=0
        time_diff=$((current_time - 10#$last_restart))

        if [ "$time_diff" -lt "$RESTART_COOLDOWN" ]; then
            remaining=$((RESTART_COOLDOWN - time_diff))
            warn "Tomcat $port 在冷却期内,${remaining}秒后可重启"
            return 1
        fi
    fi

    # Restart counter
    local restart_count_file="/tmp/tomcat_${port}_restart_count"
    local restart_count=0
    if [ -f "$restart_count_file" ]; then
        restart_count=$(cat "$restart_count_file")
        [[ "$restart_count" =~ ^[0-9]+$ ]] || restart_count=0
    fi

    # The counter is reset when the stored date is not today
    local today last_date
    today=$(date +%Y%m%d)
    local count_date_file="/tmp/tomcat_${port}_restart_date"
    if [ -f "$count_date_file" ]; then
        last_date=$(cat "$count_date_file")
        if [ "$last_date" != "$today" ]; then
            restart_count=0
            echo "$restart_count" > "$restart_count_file"
            echo "$today" > "$count_date_file"
        fi
    else
        echo "$today" > "$count_date_file"
    fi

    if [ "$restart_count" -ge "$MAX_RESTART_ATTEMPTS" ]; then
        error "Tomcat $port 今日重启次数已达上限(${MAX_RESTART_ATTEMPTS}次),不再自动重启"
        return 1
    fi

    return 0
}
|
||
|
||
# Bump the restart counter of an instance and stamp the restart time.
# Writes /tmp/tomcat_<port>_restart_count (previous value + 1, starting from
# 0 when the file is absent) and /tmp/tomcat_<port>_restart_time (epoch
# seconds).
# Arguments: $1 - instance key / port
update_restart_count() {
    local port=$1
    local counter_path="/tmp/tomcat_${port}_restart_count"
    local previous=0

    [ -f "$counter_path" ] && previous=$(cat "$counter_path")

    echo "$((previous + 1))" > "$counter_path"
    date +%s > "/tmp/tomcat_${port}_restart_time"
}
|
||
|
||
# Run one monitoring pass for a single Tomcat instance.
# The checks run in priority order: OOM in catalina.out, OOM-killer record
# in dmesg, RSS above the threshold. The first hit triggers a restart when
# can_restart allows it.
# Arguments: $1 - instance key / port
monitor_tomcat() {
    local port=$1

    # An instance that is not running is only reported, never started here.
    if ! is_tomcat_running "$port"; then
        warn "Tomcat $port 未运行"
        return
    fi

    local oom_reason=""
    if check_oom_error "$port"; then
        oom_reason="应用层OOM错误"
    elif check_dmesg_oom "$port"; then
        oom_reason="系统OOM Killer"
    elif check_memory_usage "$port"; then
        oom_reason="内存使用过高"
    fi

    [ -n "$oom_reason" ] || return 0

    warn "检测到Tomcat $port 需要重启: $oom_reason"
    can_restart "$port" || return 0

    info "开始重启Tomcat $port..."
    if restart_tomcat "$port" "$oom_reason"; then
        log "Tomcat $port 重启成功"
        update_restart_record "$port" "$oom_reason" "true"
    else
        error "Tomcat $port 重启失败"
        update_restart_record "$port" "$oom_reason" "false"
    fi

    # Count every attempt, failed ones included. Otherwise a restart that
    # keeps failing would bypass both the cool-down and the daily limit and
    # be retried on every monitoring pass.
    update_restart_count "$port"
}
|
||
|
||
# Print the monitor configuration followed by one status line per instance.
# Globals: MONITOR_INTERVAL, MEMORY_THRESHOLD_GB, LOG_FILE, STATUS_FILE,
#          TOMCAT_INSTANCES, GREEN, RED, NC (read)
show_monitor_status() {
    log "=== Tomcat OOM监控状态 ==="
    log "监控节点: $(get_monitor_ports | tr '\n' ' ')"
    log "检查间隔: ${MONITOR_INTERVAL}秒"
    log "内存阈值: ${MEMORY_THRESHOLD_GB}GB"
    log "日志文件: $LOG_FILE"
    log "状态文件: $STATUS_FILE"
    echo

    # All loop state is local so a caller's own variables are not clobbered.
    local port base_path pid rss_kb rss_gb
    # Word splitting of the key list is intended here.
    for port in $(get_monitor_ports); do
        base_path="${TOMCAT_INSTANCES[$port]}"
        if is_tomcat_running "$port"; then
            pid=$(get_tomcat_pid "$port")
            # Last line of the ps output is the RSS value in KB
            rss_kb=$(ps -o rss -p "$pid" 2>/dev/null | tail -n 1)
            rss_kb=${rss_kb//[[:space:]]/}
            # The process may vanish between the two calls; show 0 rather
            # than failing on a non-numeric value.
            [[ "$rss_kb" =~ ^[0-9]+$ ]] || rss_kb=0
            rss_gb=$((10#$rss_kb / 1024 / 1024))
            echo -e "${GREEN}Tomcat $port: 运行中 (PID: $pid, 内存: ~${rss_gb}GB, 路径: ${base_path}/tomcat${port})${NC}"
        else
            echo -e "${RED}Tomcat $port: 未运行 (路径: ${base_path}/tomcat${port})${NC}"
        fi
    done
    echo
}
|
||
|
||
# Main monitoring loop: print the configuration and the initial status, then
# check every instance once per MONITOR_INTERVAL seconds. Does not return.
# Globals: MONITOR_INTERVAL, MEMORY_THRESHOLD_GB (read)
start_monitor() {
    log "启动Tomcat OOM监控服务..."
    log "监控端口: $(get_monitor_ports | tr '\n' ' ')"
    log "检查间隔: ${MONITOR_INTERVAL}秒"
    log "内存阈值: ${MEMORY_THRESHOLD_GB}GB"
    log "按 Ctrl+C 停止监控"
    echo

    # Initial status
    show_monitor_status

    # The loop variable is local so it cannot clobber a global of that name.
    local port
    while true; do
        # Word splitting of the key list is intended here.
        for port in $(get_monitor_ports); do
            monitor_tomcat "$port"
        done

        # Wait for the next pass
        sleep "$MONITOR_INTERVAL"
    done
}
|
||
|
||
# Stop running monitor processes.
# Sends SIGTERM to every process whose command line matches
# "tomcat.*oom.*monitor", except the shell executing this function. If this
# script's own command line matches the pattern, a blanket pkill would
# terminate the caller as well, and "restart" would never reach
# start_monitor.
# Returns: always 0
stop_monitor() {
    log "停止Tomcat OOM监控服务..."

    local pids pid
    pids=$(pgrep -f "tomcat.*oom.*monitor")
    # One PID per word; splitting is intended.
    for pid in $pids; do
        if [ "$pid" = "$$" ] || [ "$pid" = "$BASHPID" ]; then
            continue
        fi
        # The process may already be gone; that is not an error.
        kill "$pid" 2>/dev/null || true
    done
    return 0
}
|
||
|
||
# Print the command-line help together with the active configuration.
# Globals: TOMCAT_INSTANCES, MONITOR_INTERVAL, MEMORY_THRESHOLD_GB, LOG_FILE
#          (read)
show_usage() {
    # The loop variable is local so printing the help never clobbers a
    # caller's variable of the same name.
    local port base_path

    echo "使用方法: $0 {start|stop|status|restart|once}"
    echo
    echo "命令说明:"
    echo " start 启动OOM监控服务"
    echo " stop 停止OOM监控服务"
    echo " status 查看监控状态"
    echo " restart 重启监控服务"
    echo " once 执行一次检查(不持续监控)"
    echo
    echo "监控配置:"
    echo " 监控端口和路径:"
    # Word splitting of the key list is intended here.
    for port in $(get_monitor_ports); do
        base_path="${TOMCAT_INSTANCES[$port]}"
        echo " $port -> ${base_path}/tomcat${port}"
    done
    echo " 检查间隔: ${MONITOR_INTERVAL}秒"
    echo " 内存阈值: ${MEMORY_THRESHOLD_GB}GB"
    echo " 日志文件: $LOG_FILE"
}
|
||
|
||
# Run a single monitoring pass over all instances, then return.
run_once() {
    # The loop variable is local so it cannot clobber a global of that name.
    local port

    log "执行单次OOM检查..."
    # Word splitting of the key list is intended here.
    for port in $(get_monitor_ports); do
        monitor_tomcat "$port"
    done
    log "单次检查完成"
}
|
||
|
||
# Command dispatcher.
# Arguments: $1 - start | stop | status | restart | once | help | --help | -h
#                 (defaults to "start" when omitted)
# The status file and log directory are initialised before any command runs.
# An unknown command prints the usage text and exits with status 1.
main() {
    local action="${1:-start}"

    init_status_file

    case "$action" in
        start)          start_monitor ;;
        stop)           stop_monitor ;;
        status)         show_monitor_status ;;
        once)           run_once ;;
        help|--help|-h) show_usage ;;
        restart)
            stop_monitor
            sleep 2
            start_monitor
            ;;
        *)
            error "未知命令: $action"
            show_usage
            exit 1
            ;;
    esac
}
|
||
|
||
# Script entry point: a command is required. Without arguments the usage
# text is printed and the script exits with status 1.
if (( $# == 0 )); then
    show_usage
    exit 1
fi

main "$@"