#!/bin/bash
#
# Tomcat OOM monitor: periodically inspects each configured Tomcat instance
# for OutOfMemoryError log entries, kernel OOM-killer records and excessive
# resident memory, and restarts the instance when one of them is detected.

# ---------------------------------------------------------------------------
# OOM monitor configuration
# ---------------------------------------------------------------------------
readonly MONITOR_INTERVAL=30      # seconds between two checks
readonly OOM_KEYWORDS=("java.lang.OutOfMemoryError" "OutOfMemoryError" "java.lang.OutOfMemory")
readonly MAX_RESTART_ATTEMPTS=3   # maximum number of restart attempts per day
readonly RESTART_COOLDOWN=300     # minimum seconds between two restarts

# Memory threshold (adjust to your environment). RSS above this value is
# treated as "memory usage too high".
readonly MEMORY_THRESHOLD_GB=6
readonly MEMORY_THRESHOLD_KB=$((MEMORY_THRESHOLD_GB * 1024 * 1024))   # same threshold in KB

# Log / state files
readonly LOG_FILE="/home/chenwujian2/script/logs/oom_monitor.log"
readonly STATUS_FILE="/home/chenwujian2/script/logs/oom_status.json"

# Tomcat instances, keyed by port, value is the base directory.
# The install directory of an instance is "<base>/tomcat<port>".
declare -A TOMCAT_INSTANCES=(
  [90105]="/home/chenwujian"
  [90106]="/home/chenwujian"
  [90107]="/home/chenwujian2"
  [90108]="/home/chenwujian2"
)

# ANSI colors for console output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m'

# Print the full install path of the Tomcat instance for a port.
# Arguments: $1 - port (key of TOMCAT_INSTANCES)
# Outputs:   "<base>/tomcat<port>" on stdout
get_tomcat_path() {
  local port=$1
  local base_path="${TOMCAT_INSTANCES[$port]}"
  echo "${base_path}/tomcat${port}"
}

# Print all monitored ports on a single line, separated by spaces.
# The keys are sorted numerically so the order is stable between runs
# (associative-array key order is otherwise unspecified).
get_monitor_ports() {
  printf '%s\n' "${!TOMCAT_INSTANCES[@]}" | sort -n | paste -s -d ' ' -
}

# Shared implementation of the logging helpers: prints a colored line on
# stdout and appends the same line, without colors, to LOG_FILE.
# Only the color codes are escape-expanded; the message is written verbatim
# so console and log file always show the same text.
# Arguments: $1 - color escape, $2 - message prefix (may be empty), $3 - message
_log_line() {
  local color=$1 prefix=$2 message=$3
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%b[%s] %s%s%b\n' "$color" "$timestamp" "$prefix" "$message" "$NC"
  printf '[%s] %s%s\n' "$timestamp" "$prefix" "$message" >> "$LOG_FILE"
}

# Logging helpers. Arguments: $1 - message
log()   { _log_line "$GREEN"  ""        "$1"; }
warn()  { _log_line "$YELLOW" "警告: " "$1"; }
error() { _log_line "$RED"    "错误: " "$1"; }
info()  { _log_line "$BLUE"   ""        "$1"; }

# Create the log/status directories and, if it does not exist yet, an
# initial status file. The directories are created first because the status
# file lives inside one of them.
init_status_file() {
  mkdir -p "$(dirname "$LOG_FILE")" "$(dirname "$STATUS_FILE")"

  if [ ! -f "$STATUS_FILE" ]; then
    cat > "$STATUS_FILE" << EOF
{
  "monitor_start_time": "$(date '+%Y-%m-%d %H:%M:%S')",
  "restart_records": {}
}
EOF
  fi
}

# Print the PID of the Tomcat JVM for a port (empty output if none found).
# At most one PID is printed so callers can pass it straight to ps/kill.
# Arguments: $1 - port
get_tomcat_pid() {
  local port=$1
  local tomcat_path pid
  tomcat_path=$(get_tomcat_path "$port")

  # Method 1: look for a java process whose command line carries this
  # instance's catalina.base (matched as a fixed string, not a regex).
  pid=$(ps -ef \
    | grep java \
    | grep -F -- "catalina.base=${tomcat_path}" \
    | grep -v grep \
    | awk '{print $2}' \
    | head -n 1)

  # Method 2: fall back to the process listening on the port.
  # NOTE(review): the configured keys (90105...) are above 65535, so they
  # cannot be real TCP ports and this fallback presumably never matches
  # for them - confirm.
  if [ -z "$pid" ]; then
    pid=$(netstat -tlnp 2>/dev/null \
      | grep ":${port} " \
      | awk '{print $7}' \
      | cut -d'/' -f1 \
      | grep -v '^-$' \
      | grep -v '^$' \
      | head -n 1)
  fi

  echo "$pid"
}

# Succeeds (returns 0) when a Tomcat process for the port exists.
# Arguments: $1 - port
is_tomcat_running() {
  local port=$1
  local pid
  pid=$(get_tomcat_pid "$port")
  [ -n "$pid" ] && ps -p "$pid" > /dev/null 2>&1
}

# Check the last 1000 lines of catalina.out for any OOM keyword
# (case-insensitive, fixed-string match).
# Arguments: $1 - port
# Returns:   0 if an OOM line was found (and logged), 1 otherwise
check_oom_error() {
  local port=$1
  local log_file recent keyword oom_line
  log_file="$(get_tomcat_path "$port")/logs/catalina.out"

  [ -f "$log_file" ] || return 1

  # Read the tail once so the test and the reported line come from the
  # same snapshot of the file.
  recent=$(tail -n 1000 "$log_file")

  for keyword in "${OOM_KEYWORDS[@]}"; do
    oom_line=$(grep -iF -- "$keyword" <<< "$recent" | tail -n 1)
    if [ -n "$oom_line" ]; then
      log "检测到Tomcat $port OOM错误: $oom_line"
      return 0
    fi
  done

  return 1
}

# Check dmesg for a kernel OOM-killer record that names the PID currently
# reported for this port.
# Arguments: $1 - port
# Returns:   0 if such a record was found (and logged), 1 otherwise
check_dmesg_oom() {
  local port=$1
  local pid oom_info
  pid=$(get_tomcat_pid "$port")

  [ -n "$pid" ] || return 1

  # The kernel prints "Killed process <pid> (<comm>) ...", i.e. the PID
  # comes before the command name; [^0-9] keeps PID 123 from matching 1234.
  oom_info=$(dmesg | grep -iE "killed process ${pid}[^0-9].*java" | tail -n 1)
  if [ -n "$oom_info" ]; then
    log "检测到系统OOM Killer杀死了Tomcat $port (PID: $pid): $oom_info"
    return 0
  fi

  return 1
}

# Log the memory usage of the Tomcat process and report whether its RSS
# exceeds MEMORY_THRESHOLD_KB.
# Arguments: $1 - port
# Returns:   0 if RSS is above the threshold, 1 otherwise (including when
#            the process or its memory figures cannot be determined)
check_memory_usage() {
  local port=$1
  local pid mem_info rss vsz rss_gb vsz_gb
  pid=$(get_tomcat_pid "$port")

  [ -n "$pid" ] || return 1

  # Columns: pid rss(KB) vsz(KB) pmem comm; only java processes count.
  mem_info=$(ps -o pid,rss,vsz,pmem,comm -p "$pid" 2>/dev/null | grep java | head -n 1)
  [ -n "$mem_info" ] || return 1

  read -r _ rss vsz _ <<< "$mem_info"
  # Refuse to do arithmetic on anything that is not a plain number.
  [[ "$rss" =~ ^[0-9]+$ && "$vsz" =~ ^[0-9]+$ ]] || return 1

  rss_gb=$((rss / 1024 / 1024))
  vsz_gb=$((vsz / 1024 / 1024))

  # Informational only; logging the usage does not trigger a restart.
  info "Tomcat $port 内存使用: RSS=${rss_gb}GB (${rss}KB) VSZ=${vsz_gb}GB (${vsz}KB)"

  if [ "$rss" -gt "$MEMORY_THRESHOLD_KB" ]; then
    warn "Tomcat $port 内存使用过高: RSS=${rss_gb}GB (超过${MEMORY_THRESHOLD_GB}GB阈值)"
    return 0
  fi

  return 1
}

# Stop, optionally back up the log of, and start a Tomcat instance.
# Arguments: $1 - port, $2 - reason text (also decides whether the log is
#            backed up)
# Returns:   0 if the instance is running after the restart, 1 otherwise
restart_tomcat() {
  local port=$1
  local reason="$2"
  local tomcat_path pid log_file backup_name
  tomcat_path=$(get_tomcat_path "$port")

  log "准备重启Tomcat $port,原因: $reason"
  log "Tomcat路径: $tomcat_path"

  # Stop Tomcat
  if [ ! -f "${tomcat_path}/bin/shutdown.sh" ]; then
    error "Tomcat $port 停止脚本不存在: ${tomcat_path}/bin/shutdown.sh"
    return 1
  fi
  info "停止Tomcat $port..."
  if sh "${tomcat_path}/bin/shutdown.sh"; then
    log "Tomcat $port 停止命令已发送"
  else
    warn "Tomcat $port 停止过程中可能出现问题"
  fi

  # Give the JVM time to exit
  sleep 5

  # Force-kill whatever survived the graceful shutdown
  pid=$(get_tomcat_pid "$port")
  if [ -n "$pid" ]; then
    warn "强制杀死Tomcat $port 残留进程: $pid"
    kill -9 "$pid" 2>/dev/null
    sleep 2
  fi

  # Back up catalina.out, but only for real OOM events
  if [ "$reason" = "应用层OOM错误" ] || [ "$reason" = "系统OOM Killer" ]; then
    log_file="${tomcat_path}/logs/catalina.out"
    if [ -f "$log_file" ]; then
      backup_name="${log_file}.oom.$(date +%Y%m%d_%H%M%S)"
      if cp "$log_file" "$backup_name"; then
        log "OOM日志已备份: $backup_name"
        # Truncate the original only once the copy exists, so a failed
        # backup never destroys the evidence.
        : > "$log_file"
      else
        warn "OOM日志备份失败,保留原日志: $log_file"
      fi
    fi
  fi

  # Start Tomcat
  if [ ! -f "${tomcat_path}/bin/startup.sh" ]; then
    error "Tomcat $port 启动脚本不存在"
    return 1
  fi
  info "启动Tomcat $port..."
  if ! sh "${tomcat_path}/bin/startup.sh"; then
    error "Tomcat $port 启动失败"
    return 1
  fi
  log "Tomcat $port 启动命令已执行"

  # Wait for startup, then verify
  sleep 10
  if is_tomcat_running "$port"; then
    log "Tomcat $port 重启成功"
    return 0
  fi
  error "Tomcat $port 启动后未运行"
  return 1
}

# Append a restart record for the port to STATUS_FILE (via jq), or to
# LOG_FILE as plain text when jq is not installed.
# Arguments: $1 - port, $2 - reason, $3 - "true"/"false" (stored as string)
update_restart_record() {
  local port=$1
  local reason="$2"
  local success="$3"
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  local temp_file="${STATUS_FILE}.tmp"

  if command -v jq >/dev/null 2>&1; then
    # Write to a temp file first; the status file is only replaced when jq
    # produced a complete document.
    if jq --arg port "$port" \
          --arg timestamp "$timestamp" \
          --arg reason "$reason" \
          --arg success "$success" \
          '.restart_records[$port] += [{"time": $timestamp, "reason": $reason, "success": $success}]' \
          "$STATUS_FILE" > "$temp_file"; then
      mv "$temp_file" "$STATUS_FILE"
    else
      rm -f "$temp_file"
      warn "状态文件更新失败: $STATUS_FILE"
    fi
  else
    # Plain-text fallback when jq is unavailable
    echo "重启记录 - 端口: $port, 时间: $timestamp, 原因: $reason, 成功: $success" >> "$LOG_FILE"
  fi
}

# Print the non-negative integer stored in a file, or 0 when the file is
# missing or does not contain a plain decimal number.
# Arguments: $1 - file path
_read_int_file() {
  local value=""
  if [ -f "$1" ]; then
    value=$(cat "$1" 2>/dev/null)
  fi
  if [[ "$value" =~ ^[0-9]+$ ]]; then
    # 10# forces base 10 so values with leading zeros are not read as octal
    echo "$((10#$value))"
  else
    echo 0
  fi
}

# Decide whether the port may be restarted now, based on the cooldown and
# the per-day restart limit kept in /tmp/tomcat_<port>_restart_* files.
# Arguments: $1 - port
# Returns:   0 if a restart is allowed, 1 otherwise (reason is logged)
can_restart() {
  local port=$1
  local current_time last_restart time_diff remaining
  local restart_time_file="/tmp/tomcat_${port}_restart_time"
  local restart_count_file="/tmp/tomcat_${port}_restart_count"
  local count_date_file="/tmp/tomcat_${port}_restart_date"
  local restart_count today last_date
  current_time=$(date +%s)

  # Cooldown check
  if [ -f "$restart_time_file" ]; then
    last_restart=$(_read_int_file "$restart_time_file")
    time_diff=$((current_time - last_restart))
    if [ "$time_diff" -lt "$RESTART_COOLDOWN" ]; then
      remaining=$((RESTART_COOLDOWN - time_diff))
      warn "Tomcat $port 在冷却期内,${remaining}秒后可重启"
      return 1
    fi
  fi

  # Restart count check
  restart_count=$(_read_int_file "$restart_count_file")

  # The counter is reset when the stored date is not today
  today=$(date +%Y%m%d)
  if [ -f "$count_date_file" ]; then
    last_date=$(cat "$count_date_file")
    if [ "$last_date" != "$today" ]; then
      restart_count=0
      echo "$restart_count" > "$restart_count_file"
      echo "$today" > "$count_date_file"
    fi
  else
    echo "$today" > "$count_date_file"
  fi

  if [ "$restart_count" -ge "$MAX_RESTART_ATTEMPTS" ]; then
    error "Tomcat $port 今日重启次数已达上限($MAX_RESTART_ATTEMPTS次),不再自动重启"
    return 1
  fi

  return 0
}

# Update the restart count
update_restart_count() {
  # Increment the restart counter of the port and store the current time
  # as the time of the last restart attempt.
  # Arguments: $1 - port
  local port=$1
  local restart_count_file="/tmp/tomcat_${port}_restart_count"
  local restart_count=0

  if [ -f "$restart_count_file" ]; then
    restart_count=$(cat "$restart_count_file")
  fi
  # A missing or corrupt counter file counts as zero instead of breaking
  # the arithmetic below.
  [[ "$restart_count" =~ ^[0-9]+$ ]] || restart_count=0

  restart_count=$((10#$restart_count + 1))
  echo "$restart_count" > "$restart_count_file"
  date +%s > "/tmp/tomcat_${port}_restart_time"
}

# Run all checks for one Tomcat instance and restart it when needed.
# Checks run in priority order: OOM in catalina.out, kernel OOM killer,
# RSS above the threshold. Nothing is done when the instance is not running.
# Arguments: $1 - port
monitor_tomcat() {
  local port=$1

  if ! is_tomcat_running "$port"; then
    warn "Tomcat $port 未运行"
    return
  fi

  local oom_detected=false
  local oom_reason=""

  if check_oom_error "$port"; then
    oom_detected=true
    oom_reason="应用层OOM错误"
  elif check_dmesg_oom "$port"; then
    oom_detected=true
    oom_reason="系统OOM Killer"
  elif check_memory_usage "$port"; then
    oom_detected=true
    oom_reason="内存使用过高"
  fi

  [ "$oom_detected" = true ] || return 0

  warn "检测到Tomcat $port 需要重启: $oom_reason"
  can_restart "$port" || return 0

  info "开始重启Tomcat $port..."
  if restart_tomcat "$port" "$oom_reason"; then
    log "Tomcat $port 重启成功"
    update_restart_record "$port" "$oom_reason" "true"
  else
    error "Tomcat $port 重启失败"
    update_restart_record "$port" "$oom_reason" "false"
  fi
  # Every attempt counts, successful or not; otherwise a restart that keeps
  # failing would bypass both the cooldown and MAX_RESTART_ATTEMPTS.
  update_restart_count "$port"
}

# Print the monitor configuration and the current state of every instance.
show_monitor_status() {
  local port base_path pid mem_info rss_gb

  log "=== Tomcat OOM监控状态 ==="
  log "监控节点: $(get_monitor_ports | tr '\n' ' ')"
  log "检查间隔: ${MONITOR_INTERVAL}秒"
  log "内存阈值: ${MEMORY_THRESHOLD_GB}GB"
  log "日志文件: $LOG_FILE"
  log "状态文件: $STATUS_FILE"
  echo

  for port in $(get_monitor_ports); do
    base_path="${TOMCAT_INSTANCES[$port]}"

    if is_tomcat_running "$port"; then
      pid=$(get_tomcat_pid "$port")
      # "rss=" suppresses the header so only the number (KB) is printed.
      mem_info=$(ps -o rss= -p "$pid" 2>/dev/null | tail -1)
      mem_info=${mem_info//[[:space:]]/}
      # The process may have exited in the meantime; show 0 rather than
      # failing on a non-numeric value.
      [[ "$mem_info" =~ ^[0-9]+$ ]] || mem_info=0
      rss_gb=$((mem_info / 1024 / 1024))

      echo -e "${GREEN}Tomcat $port: 运行中 (PID: $pid, 内存: ~${rss_gb}GB, 路径: ${base_path}/tomcat${port})${NC}"
    else
      echo -e "${RED}Tomcat $port: 未运行 (路径: ${base_path}/tomcat${port})${NC}"
    fi
  done
  echo
}

# Main monitoring loop: print the configuration and initial status, then
# check every instance each MONITOR_INTERVAL seconds until interrupted.
start_monitor() {
  local port

  log "启动Tomcat OOM监控服务..."
  log "监控端口: $(get_monitor_ports | tr '\n' ' ')"
  log "检查间隔: ${MONITOR_INTERVAL}秒"
  log "内存阈值: ${MEMORY_THRESHOLD_GB}GB"
  log "按 Ctrl+C 停止监控"
  echo

  # Initial status
  show_monitor_status

  while true; do
    for port in $(get_monitor_ports); do
      monitor_tomcat "$port"
    done

    # Wait for the next round
    sleep "$MONITOR_INTERVAL"
  done
}

# Stop running monitor processes, found by matching their command line.
# The calling process is skipped: with a bare `pkill -f` the script could
# kill itself, so `restart` would never reach start_monitor.
stop_monitor() {
  local pid

  log "停止Tomcat OOM监控服务..."
  for pid in $(pgrep -f "tomcat.*oom.*monitor"); do
    [ "$pid" = "$$" ] && continue
    # Best effort: the process may already be gone.
    kill "$pid" 2>/dev/null || true
  done
}

# Print usage and the effective configuration.
show_usage() {
  local port base_path

  echo "使用方法: $0 {start|stop|status|restart|once}"
  echo
  echo "命令说明:"
  echo "  start     启动OOM监控服务"
  echo "  stop      停止OOM监控服务"
  echo "  status    查看监控状态"
  echo "  restart   重启监控服务"
  echo "  once      执行一次检查(不持续监控)"
  echo
  echo "监控配置:"
  echo "  监控端口和路径:"
  for port in $(get_monitor_ports); do
    base_path="${TOMCAT_INSTANCES[$port]}"
    echo "    $port -> ${base_path}/tomcat${port}"
  done
  echo "  检查间隔: ${MONITOR_INTERVAL}秒"
  echo "  内存阈值: ${MEMORY_THRESHOLD_GB}GB"
  echo "  日志文件: $LOG_FILE"
}

# Check every instance exactly once (no loop).
run_once() {
  local port

  log "执行单次OOM检查..."
  for port in $(get_monitor_ports); do
    monitor_tomcat "$port"
  done
  log "单次检查完成"
}

# Dispatch the sub-command given as $1.
main() {
  local command="${1:-start}"

  # Make sure log directory and status file exist before anything logs
  init_status_file

  case "$command" in
    start)
      start_monitor
      ;;
    stop)
      stop_monitor
      ;;
    status)
      show_monitor_status
      ;;
    restart)
      stop_monitor
      sleep 2
      start_monitor
      ;;
    once)
      run_once
      ;;
    help|--help|-h)
      show_usage
      ;;
    *)
      error "未知命令: $command"
      show_usage
      exit 1
      ;;
  esac
}

# Script entry point: a sub-command is required.
if [ $# -eq 0 ]; then
  show_usage
  exit 1
fi

main "$@"