Files
qwsy/月度/11月/晨午检脚本/oom_monitor.sh
binghuai 2430c0e683 11.20
2025-11-20 19:01:30 +08:00

512 lines
14 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Tomcat OOM monitor: inspects the configured Tomcat instances for
# OutOfMemoryError log entries, kernel OOM-killer records and high RSS, and
# restarts an affected instance subject to a cooldown and a daily limit.
# OOM monitoring configuration
readonly MONITOR_INTERVAL=30 # check interval (seconds)
readonly OOM_KEYWORDS=("java.lang.OutOfMemoryError" "OutOfMemoryError" "java.lang.OutOfMemory")
readonly MAX_RESTART_ATTEMPTS=3 # maximum number of restart attempts
readonly RESTART_COOLDOWN=300 # restart cooldown (seconds)
# Memory threshold configuration (adjust to your environment)
readonly MEMORY_THRESHOLD_GB=6 # threshold in GB; RSS above this counts as excessive memory use
readonly MEMORY_THRESHOLD_KB=$((MEMORY_THRESHOLD_GB * 1024 * 1024)) # same threshold converted to KB
# Log configuration
readonly LOG_FILE="/home/chenwujian2/script/logs/oom_monitor.log"
readonly STATUS_FILE="/home/chenwujian2/script/logs/oom_status.json"
# Tomcat instance configuration (port -> base path); the instance directory
# is derived as "<base path>/tomcat<port>".
# NOTE(review): these keys are larger than 65535, so they cannot be real TCP
# ports; the netstat fallback that greps for ":<port> " presumably never
# matches for them — confirm whether they are identifiers or placeholders.
declare -A TOMCAT_INSTANCES=(
[90105]="/home/chenwujian"
[90106]="/home/chenwujian"
[90107]="/home/chenwujian2"
[90108]="/home/chenwujian2"
)
# ANSI colour escape sequences used for console output
# (CYAN is not referenced in the visible part of this script)
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m'
# Print the full directory of the Tomcat instance for a port.
# Arguments: $1 - port (key of TOMCAT_INSTANCES)
# Outputs:   "<base path>/tomcat<port>" on stdout
get_tomcat_path() {
  local port=$1
  printf '%s/tomcat%s\n' "${TOMCAT_INSTANCES[$port]}" "$port"
}
# Print every monitored port (the keys of TOMCAT_INSTANCES) on one line,
# separated by spaces.
get_monitor_ports() {
  local -a ports=("${!TOMCAT_INSTANCES[@]}")
  echo "${ports[@]}"
}
# Logging helpers.
# log: print a timestamped message in green on stdout and append the
# uncoloured line to LOG_FILE.
# Arguments: $1 - message text
log() {
  local text="$1"
  local now
  now=$(date '+%Y-%m-%d %H:%M:%S')
  local line="[${now}] ${text}"
  echo -e "${GREEN}${line}${NC}"
  echo "${line}" >> "$LOG_FILE"
}
# warn: print a timestamped warning in yellow on stdout and append the
# uncoloured line to LOG_FILE.
# Arguments: $1 - message text
warn() {
  local text="$1"
  local now
  now=$(date '+%Y-%m-%d %H:%M:%S')
  local line="[${now}] 警告: ${text}"
  echo -e "${YELLOW}${line}${NC}"
  echo "${line}" >> "$LOG_FILE"
}
# error: print a timestamped error in red on stdout and append the
# uncoloured line to LOG_FILE.
# Arguments: $1 - message text
error() {
  local text="$1"
  local now
  now=$(date '+%Y-%m-%d %H:%M:%S')
  local line="[${now}] 错误: ${text}"
  echo -e "${RED}${line}${NC}"
  echo "${line}" >> "$LOG_FILE"
}
# info: print a timestamped message in blue on stdout and append the
# uncoloured line to LOG_FILE.
# Arguments: $1 - message text
info() {
  local text="$1"
  local now
  now=$(date '+%Y-%m-%d %H:%M:%S')
  local line="[${now}] ${text}"
  echo -e "${BLUE}${line}${NC}"
  echo "${line}" >> "$LOG_FILE"
}
# Prepare the monitor's on-disk state.
# Creates the directories that hold LOG_FILE and STATUS_FILE, then writes an
# initial STATUS_FILE (start time plus an empty restart_records object) if it
# does not exist yet. An existing status file is left untouched.
# Globals: LOG_FILE (read), STATUS_FILE (read)
# Returns: non-zero if a directory or the status file cannot be created
init_status_file() {
  local log_dir status_dir
  log_dir=$(dirname -- "$LOG_FILE")
  status_dir=$(dirname -- "$STATUS_FILE")
  # The directories must exist before anything is written into them;
  # otherwise the very first run fails to create the status file.
  mkdir -p -- "$log_dir" "$status_dir" || return 1
  if [ ! -f "$STATUS_FILE" ]; then
    cat > "$STATUS_FILE" << EOF
{
"monitor_start_time": "$(date '+%Y-%m-%d %H:%M:%S')",
"restart_records": {}
}
EOF
  fi
}
# Print the PID of the Tomcat instance for a port, or an empty line if none
# is found.
# Arguments: $1 - port (key of TOMCAT_INSTANCES)
# Outputs:   at most ONE PID on stdout. Callers pass the value straight to
#            `ps -p` and `kill`, which break on a multi-line PID list, so
#            only the first match is reported.
get_tomcat_pid() {
  local port=$1
  local tomcat_path pid
  tomcat_path=$(get_tomcat_path "$port")
  # Method 1: look for a java process whose command line carries this
  # instance's catalina.base (matched as a fixed string, not a regex).
  pid=$(ps -ef \
    | grep java \
    | grep -F -- "catalina.base=${tomcat_path}" \
    | grep -v grep \
    | awk '{print $2}' \
    | head -n 1)
  # Method 2: fall back to the owner of the listening socket. netstat can
  # list the same PID twice (tcp and tcp6 listeners), hence head -n 1.
  if [ -z "$pid" ]; then
    pid=$(netstat -tlnp 2>/dev/null \
      | grep ":${port} " \
      | awk '{print $7}' \
      | cut -d'/' -f1 \
      | grep -v '^-$' \
      | grep -v '^$' \
      | head -n 1)
  fi
  echo "$pid"
}
# Report whether the Tomcat instance for a port is running.
# Arguments: $1 - port
# Returns:   0 if get_tomcat_pid yields a PID that `ps -p` confirms,
#            1 otherwise
is_tomcat_running() {
  local port=$1
  local pid
  pid=$(get_tomcat_pid "$port")
  [ -n "$pid" ] || return 1
  ps -p "$pid" > /dev/null 2>&1 || return 1
  return 0
}
# Check the instance's catalina.out for application-level OOM errors.
# Only the last 1000 lines of the log are inspected. Each entry of
# OOM_KEYWORDS is matched case-insensitively as a fixed string; the last
# matching line is written to the monitor log.
# Arguments: $1 - port
# Returns:   0 if an OOM keyword was found, 1 otherwise (including when the
#            log file does not exist)
check_oom_error() {
  local port=$1
  local tomcat_path log_file keyword oom_line
  tomcat_path=$(get_tomcat_path "$port")
  log_file="${tomcat_path}/logs/catalina.out"
  [ -f "$log_file" ] || return 1
  for keyword in "${OOM_KEYWORDS[@]}"; do
    # One pass both detects the keyword and captures the line to report, so
    # the reported line comes from the same snapshot that triggered it.
    oom_line=$(tail -n 1000 "$log_file" | grep -i -F -- "$keyword" | tail -n 1)
    if [ -n "$oom_line" ]; then
      log "检测到Tomcat $port OOM错误: $oom_line"
      return 0
    fi
  done
  return 1
}
# Check the kernel log (dmesg) for an OOM-killer record naming this
# instance's java process.
# The kernel reports the victim as "Killed process <pid> (java) ...", i.e.
# the PID comes BEFORE the command name, so the pattern anchors the PID
# directly after "killed process" and requires a non-digit after it (PID 432
# must not match a record for PID 4321).
# NOTE(review): the PID comes from get_tomcat_pid, i.e. a process that is
# currently found; a process that was already killed would presumably no
# longer be reported there — confirm this check can fire in practice.
# Arguments: $1 - port
# Returns:   0 if a matching record exists, 1 otherwise
check_dmesg_oom() {
  local port=$1
  local pid oom_info
  pid=$(get_tomcat_pid "$port")
  [ -n "$pid" ] || return 1
  oom_info=$(dmesg | grep -i -E -- "killed process ${pid}[^0-9].*java" | tail -n 1)
  if [ -n "$oom_info" ]; then
    log "检测到系统OOM Killer杀死了Tomcat $port (PID: $pid): $oom_info"
    return 0
  fi
  return 1
}
# Check the resident memory of the instance's java process.
# Always logs the current RSS/VSZ (as reported by ps, in KB) when the
# process is found, and warns when RSS exceeds MEMORY_THRESHOLD_KB.
# Arguments: $1 - port
# Returns:   0 if RSS is above the threshold, 1 otherwise (including when
#            no PID or no java row is found)
check_memory_usage() {
  local port=$1
  local pid
  pid=$(get_tomcat_pid "$port")
  [ -n "$pid" ] || return 1

  # Keep only the java row of the ps output (drops the header line).
  local ps_row
  ps_row=$(ps -o pid,rss,vsz,pmem,comm -p "$pid" 2>/dev/null | grep java)
  [ -n "$ps_row" ] || return 1

  local rss vsz
  rss=$(awk '{print $2}' <<< "$ps_row") # resident memory (KB)
  vsz=$(awk '{print $3}' <<< "$ps_row") # virtual memory (KB)
  local rss_gb=$((rss / 1024 / 1024))
  local vsz_gb=$((vsz / 1024 / 1024))

  # Informational record only; this line alone never triggers a restart.
  info "Tomcat $port 内存使用: RSS=${rss_gb}GB (${rss}KB) VSZ=${vsz_gb}GB (${vsz}KB)"

  if [ "$rss" -gt "$MEMORY_THRESHOLD_KB" ]; then
    warn "Tomcat $port 内存使用过高: RSS=${rss_gb}GB (超过${MEMORY_THRESHOLD_GB}GB阈值)"
    return 0
  fi
  return 1
}
# Restart the Tomcat instance for a port.
# Steps: run bin/shutdown.sh, wait, SIGKILL any process still found, back up
# and clear catalina.out when the reason is a real OOM, run bin/startup.sh,
# wait, then verify the instance is running.
# Arguments: $1 - port
#            $2 - reason string; the log backup happens only for
#                 "应用层OOM错误" and "系统OOM Killer"
# Returns:   0 if the instance is running after startup; 1 if a script is
#            missing, startup.sh fails, or the instance is not running
restart_tomcat() {
  local port=$1
  local reason="$2"
  local tomcat_path pid log_file backup_name
  tomcat_path=$(get_tomcat_path "$port")
  log "准备重启Tomcat $port,原因: $reason"
  log "Tomcat路径: $tomcat_path"

  # Stop Tomcat
  if [ ! -f "${tomcat_path}/bin/shutdown.sh" ]; then
    error "Tomcat $port 停止脚本不存在: ${tomcat_path}/bin/shutdown.sh"
    return 1
  fi
  info "停止Tomcat $port..."
  if sh "${tomcat_path}/bin/shutdown.sh"; then
    log "Tomcat $port 停止命令已发送"
  else
    # A failed shutdown is not fatal: the leftover process is killed below.
    warn "Tomcat $port 停止过程中可能出现问题"
  fi

  # Give the JVM time to exit on its own
  sleep 5

  # Force-kill whatever is still there
  pid=$(get_tomcat_pid "$port")
  if [ -n "$pid" ]; then
    warn "强制杀死Tomcat $port 残留进程: $pid"
    kill -9 "$pid" 2>/dev/null
    sleep 2
  fi

  # Back up the log only for real OOM reasons. The original log is cleared
  # ONLY after the copy succeeded, so a failed backup never loses the log.
  if [ "$reason" = "应用层OOM错误" ] || [ "$reason" = "系统OOM Killer" ]; then
    log_file="${tomcat_path}/logs/catalina.out"
    if [ -f "$log_file" ]; then
      backup_name="${log_file}.oom.$(date +%Y%m%d_%H%M%S)"
      if cp -- "$log_file" "$backup_name"; then
        log "OOM日志已备份: $backup_name"
        : > "$log_file"
      else
        error "OOM日志备份失败,保留原日志: $log_file"
      fi
    fi
  fi

  # Start Tomcat
  if [ ! -f "${tomcat_path}/bin/startup.sh" ]; then
    error "Tomcat $port 启动脚本不存在"
    return 1
  fi
  info "启动Tomcat $port..."
  if ! sh "${tomcat_path}/bin/startup.sh"; then
    error "Tomcat $port 启动失败"
    return 1
  fi
  log "Tomcat $port 启动命令已执行"

  # Wait for startup, then verify
  sleep 10
  if is_tomcat_running "$port"; then
    log "Tomcat $port 重启成功"
    return 0
  fi
  error "Tomcat $port 启动后未运行"
  return 1
}
# Record the outcome of a restart.
# With jq available, appends {time, reason, success} to
# .restart_records[<port>] in STATUS_FILE (written via "<STATUS_FILE>.tmp"
# and moved into place). Without jq, appends a plain-text line to LOG_FILE.
# Arguments: $1 - port
#            $2 - reason string
#            $3 - success flag, stored as the given string
update_restart_record() {
  local port=$1
  local reason="$2"
  local success="$3"
  local now
  now=$(date '+%Y-%m-%d %H:%M:%S')
  local scratch="${STATUS_FILE}.tmp"

  # No jq: fall back to a plain-text record in the log file.
  if ! command -v jq >/dev/null 2>&1; then
    echo "重启记录 - 端口: $port, 时间: $now, 原因: $reason, 成功: $success" >> "$LOG_FILE"
    return
  fi

  local -a jq_args=(
    --arg port "$port"
    --arg timestamp "$now"
    --arg reason "$reason"
    --arg success "$success"
  )
  local filter='.restart_records[$port] += [{"time": $timestamp, "reason": $reason, "success": $success}]'
  # Replace the status file only if jq produced the new document.
  jq "${jq_args[@]}" "$filter" "$STATUS_FILE" > "$scratch" && mv "$scratch" "$STATUS_FILE"
}
# Decide whether the instance may be restarted now.
# State is kept in /tmp/tomcat_<port>_restart_{time,count,date}.
# Arguments: $1 - port
# Returns:   1 while the last recorded restart is younger than
#            RESTART_COOLDOWN seconds, or once today's count has reached
#            MAX_RESTART_ATTEMPTS; 0 otherwise
can_restart() {
  local port=$1
  local stamp_file="/tmp/tomcat_${port}_restart_time"
  local counter_file="/tmp/tomcat_${port}_restart_count"
  local day_file="/tmp/tomcat_${port}_restart_date"
  local now
  now=$(date +%s)

  # Cooldown check
  if [ -f "$stamp_file" ]; then
    local last_stamp
    last_stamp=$(< "$stamp_file")
    local elapsed=$((now - last_stamp))
    if [ "$elapsed" -lt "$RESTART_COOLDOWN" ]; then
      local remaining=$((RESTART_COOLDOWN - elapsed))
      warn "Tomcat $port 在冷却期内,${remaining}秒后可重启"
      return 1
    fi
  fi

  # Restart count recorded so far
  local attempts=0
  if [ -f "$counter_file" ]; then
    attempts=$(< "$counter_file")
  fi

  # The counter is per day: a date file from another day resets it.
  local today
  today=$(date +%Y%m%d)
  if [ ! -f "$day_file" ]; then
    echo "$today" > "$day_file"
  elif [ "$(< "$day_file")" != "$today" ]; then
    attempts=0
    echo "$attempts" > "$counter_file"
    echo "$today" > "$day_file"
  fi

  if [ "$attempts" -ge "$MAX_RESTART_ATTEMPTS" ]; then
    error "Tomcat $port 今日重启次数已达上限($MAX_RESTART_ATTEMPTS次),不再自动重启"
    return 1
  fi
  return 0
}
# Record one more restart for a port: increments
# /tmp/tomcat_<port>_restart_count and stores the current epoch time in
# /tmp/tomcat_<port>_restart_time (the values can_restart reads).
# Arguments: $1 - port
update_restart_count() {
  local port=$1
  local counter_file="/tmp/tomcat_${port}_restart_count"
  local attempts=0
  if [ -f "$counter_file" ]; then
    attempts=$(< "$counter_file")
  fi
  echo "$((attempts + 1))" > "$counter_file"
  printf '%s\n' "$(date +%s)" > "/tmp/tomcat_${port}_restart_time"
}
# Run one monitoring pass for a single Tomcat instance.
# Checks, in priority order, for an application-level OOM in the log, a
# kernel OOM-killer record, and RSS above the threshold; the first hit
# becomes the restart reason. If can_restart allows it, the instance is
# restarted and the outcome is recorded.
# Every restart ATTEMPT, successful or not, updates the attempt counter and
# cooldown timestamp; otherwise a restart that keeps failing would be retried
# on every pass and never reach MAX_RESTART_ATTEMPTS.
# NOTE(review): an instance that is not running is only reported here, never
# started — confirm that is intended.
# Arguments: $1 - port
monitor_tomcat() {
  local port=$1
  if ! is_tomcat_running "$port"; then
    warn "Tomcat $port 未运行"
    return
  fi

  # First matching check wins (highest priority first).
  local oom_reason=""
  if check_oom_error "$port"; then
    oom_reason="应用层OOM错误"
  elif check_dmesg_oom "$port"; then
    oom_reason="系统OOM Killer"
  elif check_memory_usage "$port"; then
    oom_reason="内存使用过高"
  fi
  [ -n "$oom_reason" ] || return 0

  warn "检测到Tomcat $port 需要重启: $oom_reason"
  can_restart "$port" || return 0

  info "开始重启Tomcat $port..."
  local outcome="false"
  if restart_tomcat "$port" "$oom_reason"; then
    log "Tomcat $port 重启成功"
    outcome="true"
  else
    error "Tomcat $port 重启失败"
  fi
  update_restart_record "$port" "$oom_reason" "$outcome"
  update_restart_count "$port"
}
# Print the monitor configuration followed by one status line per instance:
# running (with PID, approximate RSS in GB and path) in green, or not
# running in red.
show_monitor_status() {
  local ports_line
  ports_line=$(get_monitor_ports | tr '\n' ' ')
  log "=== Tomcat OOM监控状态 ==="
  log "监控节点: ${ports_line}"
  log "检查间隔: ${MONITOR_INTERVAL}"
  log "内存阈值: ${MEMORY_THRESHOLD_GB}GB"
  log "日志文件: $LOG_FILE"
  log "状态文件: $STATUS_FILE"
  echo
  local base_path pid mem_info rss_gb
  for port in $(get_monitor_ports); do
    base_path="${TOMCAT_INSTANCES[$port]}"
    if ! is_tomcat_running "$port"; then
      echo -e "${RED}Tomcat $port: 未运行 (路径: ${base_path}/tomcat${port})${NC}"
      continue
    fi
    pid=$(get_tomcat_pid "$port")
    # Last line of `ps -o rss` is the RSS value in KB (the first is the header).
    mem_info=$(ps -o rss -p "$pid" 2>/dev/null | tail -1)
    rss_gb=$((mem_info / 1024 / 1024))
    echo -e "${GREEN}Tomcat $port: 运行中 (PID: $pid, 内存: ~${rss_gb}GB, 路径: ${base_path}/tomcat${port})${NC}"
  done
  echo
}
# Main monitoring loop: logs the configuration, shows the initial status,
# then checks every monitored port and sleeps MONITOR_INTERVAL seconds,
# forever (until the process is interrupted).
start_monitor() {
  local ports_line
  ports_line=$(get_monitor_ports | tr '\n' ' ')
  log "启动Tomcat OOM监控服务..."
  log "监控端口: ${ports_line}"
  log "检查间隔: ${MONITOR_INTERVAL}"
  log "内存阈值: ${MEMORY_THRESHOLD_GB}GB"
  log "按 Ctrl+C 停止监控"
  echo

  # Initial status snapshot
  show_monitor_status

  # One pass over all ports, then wait for the next cycle
  while :; do
    for port in $(get_monitor_ports); do
      monitor_tomcat "$port"
    done
    sleep $MONITOR_INTERVAL
  done
}
# Stop running instances of this monitor script.
# Other processes are found by matching this script's own file name against
# full command lines (pgrep -f) and sent SIGTERM. The current process and its
# parent are skipped so that `stop` and `restart` do not kill themselves.
# Returns: always 0 (having nothing to stop is not an error)
stop_monitor() {
  log "停止Tomcat OOM监控服务..."
  local script_name pid
  script_name=$(basename -- "$0")
  while IFS= read -r pid; do
    [ -n "$pid" ] || continue
    if [ "$pid" = "$$" ] || [ "$pid" = "$PPID" ]; then
      continue
    fi
    # Best effort: the process may already have exited.
    kill "$pid" 2>/dev/null || true
  done < <(pgrep -f -- "$script_name" 2>/dev/null)
  return 0
}
# Print the usage text: available commands followed by the current
# monitoring configuration (ports with their instance paths, interval,
# memory threshold and log file).
show_usage() {
  printf '%s\n' \
    "使用方法: $0 {start|stop|status|restart|once}" \
    "" \
    "命令说明:" \
    " start 启动OOM监控服务" \
    " stop 停止OOM监控服务" \
    " status 查看监控状态" \
    " restart 重启监控服务" \
    " once 执行一次检查(不持续监控)" \
    "" \
    "监控配置:" \
    " 监控端口和路径:"
  local base_path
  for port in $(get_monitor_ports); do
    base_path="${TOMCAT_INSTANCES[$port]}"
    printf '%s\n' " $port -> ${base_path}/tomcat${port}"
  done
  printf '%s\n' \
    " 检查间隔: ${MONITOR_INTERVAL}" \
    " 内存阈值: ${MEMORY_THRESHOLD_GB}GB" \
    " 日志文件: $LOG_FILE"
}
# Run a single monitoring pass over every configured port, with a log line
# before and after.
run_once() {
  log "执行单次OOM检查..."
  local ports
  ports=$(get_monitor_ports)
  # Deliberately unquoted: the port list is split on whitespace.
  for port in $ports; do
    monitor_tomcat "$port"
  done
  log "单次检查完成"
}
# Entry function: initialises the status file, then dispatches on the
# command.
# Arguments: $1 - command: start | stop | status | restart | once |
#                 help | --help | -h (defaults to "start")
# An unknown command logs an error, prints the usage and exits with 1.
main() {
  local action="${1:-start}"

  # Initialisation happens before every command, including help.
  init_status_file

  if [[ "$action" == "start" ]]; then
    start_monitor
  elif [[ "$action" == "stop" ]]; then
    stop_monitor
  elif [[ "$action" == "status" ]]; then
    show_monitor_status
  elif [[ "$action" == "restart" ]]; then
    stop_monitor
    sleep 2
    start_monitor
  elif [[ "$action" == "once" ]]; then
    run_once
  elif [[ "$action" == "help" || "$action" == "--help" || "$action" == "-h" ]]; then
    show_usage
  else
    error "未知命令: $action"
    show_usage
    exit 1
  fi
}
# Script entry point: with no arguments, print the usage and exit with
# status 1; otherwise hand all arguments to main. Because of this check,
# main's own "start" default is not reached when the script is run directly.
if [ $# -eq 0 ]; then
show_usage
exit 1
fi
main "$@"