#!/bin/sh
# PCP QA Test No. 1441
# primary pmlogger successfully follows hostname changes
# and
# primary pmie successfully follows hostname changes
#
# Copyright (c) 2023 Ken McDonell.  All Rights Reserved.
#
# :not_in_ci:
#	we're not sure that changing the VM/container hostname to
#	"boofa-1441" is going to lead to happiness.

# Establish $seq (the identifier used for this test's files and for the
# temporary hostname) and print the banner that heads the test output.
#
case $# in
    0)
	# run with no arguments: the script's own name is the identifier
	seq=$(basename $0)
	echo "QA output created by $seq"
	;;
    *)
	# run with arguments: keep a $seq handed down by the caller and
	# only fall back to the script name when none was provided
	if [ -z "$seq" ]
	then
	    seq=$(basename $0)
	fi
	echo "QA output created by $(basename $0) $*"
	;;
esac

# get standard environment, filters and checks
#
# These are sourced relative to the current directory, so the test has
# to be started from the directory that holds them.  The rest of this
# script uses names it never defines itself ($tmp, $here, $sudo, _notrun
# and the $PCP_* variables) -- presumably they are provided by these
# files; confirm before dropping any of the three.
. ./common.product
. ./common.filter
. ./common.check

# Remember the hostname we start with (it is restored when the test
# finishes) and choose the temporary hostname the test switches to.
#
oldhostname=`hostname`
newhostname="boofa-$seq"

# Refuse to run on any machine that is listed in qa_hosts.
#
# The hostname is compared as a fixed string against whole lines
# (-F -x).  Used as a regular expression, the dots in a name such as
# "vm1.example.com" would match any character, so an unrelated qa_hosts
# entry could wrongly turn this test into a notrun.
#
if grep -F -x "$oldhostname" qa_hosts >/dev/null
then
    _notrun "$oldhostname is too precious to change hostname"
    # NOTREACHED
fi

# need systemd's autorestart to get pmlogger restarted quickly
# ... cron-based restart takes too long for a QA test
#
# Probe with the POSIX "command -v" rather than which(1): which is an
# optional external program, and on a system that does not ship it the
# probe would fail and be misreported as "no systemd".
#
command -v systemctl >/dev/null 2>&1 || _notrun "no systemd"

# Poll, once a second, until $PCP_RUN_DIR/pmlogger.pid holds a PID that
# is non-empty and different from $1 (the PID of the primary pmlogger
# that is expected to be replaced).
#
# The 12 second limit: in the default/zeroconf setup,
# PMLOGGER_INTERVAL=10 and there are lots of metrics logged at the
# default ... pmlogger only sees PMCD_HOSTNAME_CHANGE from the
# pmFetch(), so it could be up to 10 seconds before the old pmlogger
# notices it has to exit, then systemd needs time to notice and restart
# pmlogger.
#
# Progress and process listings are appended to $seq.full.  On a timeout
# a complaint plus some diagnostics go to stdout and status is set to 1.
# Sets the (global) variables max_delay, i and pid.
#
_wait_for_new_pmlogger()
{
    echo "_wait_for_new_pmlogger previous PID $1 ..." >>$seq.full
    $PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]PID|/[p]mlogger' >>$seq.full

    max_delay=12
    i=0
    while [ $i -lt $max_delay ]
    do
	pid=$(cat $PCP_RUN_DIR/pmlogger.pid 2>/dev/null)
	if [ -z "$pid" ] || [ "$pid" = "$1" ]
	then
	    # no pid file yet, or still the previous pmlogger ... go again
	    sleep 1
	    i=$((i + 1))
	else
	    break
	fi
    done

    if [ -z "$pid" ] || [ "$pid" = "$1" ]
    then
	# ran out of time without seeing a different PID
	echo "Failed!" >>$seq.full
	echo "Arrgh: failed to see new pmlogger after $max_delay seconds"
	ls -l $PCP_RUN_DIR/pmlogger.pid
	ls -l $PCP_TMP_DIR/pmlogger
	$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]PID|/[p]mlogger'
	status=1
    else
	echo "found PID $pid (after ${i}s) ..." >>$seq.full
	$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]PID|/[p]mlogger' >>$seq.full
    fi
}

# wait until the primary pmie's pid is != $1
#
# In pmie's config.default, the most frequent rule evaluation is once
# every 2 mins (sigh) ... pmie only sees PMCD_HOSTNAME_CHANGE from the
# pmFetch(), so it could be up to 120 seconds before the old pmie notices
# it has to exit, then systemd needs time to notice and restart pmie, but
# in addition, pmie's scheduler is sometimes a bit slow so add another
# 10 secs for all of that
#
# $1 is the PID of the pmie that is expected to go away.  The pid file
# $PCP_RUN_DIR/pmie.pid is polled once a second for up to 130 seconds.
# Progress and process listings are appended to $seq.full.  On a timeout
# a complaint and diagnostics go to stdout, both pmie logs are appended
# to $seq.full and status is set to 1.
# Sets the (global) variables max_delay, i and pid.
#
_wait_for_new_pmie()
{
    echo "_wait_for_new_pmie previous PID $1" >>$seq.full
    $PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]PID|/[p]mie' >>$seq.full
    max_delay=130
    i=0
    while [ $i -lt $max_delay ]
    do
	pid=`cat $PCP_RUN_DIR/pmie.pid 2>/dev/null`
	[ -n "$pid" ] && [ "$pid" != "$1" ] && break
	sleep 1
	i=$((i + 1))
    done
    if [ -n "$pid" ] && [ "$pid" != "$1" ]
    then
	echo "found PID $pid (after ${i}s) ..." >>$seq.full
	$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]PID|/[p]mie' >>$seq.full
    else
	echo "Failed!" >>$seq.full
	echo "Arrgh: failed to see new pmie after $max_delay seconds"
	ls -l $PCP_RUN_DIR/pmie.pid
	ls -l $PCP_TMP_DIR/pmie
	$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]PID|/[p]mie'
	# Only signal a real PID.  _cleanup may call us with the
	# placeholder 0, and "kill ... 0" does not target a pmie at all:
	# it signals every process in the caller's process group.
	case "$1" in
	    ''|0|*[!0-9]*)
		echo "not signalling previous pmie: \"$1\" is not a usable PID" >>$seq.full
		;;
	    *)
		# NOTE(review): presumably SIGUSR1 makes pmie write extra
		# state into its log, which is why the logs are collected
		# a second later -- confirm against pmie(1)
		$sudo -u pcp kill -USR1 "$1"
		sleep 1
		;;
	esac
	echo "=== $oldhostname log ===" >>$seq.full
	cat $PCP_LOG_DIR/pmie/$oldhostname/pmie.log >>$seq.full
	echo "=== $newhostname log ===" >>$seq.full
	cat $PCP_LOG_DIR/pmie/$newhostname/pmie.log >>$seq.full
	status=1
    fi
}

# Start from a clean slate *before* the pre-flight checks below.  They
# append diagnostics to $seq.full, so removing that file any later would
# throw away the "old ... pid" notes they record.
#
$sudo rm -rf $tmp $tmp.* $seq.full

# need primary pmlogger running
#
# Its PID is remembered in $oldlogger so the test can tell when a
# different pmlogger has taken over.
#
if [ -f $PCP_RUN_DIR/pmlogger.pid ]
then
    oldlogger=`cat $PCP_RUN_DIR/pmlogger.pid`
    echo "old pmlogger pid: $oldlogger" >>$seq.full
else
    _notrun "$PCP_RUN_DIR/pmlogger.pid not found for old primary pmlogger"
    # NOTREACHED
fi

# need primary pmie running
#
# The test is skipped unless the values reported for pmcd.pmie.pmcd_host
# include the text: 0 or "primary"
#
pminfo -f pmcd.pmie.pmcd_host >$tmp.pminfo
if grep "0 or \"primary\"" <$tmp.pminfo >/dev/null
then
    :
else
    # keep what pminfo reported, to explain the notrun
    cat $tmp.pminfo >>$seq.full
    _notrun "primary pmie not running"
    # NOTREACHED
fi

# Its PID is remembered in $oldpmie, as for pmlogger above.
#
if [ -f $PCP_RUN_DIR/pmie.pid ]
then
    oldpmie=`cat $PCP_RUN_DIR/pmie.pid`
    echo "old pmie pid: $oldpmie" >>$seq.full
else
    _notrun "$PCP_RUN_DIR/pmie.pid not found for old primary pmie"
    # NOTREACHED
fi

# PIDs of the replacement pmlogger and pmie; they stay 0 until the test
# body has seen the replacements
#
newlogger=0
newpmie=0

# Exit handler (installed by the trap below).
#
# If the hostname is not the original one, put it back and wait for
# pmlogger and pmie to be replaced once more.  Then remove the archive
# directory for the temporary hostname and this test's temporary files.
#
_cleanup()
{
    cd $here
    if [ "`hostname`" != "$oldhostname" ]
    then
	$sudo hostname "$oldhostname" 2>>$seq.full
	_wait_for_new_pmlogger "$newlogger"
	_wait_for_new_pmie "$newpmie"
    fi
    for dir in boofa-$seq
    do
	$sudo rm -rf "$PCP_ARCHIVE_DIR/$dir"
    done
    $sudo rm -rf $tmp $tmp.*
}

status=0	# success is the default!
# \$status is escaped so it is evaluated when the trap fires, not now
trap "_cleanup; exit \$status" 0 1 2 3 15

# Make stdin deterministic: replace the temporary-file prefix, the PCP
# directories and the two hostnames with fixed tokens, and replace the
# first "/" followed by a digit (and any further digits, dots and
# dashes) with /DATESTAMP.
#
# The substitutions run top to bottom on every line.  Only the $tmp and
# hostname ones are global; the others change the first match on a line.
#
_filter()
{
    sed -e "s@$tmp@TMP@g
	    s@$PCP_ARCHIVE_DIR@PCP_ARCHIVE_DIR@
	    s@$PCP_LOG_DIR@PCP_LOG_DIR@
	    s@$PCP_VAR_DIR@PCP_VAR_DIR@
	    s/$oldhostname/OLDHOSTNAME/g
	    s/$newhostname/NEWHOSTNAME/g
	    s@/[0-9][0-9.-]*@/DATESTAMP@"
}

# real QA test starts here
#
# Step 1: show the state before the hostname is touched.
#
# Every failure below sets status=1 and then uses a bare "exit"; the
# exit trap is what turns $status into the script's exit code.
#
echo "old pmlogger ..."
if [ ! -f $PCP_TMP_DIR/pmlogger/primary ]
then
    echo "Arrgh: no tracking file in $PCP_TMP_DIR for primary pmlogger"
    ls -l $PCP_TMP_DIR/pmlogger
    status=1
    exit
fi
# Besides the usual filtering, a line that is exactly "reexec" or exactly
# "pmlogger_check" is rewritten to the same "pmlogger_check|reexec"
# token, so either spelling gives the same output.
_filter <$PCP_TMP_DIR/pmlogger/primary \
| sed \
    -e 's/^reexec$/pmlogger_check|reexec/' \
    -e 's/^pmlogger_check$/pmlogger_check|reexec/' \
    # end

echo
echo "old pmie ..."
# pmie's tracking file is named after its PID
if [ ! -f $PCP_TMP_DIR/pmie/$oldpmie ]
then
    echo "Arrgh: no tracking file in $PCP_TMP_DIR for old primary pmie"
    ls -l $PCP_TMP_DIR/pmie
    status=1
    exit
fi
# the tracking file is not printed directly, pmiestatus reports on it
$PCP_BINADM_DIR/pmiestatus $PCP_TMP_DIR/pmie/$oldpmie \
| _filter

echo
echo "Change hostname ..."
# Step 2: change the hostname, then wait for a primary pmlogger with a
# PID other than $oldlogger.  Anything hostname(1) writes to stderr is
# kept in $seq.full rather than in the test output.
$sudo hostname "$newhostname" 2>>$seq.full
_wait_for_new_pmlogger "$oldlogger"

echo
echo "new pmlogger ..."
if [ ! -f $PCP_TMP_DIR/pmlogger/primary ]
then
    echo "Arrgh: no tracking file in $PCP_TMP_DIR for primary pmlogger"
    ls -l $PCP_TMP_DIR/pmlogger
    status=1
    exit
fi
# unlike the "old pmlogger" output, no reexec/pmlogger_check rewriting
# is applied here
_filter <$PCP_TMP_DIR/pmlogger/primary

# Remember the replacement's PID: _cleanup passes $newlogger to
# _wait_for_new_pmlogger after it restores the original hostname.
if [ -f $PCP_RUN_DIR/pmlogger.pid ]
then
    newlogger=`cat $PCP_RUN_DIR/pmlogger.pid`
    echo "current pmlogger PID $newlogger" >>$seq.full
else
    echo "$PCP_RUN_DIR/pmlogger.pid not found for new primary pmlogger"
    status=1
    exit
fi

echo
echo "new pmie ..."
# Step 3: wait for a primary pmie with a PID other than $oldpmie (this
# wait allows up to 130 seconds).
_wait_for_new_pmie "$oldpmie"

# Remember the replacement's PID: _cleanup passes $newpmie to
# _wait_for_new_pmie after it restores the original hostname.
if [ -f $PCP_RUN_DIR/pmie.pid ]
then
    newpmie=`cat $PCP_RUN_DIR/pmie.pid`
    echo "current pmie PID $newpmie" >>$seq.full
else
    echo "$PCP_RUN_DIR/pmie.pid not found for new primary pmie"
    status=1
    exit
fi

# the replacement pmie must have its own tracking file, named after its
# PID
if [ ! -f $PCP_TMP_DIR/pmie/$newpmie ]
then
    echo "Arrgh: no tracking file in $PCP_TMP_DIR for new primary pmie"
    ls -l $PCP_TMP_DIR/pmie
    status=1
    exit
fi
$PCP_BINADM_DIR/pmiestatus $PCP_TMP_DIR/pmie/$newpmie \
| _filter

# success, all done
# (a bare exit: the exit trap runs _cleanup and exits with $status)
exit
