#!/bin/bash
#set -vx
set -e

# Locate the Lustre tree (parent of this script's directory unless LUSTRE is
# already set), load the test framework and the per-configuration settings.
LUSTRE=${LUSTRE:-$(cd "$(dirname "$0")/.."; echo "$PWD")}
. "$LUSTRE/tests/test-framework.sh"
init_test_env "$@"
. "${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}"

# A bare assignment takes the exit status of its command substitution, so
# with `set -e` a failing `which` would abort the script before the message
# below could be printed.  Swallow the status and test the result instead.
racer=$(which racer.sh) || true
if [ -z "$racer" ]; then
	echo racer is not installed
	exit 1
fi

# Defaults: run on the local host only, for two minutes, inside $DIR/racer.
CLIENTS=${CLIENTS:-$HOSTNAME}
RDIR=$DIR/racer
mkdir -p "$RDIR"
DURATION=${DURATION:-120}

assert_env CLIENTS

# Arm a one-shot watchdog: after $1 seconds a background job sends SIGALRM
# to this shell ($$).  The job's PID is stored in the global TIMERPID and
# reported on stdout.
timer_on () {
	local delay=$1

	{ sleep $delay && kill -s ALRM $$; } &
	TIMERPID=$!
	echo "TIMERPID=$TIMERPID"
}

# Wait for the processes that mention $RDIR to disappear from every client in
# $CLIENTS, then send SIGTERM to whatever is left.
#
# Globals read:    CLIENTS, RDIR, racer
# Globals written: RC is set to 1 when ps still reports a process after it
#                  was sent SIGTERM.
# Side effects:    resets the trap on signal 0 (EXIT).
do_racer_cleanup () {
	trap 0

	local WAIT=0
	local INTERVAL=5
	local pids
	local running
	local C
	local pid
	local rc=0
	local clients=$CLIENTS

	echo "DOING RACER CLEANUP ... "

	# 1. Give racer a chance to kill all of its processes.
	# FIXME: not sure how long it takes racer to kill all processes;
	# 80 sec is sometimes enough for 2 clients, sometimes it takes more
	# than 150 sec.
	while [ $WAIT -lt 90 ]; do
		# "|| true": grep -v exits non-zero when nothing is left, which
		# is the success case here and must not trip `set -e`.
		running=$(do_nodes $clients "ps uax | grep $RDIR " | grep -E -v "(acceptance|grep|pdsh|bash)" || true)
		[ -z "$running" ] && rc=0 && break
		echo "clients $clients are still running the racer processes. Waited $WAIT secs"
		# Unquoted on purpose of keeping the log format: word splitting
		# folds the multi-line ps output onto a single line.
		echo $running
		rc=1
		# Back off: double the polling interval until it reaches 40.
		[ $INTERVAL -lt 40 ] && INTERVAL=$((INTERVAL + INTERVAL))
		sleep $INTERVAL
		WAIT=$((WAIT + INTERVAL))
	done

	# 2. Kill the remaining processes
	if [ $rc -ne 0 ]; then
		for C in ${clients//,/ } ; do
			pids=$(do_node $C "ps uax | grep $RDIR " | grep -E -v "(acceptance|grep|PATH)" | awk '{print $2}' || true)
			if [ ! -z "$pids" ]; then
				echo "client $C still running racer processes after $WAIT seconds. Killing $pids"
				# Diagnostic listing only.  The processes may have
				# exited since $pids was collected, in which case
				# grep -v finds nothing and exits non-zero; that
				# must not abort the cleanup under `set -e`.
				do_node $C "ps uax | grep $RDIR " | grep -E -v "(acceptance|grep|PATH)" || true
				do_node $C kill -TERM $pids || true
				# let the processes die
				sleep 2
				# 3. Check if the processes were killed;
				# flag an error (RC=1) if any still exists.
				# NOTE(review): `-P` is kept from the original; if
				# a plain PID lookup was intended this may need to
				# be `-p` -- confirm against the clients' ps.
				for pid in $pids; do
					do_node $C "ps -P $pid" && RC=1 || true
				done
			else
				echo "All processes on client $C exited after $WAIT seconds. OK."
			fi
		done
	else
		echo "No racer processes running after $WAIT seconds. OK."
		wait_remote_prog $racer 10
	fi
}

# Stop whichever of the two background jobs is still pending, then run
# do_racer_cleanup.
#
# Globals read: timeout   - "timeout" when called because the timer fired
#               RACERPID  - PID of the background racer launcher
#               TIMERPID  - PID of the background timer job
#               DURATION  - only used in the log message
racer_cleanup () {
	if [ "$timeout" == "timeout" ]; then
		echo $timeout killing RACERPID=$RACERPID
		kill $RACERPID || true
		sleep 2	# give racer a chance to kill its processes
	else
		echo "Racer completed before DURATION=$DURATION expired. Cleaning up..."
		# The timer job may already be gone.  Without "|| true" a failed
		# kill would abort the script under `set -e` before the cleanup
		# below runs; this mirrors the handling of RACERPID above.
		kill $TIMERPID || true
	fi
	do_racer_cleanup
}

# Timeout path: mark the run as timed out (global `timeout`), run
# racer_cleanup, report the result and exit with status $RC.
# RC is read only after the cleanup has run.
racer_timeout () {
	timeout=timeout
	racer_cleanup

	echo "$0: completed $RC"
	exit $RC
}

# Launch racer on every client and wait for it, bounded by a watchdog timer.
log "Start racer on clients: $CLIENTS DURATION=$DURATION"
RC=0

# timer_on signals this shell with SIGALRM when the time is up;
# racer_timeout then cleans up and exits the script.
trap racer_timeout ALRM

# Allow racer five seconds beyond its own DURATION before the timer fires.
timer_on $((DURATION + 5))

do_nodes $CLIENTS "DURATION=$DURATION $racer $RDIR" &
RACERPID=$!
echo "RACERPID=$RACERPID"

# A non-zero status from the racer launcher is recorded as RC=2.
if ! wait $RACERPID; then
	RC=2
fi

racer_cleanup
echo "$0: completed $RC"
exit $RC
