#!/usr/bin/ruby

# A command and control bot to manage a number of puppet daemons
# via the puppet agent for mcollective.  This daemon will ensure that
# puppet daemons run at a set interval and it will control the concurrency
# across the cluster as a whole.
#
# The object is to make best use of the puppetmaster resources and to provide
# a nice predictable utilization graphs of a master to assist in capacity planning.
#
# We also give priority to the Systems Administrator, if she runs an interactive
# puppetd --test the background ones will back off - assuming her run triggers
# the concurrency threshold.
#
# The concepts here was first introduced on a blog post below:
#
#    http://www.devco.net/archives/2010/03/17/scheduling_puppet_with_mcollective.php
#
# This code is released under the Apache 2 licence.
#
# See http://mcollective-plugins.googlecode.com/ for more information.


require 'mcollective'
require 'yaml'
require 'pp'
require 'logger'

include MCollective::RPC

# Precedence order of configs are:
# - defaults set here
# - settings sotred in the config file
# - settings passed on the command line
#
@config = { :interval => 30,
            :concurrency => false,
            :logfile => "/dev/null",
            :filter => "",
            :randomize => false,
            :daemonize => false}

if File.exist?("/etc/puppetcommander.cfg")
    begin
        config = YAML::load_file('/etc/puppetcommander.cfg')
        if config.keys == @config.keys
            @config = YAML::load_file('/etc/puppetcommander.cfg')
        else
            raise("Could not parse config, not all options given")
        end
    rescue Exception => e
        puts "Failed to load config file /etc/puppetcommander.cfg: #{e}"
        exit 1
    end
end

@options = rpcoptions do |parser, options|
    parser.define_head "Command and Control agent to schedule puppet runs in a cluster"

    parser.on('--interval [MINUTES]', '-i', 'Interval to run clients at') do |v|
        @config[:interval] = v.to_i
    end

    parser.on('--max-concurrent [ACTIVE]', '-m [ACTIVE]', 'Maximum run concurrency to allow') do |v|
        @config[:concurrency] = v.to_i
    end

    parser.on('--logfile [LOGFILE]', '-l [LOGFILE]', 'Log file to write') do |v|
        @config[:logfile] = v
    end

    parser.on("--daemonize", "-d", "Daemonizes the script") do |v|
        @config[:daemonize] = true
    end
end

@logger = Logger.new(@config[:logfile])

def log(msg)
    @logger.add(Logger::INFO) { msg }
rescue Exception => e
    puts "Could not log '#{msg}': #{e}"
end

def debug(msg)
    if @options[:verbose]
        @logger.add(Logger::DEBUG) { msg }
    end

rescue Exception => e
    puts "Could not log '#{msg}': #{e}"
end

# Does a SimpleRPC call and calculates the total amount of
# puppet daemons that are currently running a catalog
def concurrent_count
    debug("Getting puppet status")

    running = 0

    @puppet.status do |resp|
        begin
            running += resp[:body][:data][:running].to_i
        rescue Exception => e
            debug("Failed to get node status: #{e}, continuing")
        end
    end

    running
end

# This sends a request to a specific client only.
def run_client(client)
    log("Running agent for #{client}")

    @puppet.custom_request("runonce", {:forcerun => true}, client, {"identity" => client})
end

# This is the beef of things, we take a desired interval and
# maximum allowed concurrency and basically performs the following
# pseudo actions:
#
# - count the clients that match the supplied filter
# - figure out a desired sleep interval to run the number of clients
#   in the supplied maximum interval
# - optionally shuffle the list of nodes based on the :randomize config option
# - traverse the list of discovered clients alphabetically and does a
#   run of each one as long as the concurrency is below the limit.
# - after running the node we sleep for the remaining time of the sleep interval
# - once we've run all nodes, we rediscover and run again - this will pick
#   up new nodes and recover gracefully from network outages and such.
#
# TODO: Add exception handling.
def run(interval, concurrency)
    @puppet.reset

    log("Doing discovery start of run")
    Timeout::timeout(5) {
        @clients = @puppet.discover :verbose => false
    }
    log("Discovered #{@puppet.discover.size} nodes")

    unless @clients.nil? or @clients.size == 0
        sleeptime = interval * 60 / @clients.size

        log("Found #{@clients.size} puppet nodes, sleeping for ~#{sleeptime} seconds between runs")

        if @config[:randomize]
            @clients = @clients.sort_by { rand }
        else
            @clients.sort!
        end

        @clients.each do |client|
            starttime = Time.now.to_i

            begin
                cur_concurrency = concurrent_count
                log("Current puppetd's running: #{cur_concurrency}")

                if concurrency
                    if cur_concurrency < concurrency
                        run_client(client)
                    else
                        log("Puppet run for client #{client} skipped due to current concurrency of #{cur_concurrency}")
                    end
                else
                    run_client(client)
                end
            rescue Exception => e
                log("Runner raised an exception for client #{client}: #{e}")
                log(e)
            ensure
                sleeptime = (interval * 60 / @clients.size) - (Time.now.to_i - starttime)
                log("Sleeping for #{sleeptime} seconds")

                sleep(sleeptime > 0 ? sleeptime : 1)
            end
        end
    else
        log("No Puppet clients found.")
    end
end

def connect
    @puppet = rpcclient("puppetd", :options => @options)
    @puppet.progress = false

    # do some naive parsing of the filter, this is pretty poor and should be set using
    # the new in 1.1.0 MCOLLECTIVE_EXTRA_OPTS setting
    if @puppet.options[:filter]["fact"] == [] || @puppet.options[:filter]["cf_class"] == []
        @config[:filter].split(" ").each do |f|
            if f =~ /^(.+?)=(.+)/
                @puppet.fact_filter $1, $2
            else
                @puppet.class_filter f
            end
        end
    end
end

# starts the worker loop in the foreground
def foreground
    connect
    loop do
      run(@config[:interval], @config[:concurrency])
    end
end

# starts the worker loop in the background
def background
    pid = fork do
        Process.setsid
        exit if fork

        Dir.chdir('/tmp')
        STDIN.reopen('/dev/null')
        STDOUT.reopen('/dev/null', 'a')
        STDERR.reopen('/dev/null', 'a')

        File.open("/var/run/puppetcommander.pid", "w") do |pidfile|
            pidfile.puts $$
        end

        connect
        loop do
            begin
                log("Starting run: interval: #{@config[:interval]} concurrency: #{@config[:concurrency]}")
                run(@config[:interval], @config[:concurrency])
                log("Run ended, restarting")
            rescue Exception => e
                log("Runner raised an exception: #{e}")
                log(e)
                sleep 5
                retry
            end
        end
    end

    Process.detach(pid)
end

log("Looping clients with an interval of #{@config[:interval]} minutes")
log("Restricting to #{@config[:concurrency]} concurrent puppet runs") if @config[:concurrency]

["TERM", "HUP"].each do |sig|
    Signal.trap(sig) do
        exit!
    end
end

if @config[:daemonize]
    background
else
    foreground
end
