
#include <stdlib.h>
#include <stdio.h>
#include <string.h>    // for memset
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <asm/atomic.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <syslog.h>
#include <signal.h>    // for kill
#include <netinet/in.h>
#include "lock.h"
#include "list.h"
#include "hash.h"
#include "select.h"

#define MY_NAME "nbd-shmem"

#include "cliserv.h"
#include "shmem.h"
#include "time.h"

//#undef DEBUG
//#define DEBUG 1
/*
 * Convert DEBUG from a numeric build setting into a printf-style macro.
 * When DEBUG was defined with a value greater than zero, DEBUG(fmt, ...)
 * prints to stderr; otherwise DEBUG(...) expands to nothing, so its
 * arguments are never evaluated.
 */
#if DEBUG > 0
  #undef DEBUG
  #define DEBUG(s...) fprintf(stderr,s)
#else
  #undef DEBUG
  #define DEBUG(s...) 
#endif

/*
 * Publish our sequence number in the shared area if it is ahead of the
 * one stored there.
 *
 * The incoming seqno is first truncated to 24 bits. Under data->lock the
 * stored value is replaced when it is smaller than ours, or when it is
 * more than 2^23 above ours (treated as our counter having wrapped).
 * On replacement the timestamp data->tv is refreshed and, provided the
 * notification socket pair is usable (data->err == 0), one byte is
 * written to data->fd[0]; data->signals counts the bytes that were
 * successfully written.
 *
 * A NULL data pointer is silently ignored.
 */
static void
maybe_update_seqno_ondisk (struct nbd_shmem *data, unsigned seqno)
{
        static char wakeup_byte;        // content is irrelevant, only its arrival
        unsigned stored;
        int ours_is_newer, ours_has_wrapped;

        if (!data)
                return;

        // keep only the low 24 bits of the incoming seqno
        seqno &= (1<<24) - 1;

        data->lock.down(&data->lock);

        stored = data->seqno;
        ours_is_newer    = stored < seqno;
        ours_has_wrapped = seqno + (1<<23) < stored;

        if (!ours_is_newer && !ours_has_wrapped) {
                // nothing to publish
                DEBUG ("server does NOT write req seqno %d (ondisk +%d)\n",
                            seqno, seqno - stored);
                data->lock.up(&data->lock);
                return;
        }

        // report a jump of more than one over a non-zero stored value
        if (seqno > stored+1 && stored > 0) {
                PERR ("server writes req seqno %d (ondisk +%d)\n",
                    seqno, seqno - stored);
        }

        data->seqno = seqno;
        mygettimeofday(&data->tv,NULL);

        // wake any waiter, counting only bytes that really went out
        if (!data->err && write(data->fd[0],&wakeup_byte,1) > 0) {
                data->signals++;
        }

        data->lock.up(&data->lock);
}

/*
 * timer_add(tv, usec): advance the struct timeval pointed to by tv by
 * usec microseconds, folding whole seconds out of tv_usec into tv_sec.
 * Both arguments are evaluated exactly once.
 */
#define timer_add(tv,usec) \
        do { \
                struct timeval * _tv = (tv); \
                long _secs; \
                _tv->tv_usec += (usec); \
                _secs = _tv->tv_usec / 1000000; \
                _tv->tv_sec  += _secs; \
                _tv->tv_usec -= _secs * 1000000; \
        } while (0)

  /*
   * Single, non-waiting check of whether the shared sequence number has
   * caught up with ours.
   *
   * seqno   our sequence number (truncated here to 24 bits)
   * mdelay  caller's delay in milliseconds; stored data older than ten
   *         times this is treated as abandoned
   *
   * Returns 0 when the caller may proceed: the stored seqno is zero, is
   * at least seqno-1, or its timestamp is older than 10*mdelay ms.
   * Returns -ETIME when the caller should keep waiting.
   *
   * Side effect: pending notification bytes on data->fd[1] are consumed
   * under the lock so that data->signals tracks what is still queued.
   */
  static int
  try_seqno_arrived (struct nbd_shmem *data, unsigned seqno, unsigned mdelay) {

        unsigned data0 = 0;
        struct timeval tv0, tv1;
        char c;

        // restrict incoming seqno to 24 bits
        seqno &= (1<<24) - 1;

        mygettimeofday(&tv1, NULL);

        data->lock.down(&data->lock);
        // soak up any pending notifications
        if (!data->err) {
            while (data->signals > 0) {
                ssize_t got = read(data->fd[1], &c, 1);

                if (got > 0) {
                    data->signals--;
                    continue;
                }
                if (got < 0 && errno == EINTR) {
                    // interrupted before anything was read: retry
                    continue;
                }
                // EOF, EAGAIN or a hard error: nothing more can be read
                // now. Stop instead of spinning forever with the lock
                // held; the counter is left alone.
                break;
            }
        }
        data0 = data->seqno;
        tv0   = data->tv;
        data->lock.up(&data->lock);

        // Cope with generations rather brusquely.
        // NOTE(review): while both values stay below 2^24 the middle
        // clause is implied by the first one; kept as written - confirm
        // what wrap-around case it was meant to catch.
	if (data0+1 >= seqno || data0+1 >= (1<<23) + seqno || data0 == 0) {
                        return 0;
	}

        timer_add(&tv0, mdelay*1000*10);

        if (timercmp(&tv0,&tv1,<)) {
                // stored data is old, so give up with success
                return 0;
        }

        return -ETIME;
  }     


/*
 * Block until the shared sequence number has caught up with ours (see
 * try_seqno_arrived for the exact release condition) or until mdelay
 * milliseconds have elapsed.
 *
 * Between checks we sleep for at most 10ms: on the notification socket
 * when it is usable (data->err == 0), otherwise with a plain sleep.
 *
 * Returns 0 on release, -ETIME on timeout, -EINVAL if the clock cannot
 * be read.
 */
  static int
  maybe_wait_for_seqno (struct nbd_shmem *data, unsigned seqno, unsigned mdelay) {

        const int poll_usec = 10000;    // 10ms between checks
        struct timeval deadline;

        // deadline = now + mdelay milliseconds
        mygettimeofday(&deadline, NULL);
        timer_add(&deadline, mdelay * 1000);

        for (;;) {
                struct timeval now;
                fd_set readable;

                if (try_seqno_arrived(data, seqno, mdelay) == 0)
                        return 0;

                // have we run past the deadline?
                if (mygettimeofday(&now,NULL) < 0)
                        return -EINVAL;
                if (timercmp(&now,&deadline,>))
                        return -ETIME;

                if (data->err) {
                        // no socket pair available: plain sleep
                        microsleep (poll_usec);
                        continue;
                }

                // wait for a notification byte or the poll interval
                FD_ZERO(&readable);
                FD_SET(data->fd[1], &readable);
                microselect (data->fd[1]+1, &readable, NULL, NULL, poll_usec);
        }

  }     

  /*
   * Key identifying one request in the shared hash: its sequence number
   * together with its handle. The whole struct (sizeof) is passed to the
   * hash as the key, so the alignment padding introduced by `dummy` is
   * part of the key bytes.
   */
  struct shmem_hash_key {
      unsigned seqno;   // request sequence number
      u32      handle;  // request handle
      unsigned dummy[0] __attribute__ ((aligned(sizeof(struct list_head))));  // to give space for linking
  };
  /*
   * Value stored in the shared hash for a request: who is handling it,
   * until when that claim holds, and how far it has got.
   */
  struct shmem_hash_data {
      struct timeval timeout;   // absolute time after which the claim lapses
      int pid;                  // process id of the current handler
      int state;                // NBD_SHMEM_REQ_* bits
      unsigned dummy[0] __attribute__ ((aligned(sizeof(struct list_head))));  // to give space for linking
  };

  /*
   * Stand-in for lock_req that claims nothing and always reports
   * success (0). All arguments are ignored.
   */
  static int
  lock_req_dummy(struct nbd_shmem *data, u32 handle, unsigned seqno, unsigned mtimeout) {
      (void) data;
      (void) handle;
      (void) seqno;
      (void) mtimeout;
      return 0;
  }

  static int
  hash_data_is_old (void *d)
  {
      struct shmem_hash_data *c = (struct shmem_hash_data *) d;
      struct timeval tv;
      mygettimeofday(&tv, NULL);
      if ((c->state & NBD_SHMEM_REQ_DONE)
	    || timercmp (&c->timeout, &tv, >=)) {
		return 1;
      }
      return 0;
  }

  /*
   * Claim the request (handle, seqno) for this process in the shared
   * hash, for mtimeout milliseconds.
   *
   * Returns
   *   -EALREADY     the request is already marked done
   *   -EINVAL       the request is already marked done, with error
   *   -EINPROGRESS  another live process holds an unexpired claim
   *   otherwise     the result of storing our claim (negative on failure)
   *
   * A claim whose timeout has passed, or whose owner no longer exists,
   * is taken over. data->lock is held for the whole fetch/decide/store
   * sequence and released on every return path.
   */
  static int
  lock_req(struct nbd_shmem *data, u32 handle, unsigned seqno, unsigned mtimeout) {

      int err;
      struct nbd_hash * hash = &data->hash;

      struct shmem_hash_data content;
      struct shmem_hash_key key;

      struct hash_datum hdata;
      struct hash_datum hkey;

      struct timeval tv;

      // The key is handed over as sizeof(key) raw bytes, which includes
      // alignment padding; clear it so equal requests give equal bytes.
      memset(&key, 0, sizeof(key));
      key.seqno  = seqno;
      key.handle = handle;
      // likewise avoid copying stack garbage into the shared store
      memset(&content, 0, sizeof(content));

      mygettimeofday(&tv, NULL);

      hkey.dsize = sizeof(key);
      hkey.dptr  = (char *)&key;

      data->lock.down(&data->lock);
      DEBUG("fetch for key %d\n", seqno);
      hdata = hash->fetch(hash, hkey);

      if (hdata.dptr) {
          // had an entry - read it.
          struct shmem_hash_data * c = (struct shmem_hash_data *)hdata.dptr;
          if (c->state & NBD_SHMEM_REQ_DONE) {
              // done already! Pick the result while still locked.
              err = (c->state & NBD_SHMEM_REQ_ERROR) ? -EINVAL : -EALREADY;
              PERR("key %d has already been handled OK!\n", seqno);
              data->lock.up(&data->lock);
              return err;
          }
          if (timercmp(&c->timeout, &tv, >)) {
               // Claim has not expired. kill(pid, 0) succeeds when the
               // process exists; EPERM also means it exists but is not
               // ours to signal.
               if (kill(c->pid, 0) == 0 || errno == EPERM) {
                   // owned by a live pid
                   PERR("key %d is still being handled by pid %d!\n",
                           seqno, c->pid);
                   data->lock.up(&data->lock);
                   return -EINPROGRESS;
               } else {
                   PERR("req %d being handled by stale pid %d, taking it ...\n",
                           seqno, c->pid);
                   // drop through
               }
          } else {
              PERR("req %d is being handled by slow pid %d, taking it on ...\n",
                      seqno, c->pid);
              // drop through
          }
      } else {
          DEBUG("req %d is not yet being handled, taking it on ...\n", seqno);
      }

      // set the timeout: now + mtimeout milliseconds
      tv.tv_usec += 1000 * mtimeout;
      if (tv.tv_usec >= 1000000) {
          int carry = tv.tv_usec / 1000000;
          tv.tv_usec -= carry * 1000000;
          tv.tv_sec  += carry;
      }
      content.timeout = tv;
      content.pid = getpid();
      content.state = NBD_SHMEM_REQ_STARTED;

      // we have the right to make it ours
      hdata.dsize = sizeof(content);
      hdata.dptr  = (char *)&content;

      DEBUG("store state 1 (being handled) for key %d\n", seqno);
      err = hash->store(hash, hkey, hdata, NBD_HASH_REPLACE);
      if (err < 0) {
          PERR("store for key %d failed with %d entries in cache\n",
                  seqno, hash->count);
      } 
      // release the lock taken above; every other exit already does
      data->lock.up(&data->lock);
      return err;
  }

  /*
   * Stand-in for unlock_req that records nothing and always reports
   * success (0). All arguments are ignored.
   */
  static int
  unlock_req_dummy(struct nbd_shmem *data, u32 handle, unsigned seqno) {
      (void) data;
      (void) handle;
      (void) seqno;
      return 0;
  }

  /*
   * Mark the request (handle, seqno) as done in the shared hash.
   *
   * Returns
   *   0             no entry exists for the request (nothing to do)
   *   -EALREADY     the entry is already marked done
   *   -EINPROGRESS  a different, live process holds an unexpired claim
   *   otherwise     the result of storing the "done" record
   *
   * If the entry belongs to another process whose claim has expired or
   * which no longer exists, we take it over and mark it done ourselves.
   * data->lock is held across the fetch/decide/store sequence.
   */
  static int
  unlock_req(struct nbd_shmem *data, u32 handle, unsigned seqno) {

      int err;
      struct nbd_hash * hash = &data->hash;
      struct shmem_hash_data content;
      struct shmem_hash_key  key;
      struct hash_datum hdata;
      struct hash_datum hkey;
      struct timeval tv;
      struct shmem_hash_data * c;
      int pid;

      // The key is handed over as sizeof(key) raw bytes, which includes
      // alignment padding; clear it so equal requests give equal bytes.
      memset(&key, 0, sizeof(key));
      key.seqno  = seqno;
      key.handle = handle;

      mygettimeofday(&tv, NULL);

      hkey.dsize = sizeof(key);
      hkey.dptr  = (char *)&key;

      data->lock.down(&data->lock);
      DEBUG("fetch for key %d\n", seqno);
      hdata = hash->fetch(hash, hkey);

      if (!hdata.dptr) {
          // didn't have an entry! Must have left the cache.
          data->lock.up(&data->lock);
          return 0;
      }

      // had an entry - read it.
      c = (struct shmem_hash_data *)hdata.dptr;
      if (c->state & NBD_SHMEM_REQ_DONE) {
          // done already!
          data->lock.up(&data->lock);
          PERR("somebody already handled req %d OK, erroring req!\n", seqno);
          return -EALREADY;
      }
      pid = getpid();
      if (c->pid != pid) {
          // we're not handling it. Is the claim still within its time?
          if (timercmp(&c->timeout, &tv, >)) {
              // It is. Is the owner still alive? kill(pid, 0) succeeds
              // when the process exists; EPERM also means it exists.
              if (kill(c->pid, 0) == 0 || errno == EPERM) {
                  // it is. Abandon. Copy the pid out before unlocking,
                  // as the entry may change once the lock is dropped.
                  int owner = c->pid;
                  data->lock.up(&data->lock);
                  PERR("req %d being handled by pid %d, erroring req!\n",
                          seqno, owner);
                  return -EINPROGRESS;
              } else {
                  PERR("req %d being handled by stale pid %d, taking over ...!\n",
                          seqno, c->pid);
              }
          } else {
              PERR("req %d being handled by slow pid %d, taking over ...!\n",
                          seqno, c->pid);
          }
      }

      // we get to say we did it.
      content = *c;
      content.timeout = tv;
      content.pid = pid;
      content.state = NBD_SHMEM_REQ_DONE;

      hdata.dsize = sizeof(content);
      hdata.dptr  = (char *)&content;

      DEBUG("storing state 0 (done) for req %d\n", seqno);
      err = hash->store(hash, hkey, hdata, NBD_HASH_REPLACE);
      data->lock.up(&data->lock);
      return err;
  }

  /*
   * Initialise the shared-memory control block *m.
   *
   * size       total size of the area that starts at m
   * mfd, off   file descriptor and offset passed on to the lock and
   *            hash initialisers; the lock and the hash are addressed as
   *            off plus their byte offset inside *m
   * cache_lim  passed to init_hash as its limit; when > 0 the real
   *            lock_req/unlock_req are installed and NBD_SHMEM_CACHE is
   *            set in m->flags, otherwise the no-op dummies are used
   *
   * A non-blocking UNIX socket pair is created for seqno notifications.
   * If it cannot be created, or cannot be made non-blocking, m->err is
   * left non-zero and the seqno routines fall back to timed polling.
   */
  void
  init_shmem(struct nbd_shmem *m, int size, int mfd, int off, int cache_lim) {
      int offset, dsize;
      init_lock(&m->lock, mfd, off + ((char *)&m->lock - (char *)m));
      m->size = size;
      m->mfd  = mfd;
      m->off  = off;
      m->seqno = 0;
      m->flags = 0;
      // Counter of undelivered notification bytes; it must start at zero
      // because the socket pair created below starts out empty.
      m->signals = 0;
      m->wait_seqno_timeout = maybe_wait_for_seqno;
      m->update_seqno = maybe_update_seqno_ondisk;
      m->err = socketpair(PF_UNIX,SOCK_STREAM,0,m->fd);
      if (!m->err) {
          int i;
          for (i = 0; i < 2; i++) {
              // add O_NONBLOCK to the existing flags rather than
              // replacing them
              int fl = fcntl(m->fd[i], F_GETFL);
              if (fl < 0 || fcntl(m->fd[i], F_SETFL, fl | O_NONBLOCK) < 0) {
                  m->err = -1;
                  break;
              }
          }
          if (m->err) {
              // A blocking descriptor could stall a reader that holds
              // the lock, so give up on socket notifications entirely.
              close(m->fd[0]);
              close(m->fd[1]);
          }
      }
      if (cache_lim > 0) {
          // purely for information
          m->flags |= NBD_SHMEM_CACHE;
      }
      // this is where the hashdata area starts
      offset = (char *)&m->hash - (char *)m;
      dsize  =  size - offset;
      dsize  -= 128 * (2 * sizeof(char*)); // hash buckets
      init_hash(&m->hash, dsize, mfd, off + ((char *)&m->hash - (char *)m),
                  cache_lim, NBD_HASH_MODE_LIMIT);
      m->hash.register_shrink_test(&m->hash, hash_data_is_old);
      if (cache_lim > 0) {
          m->lock_req = lock_req;
          m->unlock_req = unlock_req;
      } else {
          m->lock_req = lock_req_dummy;
          m->unlock_req = unlock_req_dummy;
      }
  }

