/*
 * (Enhanced) Network block device - make block devices work over TCP
 *
 * Original NBD Copyright 1997 Pavel Machek <pavel@elf.mj.gts.cz>
 * Further ENBD Copyrights 1998, 1999, 2000 Peter Breuer <ptb@it.uc3m.es>
 *
 *
 *
 * ATTENTION: You need the userspace daemons available from
 *            ftp://oboe.it.uc3m.es/pub/Programs/nbd-2.4.*.tgz
 *            and/or the ENBD project on http://freshmeat.net
 *
 *
 *
 * Development of the ENBD software has been supported by grants and
 * contributions from Realm Information Technologies, Inc. of 5555
 * Oakbrook Parkway, NW Norcross, GA and iNsu Innovations Inc.  of
 * 3465, Boulevard Thimens, Saint-Laurent, Quebec, Canada.
 * 
 * ------------ Pavel's history notes ----------------------------------
 * 97-3-25 compiled 0-th version, not yet tested it 
 *   (it did not work, BTW) (later that day) HEY! it works!
 *   (bit later) hmm, not that much... 2:00am next day:
 *   yes, it works, but it gives something like 50kB/sec
 * 97-3-28 it's completely strange - when using 1024 byte "packets"
 *   it gives 50kB/sec and CPU idle; with 2048 bytes it gives
 *   500kB/sec (and CPU loaded 100% as it should be) (all done
 *   against localhost)
 * 97-4-1 complete rewrite to make it possible for many requests at 
 *   once to be processed
 * 97-4-1 23:57 rewrite once again to make it work :-(
 * 97-4-3 00:02 hmm, it does not work.
 * 97-4-3 23:06 hmm, it will need one more rewrite :-)
 * 97-4-10 It looks like it's working and stable. But I still do not
 *  have any recovery from lost connection...
 * (setq tab-width 4)
 * 97-4-11 Making protocol independent of endianity etc.
 * 97-4-15 Probably one more rewrite, since it loses requests under
 *  heavy loads
 * 97-9-13 Cosmetic changes
 *
 * possible FIXME: make set_sock / set_blksize / set_size / do_it one syscall
 * why not: would need verify_area and friends, would share yet another 
 *          structure with userland
 *
 * FIXME: not module-safe
 *
 * ------------ Peter's history notes ----------------------------------
 * 98-12-18 modules now OK ptb@it.uc3m.es (Peter Breuer) ported to
 * 2.0.*. + better debugging. Still possible lockup in connection with APM
 * and spurious interrupt - only on write. Error treatment should
 * be improved. After 100 errors from end_request the kernel can
 * do anything. We should catch it ourselves.
 * 99-1-sometime fixed lockup by extending semaphore - ptb v1.0.
 * 99-3-sometime reconnect protocol (client mod agreed by pavel) - ptb v1.1
 * 99-4-25 add /proc/nbdinfo - ptb v1.1.1
 * 99-4-sometime add multiplex - ptb v1.2
 * 99-4-26 fix multiplex and redundancy - ptb v1.2.1
 * 99-4-29 reentrant client threads - ptb v1.2.2
 * 99-4-29 socket related stuff placed in user space - amarin v1.3.0
 * 99-5-3  fix all, all writes had to be before all reads - ptb v1.2.4
 * 99-5-5  fix out-of-order, async - ptb v1.2.5
 * 99-5-7  semaphores removed (still works!), fail cases corrected - ptb v1.2.6
 * 99-5-12 signals unblocked in xmit, blksize != 1024 fixed, ioctls
 *         added  - ptb v1.2.7
 * 99-6-1  interaction with client split into two functions - amarin v1.3.0
 * 99-6-3  reintegrated fully, mem manager fixed, accounts fixed - ptb v1.2.8.3
 * 99-6-3  extra queue removed, mem manager removed  - ptb v1.2.8.4
 * 99-7-3  buffer registration introduced - ptb v1.2.8.5
 * 99-7-3  some client redundancy reestablished - ptb v2.1.1
 * 99-7-10 encapsulated queue calls. One element rollback buffer - ptb v2.1.2
 * 99-7-20 timestamp and rollback old abandoned request - ptb v2.1.3
 * 99-7-24 64bit file sizes and offsets accepted - ptb v2.1.9
 * 99-7-26 experimental request coalesces - ptb v2.1.10
 * 99-7-27 partitioning scheme - ptb v2.2.1
 * 99-8-3  enbd_clr_sock bug in invalidate_device fixed? - ptb v2.2.4
 * 99-8-5  reverse replace of block_fsync, add sig ioctls - ptb v2.2.5
 *         reverse bug introduced about v2.2.3 for compound reqs - ptb v2.2.5
 *         fix clear_que bug (didn't rollback first) from 2.1.3 - ptb v2.2.5
 * 99-8-22 workaround strange nr_sectors bug - ptb v2.2.6
 * 99-8-11 fix MY_NBD_SYNC bug. Never sync'ed all - ptb v2.2.7
 * 99-8-12 wakeups all moved to enqueue - ptb v2.2.7
 * 99-8-23 remove slot->cli_age - ptb v2.2.7
 * 99-8-24 first 8 bytes of signature embedded in packets - ptb v2.2.8
 *         fix SET_SIG define buglet, remove hardcoded constants - ptb v2.2.8
 *         fix huge bug. Missing copy_fromfs in my_nbd_ack - ptb v2.2.8     
 *         removed signature embedding and all other decorations - ptb v2.2.8
 * 99-8-25 recast fix in my_nbd_ack to avoid align. bug - ptb v2.2.9
 *         put in MKDEVs and put back some hardcode const fixes - ptb v2.2.10
 * 99-9-29 fix BLKGETSIZE bug - ptb v2.2.14
 * 99-10-2 run with interrupts on throughout. Think we lose some - ptb v2.2.15
 * 99-10-8 trim dead code, kernel 2.2 ifdef's - ptb v2.2.17
 * 99-12-18 further o-o - ptb v2.2.19
 * 99-12-28 queue account cleanup. endio on queue reqs at reset - ptb v2.2.20
 *          interruptible semaphores for better client recovery - ptb v2.2.20
 * 00-1-2   debugging cleanups. Fix race in end_request - ptb v2.2.21
 * 00-1-4   semaphores simplified. - ptb v2.2.22
 * 00-6-8   emergency control by write to proc - ptb v2.2.24
 * 00-7-20  ported to 2.4.0-test1. Possible minor bugs found/fixed - ptb v2.2.24
 * 00-7-27  changed proc i/f to read_proc from get_info in 2.2/2.4 - ptb v2.2.25
 * 00-7-30  fixed reads before writes under 2.4 by disabling merge - ptb v2.2.25
 * 00-7-30  and fixed merge_reqs for 2.4, now that I understand! - ptb v2.2.25
 * 00-7-30  fixed/introduced possible bug in end_io  for 2.2/2.4 - ptb v2.2.25
 * 00-7-30 added timeval/zone field in requests and replies - ptb v2.4.0
 * 00-7-30 fixed hitherto masked bug in read_stat in enbd_client - ptb v2.4.0
 * 00-7-30 added timeout to net writes in enbd_client - ptb v2.4.0
 * 00-8-20 display fix for devices over 2GB - ptb v2.4.5
 * 00-8-23 more 64 bit fixes + error out overrange requests- ptb v2.4.6/2.2.27
 * 00-8-31 add ENBD_ERR ioctl to error out slot request- ptb v2.4.9
 * 00-8-31 soften ENBD_SOFT_RESET so doesn't wreck protocol - ptb v2.4.9
 * 00-9-1  remove %L's from printfs. Kernel 2.2. doesn't - ptb v2.4.10/2.2.27
 * 00-9-6  add various state flags to help init order - ptb v2.4.11
 * 00-9-8  add checks for device initialised to set_sock - ptb v2.4.12
 * 00-9-17 en/disable device as aslot count goes through 0 - ptb v2.4.13/2.2.28
 * 00-9-21 split read/write dev req counts for accounting - ptb v2.4.14
 * 00-9-21 renamed sync_intvl to req_timeo - ptb v2.4.14
 * 00-9-21 made sync_intvl count write blocks - ptb v2.4.14
 * 00-9-22 repair enable after delayed disable when disabled - ptb v2.4.14
 * 00-9-22 include sync (nonblocking) after sync_intvl reqs - ptb v2.4.14
 * 00-9-25 disable sync (nonblocking) after sync_intvl reqs - ptb v2.4.14
 * 00-9-25 bundle invalidate_buffers in clr_sock - ptb v2.4.14
 * 00-10-20 implement req_timeo per device + ioctl (Wang Gang) - ptb v2.4.15
 * 00-10-20 add raid mode (Wang Gang) - ptb v2.4.15
 * 00-10-26 throttle in do_req  - ptb v2.4.15
 * 00-10-28 do set_sock on first open and clr_sock on last close - ptb v2.4.15
 * 00-11-01 make sync_intvl really sync - ptb v2.4.15
 * 00-11-14 rename throttle to plug, enbd_sync takes arg - ptb v2.4.17
 * 00-11-19 clr_sock errs req not rollback if show_errs & !aslot - ptb v2.4.17
 * 00-11-20 removed autodeadlock when disabled in do_req end_req - ptb v2.4.17
 * 00-11-21 make MY_NBD_SYNC only sync when sync_intvl > 0 - ptb v2.4.17
 * 00-12-24 make MY_NBD_GET_REQ use a timeout arg - ptb v2.4.18
 * 01-02-12 ported to 2.4.0 (works). do_nbd_request rewritten - ptb v2.4.20
 * 01-02-20 managed to get plugging and clustered read/writes OK - ptb v2.4.21
 * 01-02-21 eliminated slot->buflen for the time being - ptb v2.4.21
 * 01-02-27 added proper devfs support - ptb v2.4.22
 * 01-03-15 allowed more devices/in devfs, cleaned up endio - ptb v2.4.23
 * 01-03-15 added device letter (<= 3 chars) to struct-  - ptb v2.4.23
 * 01-03-15 added request size check to do_nbd_req - ptb v2.4.23
 * 01-03-15 increased MAX_SECTORS to 512 by default - ptb v2.4.23
 * 01-03-15 made major number a module parameter - ptb v2.4.23
 * 01-03-18 added max_sectors array - ptb v2.4.23
 * 01-03-23 added devfs links - ptb v2.4.23
 * 01-04-17 plugging always enabled for 2.4 kernels - ptb v2.4.24
 * 01-04-17 made SET_RO set_device_ro as well as set local flags - ptb v2.4.25
 * 01-04-28 impl SET_MD5SUM ioctl and proc support for md5sum - ptb v2.4.25
 * 01-04-29 added accounting for md5'd reqs - ptb v2.4.25
 * 01-07-29 added atomic protections for accounting - ptb v2.4.25
 * 01-08-01 fixed 2.4 smp bugs. Interrupts off in spinlocks - ptb v2.4.25
 * 01-08-01 removed all semaphores for spinlocks - ptb v2.4.25
 * 01-08-01 invalidate_buffers in clr_sock (req'd Rogier Wolff) - ptb v2.4.25
 * 01-08-02 fixed smp deadlock - end_that_request_first slept! ptb v2.4.26
 * 01-10-16 provisionally added error in device open when notenabled ptb v2.4.27
 * 01-10-18 added DIRTY flag to save on repeated invalidate_buffers ptb v2.4.27
 * 01-10-31 increment seqno_out before delivery, so really starts at 1  v2.4.27
 * 01-11-01 move zeroing of seqno in cmd field to nbe_end_req* ptb v2.4.27
 * 01-11-18 add speed calculation, dev fields, display in proc ptb v2.4.27
 * 01-11-20 modifications for compiling into monolithic kernel ptb v2.4.27
 * 01-12-06 clr requests before reenabling, not after, in enbd_enable ptb 2.4.27
 * 02-02-21 make enbd_rollback modal, absorbing enbd_error ptb 2.4.27
 * 02-08-08 added local BLKSSZGET (reject) and related ioctls ptb 2.4.30
 * 02-08-12 make enbd_ack not ruin req when its rolled back already ptb 2.4.30
 * 02-09-18 fix __FUNCTION__ for new gcc ptb 2.4.30
 * 02-09-18 always allow daemon death even with reqs waiting ptb 2.4.30
 * 02-09-18 eliminate SYNC_REQD, RLSE_REQD ptb 2.4.30
 * 02-09-18 eliminate speed_lim ptb 2.4.30
 * 02-09-18 fix countq accounting ptb 2.4.30
 * 02-09-18 encapsulate remote ioctl handling ptb 2.4.30
 * 02-09-18 remote ioctl uses kernel req, not our fake one ptb 2.4.30
 * 02-09-18 eliminated ctldta use (too much tricky logic) ptb 2.4.30
 * 02-09-28 handle req specials ptb 2.4.30
 * 02-10-10 introduce DIRECT flag ptb 2.4.30
 * 02-10-13 rollback pushes reqs to local queue, not queues them! ptb 2.4.30
 * 02-10-13 add hooks for separate ioctl module  ptb 2.4.30
 * 02-10-16 take set_sock out of open. Put pid check in handshake  ptb 2.4.30
 * 02-10-16 define MY_NBD_GET_NPORT ioctl ptb 2.4.30
 * 02-10-18 remove wait from MY_NBD_SYNC ioctl ptb 2.4.30
 * 02-10-20 rollback adds requests to queue in seqno order ptb 2.4.30
 * 02-10-23 introduce and use pid_sem instead of req_sem ptb 2.4.30
 * 02-10-30 support client fallback to ioctls on whole disk ptb 2.4.30
 * 02-11-3  moved set INITIALISED up to coincide with setting inode ptb 2.4.30
 * 02-11-3  add media check and revalidate routines ptb 2.4.30
 * 02-11-4  encapsulate lives++ and ENABLED changes into enbd_enable ptb 2.4.30
 * 02-11-4  set_enable from proc only enables, not clears queue ptb 2.4.30
 * 11-11-4  take blk_put_request out of end_request (it locks!) ptb 2.4.30
 * 11-11-4  replace list_del by list_del_init ptb 2.4.30
 * 02-12-7  enbd_release made aware of daemons on whole disk ptb 2.4.30
 * 03-01-7  added ioctls for setfaulty etc. ptb 2.4.31
 * 03-02-1  used metalock for non-queue changes ptb 2.4.31
 * 03-03-12 add md_list notification ioctls ptb 2.4.31
 */

#include <linux/major.h>
/*
 * PTB fallbacks for when the header included just before this did not
 * supply the unix98 pty major counts. The count defaults to 8 and
 * UNIX98_NR_MAJORS is made an alias for it.
 *
 * (The alias used to be written "UNIX98_NR_MAJORS=...", which defines
 * the macro with a replacement text that starts with a stray "=".)
 */
#ifndef UNIX98_PTY_MAJOR_COUNT
  #define UNIX98_PTY_MAJOR_COUNT 8
  #ifndef UNIX98_NR_MAJORS
    #define UNIX98_NR_MAJORS UNIX98_PTY_MAJOR_COUNT
  #endif
#endif

#include <linux/module.h>

/*
 * PTB define _LOOSE_KERNEL_NAMES for gcc version 2 and above, before
 * the remaining kernel headers are included.
 * NOTE(review): presumably this relaxes which type names the kernel
 * headers export - confirm against the headers in use.
 */
#if defined(__GNUC__) && __GNUC__ >= 2
#define _LOOSE_KERNEL_NAMES
#endif

#include <linux/version.h>

#include <linux/fs.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <asm/segment.h>

#include <asm/uaccess.h>	/* PTB - when did this arrive in kernel? */
#include <asm/byteorder.h>
#include <linux/wrapper.h>

/* PTB the default major number is NBD_MAJOR */
#define MAJOR_NR NBD_MAJOR
/* PTB the major in use; starts at the default and is a module parameter */
static int major = MAJOR_NR;

#include <linux/proc_fs.h>
#include <linux/genhd.h>
#include <linux/hdreg.h>

#include <linux/file.h>		/* PTB - when did this arrive in kernel? */

#include <linux/smp_lock.h>

#include <linux/devfs_fs_kernel.h>

#include <linux/sysctl.h>
#include <linux/init.h>
#include <linux/kdev_t.h>
#include <linux/buffer_head.h>
#include <linux/completion.h>

/*                                                       *
 * PTB --------------- compatibility ------------------- *
 *                   layer starts here.                  *
 */

  /*
   * PTB BH_Protected disappeared somewhere around 2.4.10 but this is
   * still needed for the very rare write local/read remote mode. Don't
   * worry about it in normal operation!
   *
   * The fallback below marks the buffer dirty, marks it uptodate and
   * then refiles it.
   *
   * NOTE(review): the expansion is a bare { } block, so writing
   * "if (x) mark_buffer_protected(bh); else ..." will not compile.
   * Callers are not visible from here, so the form is left alone.
   */
#ifndef mark_buffer_protected
  #define mark_buffer_protected(rbh) \
      { \
  	mark_buffer_dirty (rbh); \
  	mark_buffer_uptodate (rbh, 1); \
  	refile_buffer (rbh); \
       }
#endif

  /*
   * PTB list interface extensions.
   *
   * list_head() gives the first entry on a list and list_tail() the
   * last one. Both give NULL when the list is empty.
   *
   *   ptr    - the struct list_head that anchors the list
   *   type   - type of the structure the links are embedded in
   *   member - name of the struct list_head field within that type
   *
   * ptr is parenthesised before it is cast, so that a compound
   * expression is converted as a whole and not just its first operand.
   *
   * NB the #ifndef guards only look for a macro of the same name. The
   * struct tag "list_head" does not count.
   */
#ifndef list_head
  #define list_head(ptr, type, member) \
  (list_empty(ptr)?NULL:list_entry(((struct list_head *)(ptr))->next,type,member))
#endif
#ifndef list_tail
  #define list_tail(ptr, type, member) \
  (list_empty(ptr)?NULL:list_entry(((struct list_head *)(ptr))->prev,type,member))
#endif

  /* PTB for arches without the atomic mask ops (and no smp, I think!)
   * - feel free to correct with assembler
   *
   * These fallbacks are plain read-modify-write operations on the
   * counter field and are NOT atomic.
   *
   *   mask - bits to set (or clear)
   *   x    - pointer to the object holding the counter field
   *
   * The whole expansion is parenthesised so that the macros behave as
   * a single operand when used inside a larger expression.
   */
#ifndef atomic_set_mask
  #define atomic_set_mask(mask, x) ((x)->counter |= (mask))
#endif
#ifndef atomic_clear_mask
  #define atomic_clear_mask(mask, x) ((x)->counter &= ~(mask))
#endif

/*                                                       *
 * PTB --------------- compatibility ------------------- *
 *                   layer ENDS here.                    *
 */

/* PTB the LINUX_VERSION_CODE we were compiled with, kept in a global */
int linux_version_code = LINUX_VERSION_CODE;

#include <linux/bio.h>
#include <linux/enbd.h>
#include <linux/enbd_ioctl.h>

/*
 * PTB kernel data - 4KB worth
 * We need space for nda, nda1, .. nda15, ndb, ndb1, ..
 * The index is exactly the minor number.
 * Each table has MAX_NBD * ENBD_MAXCONN entries.
 */
  static int enbd_blksizes[MAX_NBD * ENBD_MAXCONN];
  static int enbd_sizes[MAX_NBD * ENBD_MAXCONN];
  static __u64 enbd_bytesizes[MAX_NBD * ENBD_MAXCONN];
  static int enbd_max_sectors[MAX_NBD * ENBD_MAXCONN];

/*
 * PTB our data   - about 3KB
 * These are nda, ndb, ndc, ...
 * Divide the minor by ENBD_MAXCONN to get this index.
 * There is one enbd_device per whole device, MAX_NBD in all.
 */
  static struct enbd_device enbd_dev[MAX_NBD];
  /* PTB NOTE(review): what enbd_lock protects is not visible here */
  static spinlock_t enbd_lock = SPIN_LOCK_UNLOCKED;
  static struct enbd_md enbd_md;
  static struct enbd_ioctl_stub enbd_remote_ioctl;

  struct enbd_device * enbd_get(int i) {
      return &enbd_dev[i];
  }

  /*
   * PTB error exits for use inside functions.
   *
   * ENBD_FAIL logs the message s followed by the value of the local
   * variable "result" at debug level 1, then jumps to the label
   * error_out. The using function has to provide both.
   *
   * ENBD_HARDFAIL logs s and "result" as an error, records result in
   * lo->harderror and jumps to hard_error_out. The using function has
   * to provide "result", "lo" and the label.
   *
   * NOTE(review): both expand to a bare { } block, so they do not sit
   * well between an if and its else when followed by a semicolon.
   * Users are not visible from here so the form is left alone.
   */
  #define ENBD_FAIL( s ) { \
    ENBD_DEBUG(1, s " (result %d).\n" , result ); \
    goto error_out; \
  }
  #define ENBD_HARDFAIL( s ) { \
    ENBD_ERROR( s " (result %d).\n" , result ); \
    lo->harderror = result; \
    goto hard_error_out; \
  }

/*
 * PTB device parameters. These are module parameters too.
 * (All but plug, which does not appear in the MODULE_PARM list of
 * this file.)
 */

  static int rahead     = ENBD_RAHEAD_DFLT;/* PTB - read ahead blocks  */
  static int sync_intvl = ENBD_SYNC_INTVL; /* PTB - sync every n secs/Kreqs */
  static int merge_requests               /* PTB - bool, do request coalesce */
                        = ENBD_MERGE_REQ_DFLT;
  static int buf_sectors = ENBD_MAX_SECTORS;
                                          /* PTB - user bufsize required */
  static int show_errs = 1;	          /* PTB - RAID mode? not usually */
  static int direct     = 0;              /* PTB - all opens are O_DIRECT  */
  static int plug = ENBD_PLUG_DFLT;       /* PTB - plugging (see history notes) */

  static int md5sum = 0;		  /* PTB - use md5summing write proto */
  static int md5_on_threshold = 1000;	  /* PTB - reqs reqd to turn md5 on */
  static int md5_off_threshold = 10;	  /* PTB - errs reqd to turn md5 off */

  /* PTB only present when NO_BUFFERED_WRITES is not yet defined here */
#ifndef NO_BUFFERED_WRITES
  static int buffer_writes = 0;	          /* PTB - act like ramd on write */
#endif		/* NO_BUFFERED_WRITES */

/*
 * PTB when built as a module, declare the parameters (and the major
 * number) as integer ("i") module parameters.
 */
#if defined(MODULE)
  MODULE_PARM (rahead, "i");
  MODULE_PARM (sync_intvl, "i");
  MODULE_PARM (merge_requests, "i");
  MODULE_PARM (buf_sectors, "i");
  MODULE_PARM (show_errs, "i");
  MODULE_PARM (direct,"i");
  /* PTB buffer_writes exists only without NO_BUFFERED_WRITES */
  #ifndef NO_BUFFERED_WRITES
    MODULE_PARM (buffer_writes, "i");
  #endif		/* NO_BUFFERED_WRITES */
  MODULE_PARM (major, "i");
  MODULE_PARM (md5sum, "i");
  MODULE_PARM (md5_on_threshold, "i");
  MODULE_PARM (md5_off_threshold, "i");
#endif

  // PTB This pointer is initialised in enbd_init.
  static struct request_queue * enbd_queue;

/*
 * PTB NOTE(review): this definition comes after the
 * "#ifndef NO_BUFFERED_WRITES" tests earlier in this file, so it does
 * not influence them. It only affects tests that follow this line.
 */
#define NO_BUFFERED_WRITES 1

/*                                                       *
 * PTB --------------- functions ----------------------- *
 */

/*
 * PTB
 *       Work out the conceptual type of a request from its flags.
 *
 *       A request with REQ_SPECIAL set is a SPECIAL, whatever else is
 *       set. Otherwise the REQ_RW and REQ_NBD bits decide:
 *
 *           REQ_NBD clear, REQ_RW clear  ->  READ
 *           REQ_NBD clear, REQ_RW set    ->  WRITE
 *           REQ_NBD set,   REQ_RW clear  ->  IOCTL
 *           REQ_NBD set,   REQ_RW set    ->  MD5SUM
 *
 *       @req the request to get the type of.
 */

static int
rq_type (struct request *req)
{
        const int rw_set  = (req->flags & REQ_RW)  ? 1 : 0;
        const int nbd_set = (req->flags & REQ_NBD) ? 1 : 0;

        // PTB specials take precedence over the other two bits
        if (req->flags & REQ_SPECIAL)
                return SPECIAL;

        if (nbd_set)
                return rw_set ? MD5SUM : IOCTL;

        return rw_set ? WRITE : READ;
}

/*
 * PTB stamp a conceptual type onto a request by way of its flag bits.
 *
 * Each known type has a set of bits that get cleared and a set that
 * get set among REQ_RW, REQ_NBD and REQ_SPECIAL:
 *
 *     READ     clears all three
 *     WRITE    sets REQ_RW, clears REQ_NBD and REQ_SPECIAL
 *     IOCTL    sets REQ_NBD, clears REQ_RW and REQ_SPECIAL
 *     MD5SUM   sets REQ_RW and REQ_NBD, clears REQ_SPECIAL
 *     SPECIAL  sets all three
 *
 * Any other type leaves the request untouched.
 *
 *  @req the request to set the type on
 *  @type one of READ, WRITE, etc.
 */
static void
set_rq_type (struct request *req, int type)
{
        unsigned long clear_bits;
        unsigned long set_bits;

        switch (type) {
            case READ:
                clear_bits = REQ_RW | REQ_NBD | REQ_SPECIAL;
                set_bits = 0;
                break;
            case WRITE:
                clear_bits = REQ_NBD | REQ_SPECIAL;
                set_bits = REQ_RW;
                break;
            case IOCTL:
                clear_bits = REQ_RW | REQ_SPECIAL;
                set_bits = REQ_NBD;
                break;
            case MD5SUM:
                clear_bits = REQ_SPECIAL;
                set_bits = REQ_RW | REQ_NBD;
                break;
            case SPECIAL:
                clear_bits = 0;
                set_bits = REQ_RW | REQ_NBD | REQ_SPECIAL;
                break;
            default:
                // PTB unknown type. Do nothing.
                return;
        }

        req->flags = (req->flags & ~clear_bits) | set_bits;
}

/*
 * PTB count number of blocks in a request. This will be an overestimate
 * if the number is not an exact multiple. It seems to happen. We 
 * guarrantee to return -ve only if the request is invalid.
 *
 * @req - request we want to count
 */
inline long
nr_blks (struct request *req)
{
	unsigned log_sectors_per_blk;
	unsigned sectors_per_blk;
	int size;
	int sectors;
	struct enbd_device *lo;

	if (!req)
		return -EINVAL;

        if (rq_type(req) == REQ_SPECIAL) // PTB contains no data
                return 0;

        lo = req->rq_disk->private_data;

	log_sectors_per_blk = lo->logblksize - 9;
	sectors_per_blk = 1 << log_sectors_per_blk;

	sectors = req->nr_sectors;
	size = (sectors + sectors_per_blk - 1) >> log_sectors_per_blk;

	return size;
}

/*
 * Translate a device number into its one or two letter name and hand
 * back a pointer to a static, zero terminated buffer holding it.
 *
 * 0 .. 25 give "a" .. "z". From 26 on there are two letters, the first
 * for i / 26 and the second for i % 26, so that 26 is "ba" and
 * 26*26 - 1 is "zz". Numbers outside 0 .. 26*26 - 1 are not catered for.
 *
 * The buffer is shared between calls and is only rewritten when the
 * number differs from that of the previous call.
 *
 *   @i number to be translated to x[y] alphabetical  form.
 */
static char *
device_letter (int i)
{
	static char letters[3];
	static int last_i = -1;
	char *p = letters;

	if (i != last_i) {
		last_i = i;

		if (i >= 26) {
			*p++ = 'a' + i / 26;
			*p++ = 'a' + i % 26;
		} else {
			*p++ = 'a' + i;
		}
		*p = 0;
	}

	return letters;
}

/*
 * PTB the sequence number of a request is kept in the high bits of
 * its flags field, from bit number __REQ_NBDSEQNO upwards.
 *
 * This function extracts it by shifting the flags down.
 *
 *   @req  the request to get the sequence number of
 */
static int
rq_seqno (struct request *req)
{
	int seqno = req->flags >> __REQ_NBDSEQNO;

	return seqno;
}
/*
 * PTB write a sequence number into the high bits of a request's flags.
 *
 * The bits selected by the mask REQ_NBDSEQNO - 1 are kept as they are
 * and the rest are replaced by val shifted up by __REQ_NBDSEQNO.
 *
 *   @req  the request to mark
 *   @val  the sequence number to give it
 */
static void
rq_set_seqno (struct request *req, int val)
{
	req->flags = (req->flags & (REQ_NBDSEQNO - 1))
	           | (val << __REQ_NBDSEQNO);
}

/*
 *  PTB sync the device. Modes:
 *  @arg = 0:  Do it async - currently this does nothing at all
 *  @arg != 0: Do it sync - fsync the block device and then invalidate
 *             the buffers of the whole device
 *
 *  Nothing is done if the device has not been INITIALISED or has no
 *  inode recorded yet.
 *
 *  We can't call sync_dev outside a process context. I don't know why.
 *  Death results from a scheduled attempt.
 *
 *  No lock is taken in here.
 *
 *  @lo the device to sync
 */
static void
enbd_sync (struct enbd_device *lo, long arg)
{
	struct inode *inode = lo->inode;
	/* PTB not called "minor", so as not to hide minor() itself */
	short dev_minor;
	short nbd;

	if (!(atomic_read (&lo->flags) & ENBD_INITIALISED) || !inode)
		return;

	dev_minor = minor (inode->i_rdev);
	nbd = dev_minor >> ENBD_SHIFT;

	// PTB sync_dev is async. fsync_dev is sync.
	switch (arg) {
	  case 0: // async
	        // PTB 2.5.7 does not have async sync!	 FIXME
		break;
	  default: // sync
		fsync_bdev (inode->i_bdev);
		// PTB the whole device is at partition 0 of the unit
		invalidate_buffers (mk_kdev (major, nbd << ENBD_SHIFT));
		break;
	}
}

/*
 * PTB ask for the device to be synced in async mode (enbd_sync arg 0)
 *
 *    @lo the device to sync
 */
static void
enbd_async_sync (struct enbd_device *lo)
{
	const long async_mode = 0;

	enbd_sync (lo, async_mode);
}
/*
 * PTB ask for the device to be synced in sync mode (enbd_sync arg 1)
 *
 *    @lo the device to sync
 */
static void
enbd_sync_sync (struct enbd_device *lo)
{
	const long sync_mode = 1;

	enbd_sync (lo, sync_mode);
}

/*
 *  Pick the sync mode according to the state of the device.
 *
 *  A device that is ENABLED and not marked REMOTE_INVALID gets the
 *  async sync. In every other case it gets the sync sync.
 *
 *    @lo the device to maybe sync (sync or async sync!)
 */
static void
enbd_maybe_sync_sync (struct enbd_device *lo)
{
	if (!(atomic_read (&lo->flags) & ENBD_ENABLED)
	    || (atomic_read (&lo->flags) & ENBD_REMOTE_INVALID)) {
		/* PTB disabled or invalid. Do it the hard way */
		enbd_sync_sync (lo);
		return;
	}

	enbd_async_sync (lo);
}




/*
 * PTB - put a request onto the head of a nbd device's queue
 *     - presumably having taken it off the kernel's queue first!
 *     - We take the device's queue lock (write side) for the insertion
 *       and then wake up whoever is sleeping on the device wait queue.
 *
 *     A request that nr_blks() rejects (negative count) is reported
 *     and left completely alone.
 *
 *     Requests that are not REQ_SPECIAL are also entered in the
 *     device accounts before they are queued.
 *
 *     @lo      = the device we are on (could we get it from the req?)
 *     @req     = the request we shift
 */
static void
enbd_enqueue (struct enbd_device *lo, struct request *req)
{
	/* PTB must be signed or the error return is never noticed */
	long req_blks = nr_blks (req);

	if (req_blks < 0) {
		short islot = atomic_read (&lo->islot);
		ENBD_ERROR ("(%d): invalid req %p. Not touching!\n", islot, req);
		return;
	}

	/* PTB accounting and nothing more - first, specials */
	if (! (req->flags & REQ_SPECIAL)) {
		// PTB the special req counting semantics relies on
		// countq not including itself in the count!
		int countq;
		int cmd;
		struct enbd_acct *acct = &lo->acct;

		cmd = rq_data_dir (req);
		atomic_add (req_blks, &acct->requests_in[cmd]);

		// PTB do we need locks here? Apparently not.
		atomic_inc (&acct->countq[cmd]);
		countq = atomic_read (&acct->countq[cmd]);

		// PTB the maxes are just noncritical stats
		if (atomic_read (&acct->maxq[cmd]) < countq)
			atomic_set (&acct->maxq[cmd], countq);
		// PTB NOTE(review): req_blks is not checked against the
		// size of the req_in histogram here - confirm it fits
		atomic_inc (&acct->req_in[cmd][req_blks]);
		// PTB the maxes are just noncritical stats
		if (atomic_read (&acct->maxreqblks) < req_blks)
			atomic_set (&acct->maxreqblks, req_blks);
	}

	write_lock (&lo->queue_lock);

	list_add (&req->queuelist, &lo->queue);

	write_unlock (&lo->queue_lock);

	wake_up_interruptible (&lo->wq);

}

/*
 * PTB - remove a request from anywhere in the nbd device general queue 
 *     - return 0 for success, -ve for fail
 *
 *     We need to hold the queue lock when calling this routine.
 *     It walks the queue.
 *
 *     @lo the nbd device 
 *     @req the request to be removed
 */
static int
enbd_remove (struct enbd_device *lo, struct request *req)
{
	int cmd;
        struct enbd_acct *acct = &lo->acct;

	if (!req)
		return -EINVAL;

	list_del_init (&req->queuelist);

        /* PTB accounting and nothing more */
	cmd = rq_data_dir (req);
        atomic_dec (&acct->countq[cmd]);
	return 0;
}

/*
 *  PTB - Open the device. This is the blkops function.
 *
 *  The minor number splits into the whole device number (the minor
 *  shifted down by ENBD_SHIFT) and the partition (the remainder).
 *
 *  Opening partition 0 records the file and inode on the device,
 *  restarts the speed measurement timestamps when the file is new,
 *  and marks the device INITIALISED. Every successful open adds one
 *  to the device reference count. A disk change check is triggered on
 *  an enabled device with active slots that is not yet VALIDATED.
 *
 *  Returns 0, or -EINVAL with no inode to go on, or -ENODEV when the
 *  whole device number is out of range.
 *
 *  @inode the device inode (may be null if the file is given)
 *  @file  the file being opened (may be null)
 */
int
enbd_open (struct inode *inode, struct file *file)
{
	struct enbd_device *lo;
	int devno;
	int unit;
	int partition;

	/* added by ptb for 2.0.35. Necessary? */
	if (!inode && file)
		inode = file->f_dentry->d_inode;

	if (!inode) {
		ENBD_ERROR ("null inode.\n");
		return -EINVAL;
	}

	devno = minor (inode->i_rdev);
	unit = devno >> ENBD_SHIFT;
	partition = devno - (unit << ENBD_SHIFT);

	if (unit >= MAX_NBD) {
		ENBD_ERROR ("too many (%d) whole devices open\n", unit);
		return -ENODEV;
	}

	lo = &enbd_dev[unit];

	/* PTB provision for opening for direct i/o - gives mount aid */
	if (file && (atomic_read (&lo->flags) & ENBD_DIRECT)) {
		if (!(file->f_flags & O_DIRECT)) {
			/* PTB we set NOFOLLOW to show we did it ! */
			file->f_flags |= O_DIRECT | O_NOFOLLOW;
		}
	}

	if (partition == 0) {
		/* PTB we have got the whole dev's file or inode for 1st time */
		const int fresh_file = !lo->file || lo->file != file;
		const int fresh_inode = !lo->inode || lo->inode != inode;

		if (fresh_file) {
			lo->file = file;
			atomic_set (&lo->wspeed.frstj, jiffies);
			atomic_set (&lo->rspeed.frstj, jiffies);
			atomic_set (&lo->tspeed.frstj, jiffies);
		}
		if (fresh_inode)
			lo->inode = inode;

		if (!(atomic_read (&lo->flags) & ENBD_INITIALISED))
			atomic_set_mask (ENBD_INITIALISED, &lo->flags);
	}

	atomic_inc (&lo->refcnt);

	if (!(atomic_read (&lo->flags) & ENBD_VALIDATED)
	    && lo->aslot > 0
	    && (atomic_read (&lo->flags) & ENBD_ENABLED)) {
		ENBD_INFO ("partition check on device nd%s\n", lo->devnam);
		check_disk_change(inode->i_bdev);

		/*
		 * PTB VALIDATED is not set here. We leave that to
		 * whatever the kernel calls as a result, via our
		 * removable device routines.
		 */
	}

	return 0;
}

/*
 * PTB - complete a transaction irrefutably by taking it out of the
 *     - slot pending position it is in, and reporting end_request to kernel
 *
 *       We are called without locks because our call to end request
 *       will take some sort of lock momentarily and we don't need
 *       locks because our request should already be off all queues.
 *
 *       A request that nr_blks() rejects (negative count) is reported
 *       and left completely alone.
 *
 *       The direction and error count of the request are noted down
 *       before the request is given back with blk_put_request(), as
 *       the request is no longer ours to look at after that.
 *
 *       After the kernel has been told, the slot and device accounts
 *       are updated. For successful writes the md5sum statistics are
 *       updated too, and md5summing is switched off or on for the
 *       device when the respective threshold is passed.
 *
 *       @slot the enbd_slot on which the req notionally was
 *       @req  the poor defenceless kernel request about to be acked
 */
void
enbd_commit (struct enbd_slot *slot, struct request *req)
{

	struct enbd_device *lo = slot->lo;
	/* PTB must be signed or the error return is never noticed */
	long req_blks = nr_blks (req);
	int cmd;
	int errors;
	struct enbd_acct * acct = &lo->acct;

	if (req_blks < 0) {
		ENBD_ERROR ("corrupted req %p. Not touching with bargepole.\n",
                        req);
		return;
	}

	list_del_init (&req->queuelist);

	enbd_end_request_lock (req);

	/* PTB take what we still want from the req before it goes back */
	cmd = rq_data_dir (req);
	errors = req->errors;

	blk_put_request (req);

	slot->req_age = 0;
	slot->req -= req_blks;

	/* PTB accounting and nothing more */
	atomic_sub (req_blks, &acct->requests_req[cmd]);
	if (errors != 0) {
		/* PTB error exit */
		atomic_add (req_blks, &acct->requests_err);
		slot->err += req_blks;
		return;
	}

	atomic_add (req_blks, &acct->requests_out[cmd]);
	slot->out += req_blks;

	if (cmd != WRITE)
		/* PTB everything but a write was easy */
		return;

	/*
	 * PTB now non error case writes
	 *
	 *     account the 4 cases for a md5sum'd transaction
	 */

	switch (slot->flags & (ENBD_SLOT_MD5SUM | ENBD_SLOT_MD5_OK)) {

	  case ENBD_SLOT_MD5SUM | ENBD_SLOT_MD5_OK:
		atomic_add (req_blks, &lo->wrequests_5to);	// 11
		atomic_add (req_blks, &lo->wrequests_5so);
		// PTB zero the countdown to turning off md5 as it works
		atomic_set (&lo->wrequests_5co, 0);
		break;

	  case ENBD_SLOT_MD5SUM:
		atomic_add (req_blks, &lo->wrequests_5to);	// 10
		atomic_add (req_blks, &lo->wrequests_5wo);
		atomic_inc (&lo->wrequests_5co);
		if (atomic_read (&lo->wrequests_5co) > md5_off_threshold) {
			atomic_set (&lo->wrequests_5co, 0);
			// PTB turn off md5summing as it is not successful
			atomic_clear_mask (ENBD_MD5SUM, &lo->flags);
		}
		break;

	  case ENBD_SLOT_MD5_OK:
		atomic_add (req_blks, &lo->wrequests_5to);	// 01
		atomic_add (req_blks, &lo->wrequests_5eo);
		atomic_inc (&lo->wrequests_5co);
		if (atomic_read (&lo->wrequests_5co) > md5_off_threshold) {
			atomic_set (&lo->wrequests_5co, 0);
			// PTB turn off md5summing as it is errored
			atomic_clear_mask (ENBD_MD5SUM, &lo->flags);
		}
		break;

	  default:
	  case 0:
		// PTB nobody asked for a md5 and nobody gave one back
		atomic_inc (&lo->wrequests_5no);
		if (atomic_read (&lo->wrequests_5no) > md5_on_threshold) {
			atomic_set (&lo->wrequests_5no, 0);
			// PTB turn on md5summing every so often
			atomic_set_mask (ENBD_MD5SUM, &lo->flags);
		}
		break;
	}

	// PTB clear the md5sum indicators from the slot afterwards!
	slot->flags &= ~(ENBD_SLOT_MD5SUM | ENBD_SLOT_MD5_OK);

	// PTB we ran out of difficult cases, so return
}

/*
 * PTB - error out a transaction irrefutably by taking it out of the
 *     - slot pending position it is in, and reporting end_request to kernel
 *
 *     We must be called without spinlocks held, as we take it in end req
 *
 *     A request that nr_blks() rejects (negative count) is reported
 *     and left completely alone.
 *
 *     The error count of the request is incremented before the kernel
 *     is told it is finished. The direction of the request is noted
 *     down before the request is given back with blk_put_request(),
 *     as the request is no longer ours to look at after that.
 *
 *       @slot the enbd_slot on which the req notionally was
 *       @req  the poor defenceless kernel request about to be errored
 */
void
enbd_error (struct enbd_slot *slot, struct request *req)
{
	struct enbd_device *lo = slot->lo;
	/* PTB must be signed or the error return is never noticed */
	long req_blks = nr_blks (req);
	int cmd;
	struct enbd_acct * acct = &lo->acct;

	if (req_blks < 0) {
		ENBD_ERROR ("passed illegal request %p\n", req);
		/* PTB there is nothing sane we can do with it */
		return;
	}

	req->errors++;

	/*
	 * PTB We don't need the queue spinlock since we don't touch our queue,
	 * and we're the only ones working on this slot.
	 */
	list_del_init (&req->queuelist);

	ENBD_ALERT ("error out req %p from slot %d!\n", req, slot->i);

	enbd_end_request_lock (req);

	/* PTB take what we still want from the req before it goes back */
	cmd = rq_data_dir (req);

	blk_put_request (req);

	/* PTB accounting and nothing more */
	atomic_sub (req_blks, &acct->requests_req[cmd]);

	slot->in -= req_blks;
	slot->req -= req_blks;

	slot->req_age = 0;
	slot->err += req_blks;
	atomic_add (req_blks, &acct->requests_err);
}

/*
 * Take a request out of a slot. This must not hold the queuelock on
 * entry as we take the queue lock in order to play with the devices
 * queue.
 *
 * When the device has ENBD_SHOW_ERRS set the request is errored out
 * with enbd_error() instead. Otherwise it is unlinked from the slot,
 * the slot accounts are wound back (not for specials), and it is put
 * back on the device queue at a position chosen by sequence number.
 *
 * A request that nr_blks() rejects (negative count) is reported and
 * left completely alone.
 *
 *  @slot the nbd slot on which to work
 *  @req the request
 */
static void
enbd_rollback (struct enbd_slot *slot, struct request *req)
{

	struct enbd_device *lo = slot->lo;
	/* PTB must be signed or the error return is never noticed */
	long req_blks;
	unsigned long flags;
	int seqno;
	struct list_head *pos;
	struct request *xreq;

	if (atomic_read (&lo->flags) & ENBD_SHOW_ERRS) {
		enbd_error (slot, req);
		return;
	}

	req_blks = nr_blks (req);

	if (req_blks < 0) {
		ENBD_ERROR ("passed illegal request %p\n", req);
		return;
	}

	list_del_init (&req->queuelist);

	ENBD_ALERT ("rollback req %p from slot %d!\n", req, slot->i);

	if (! (req->flags & REQ_SPECIAL)) {
		/* PTB accounting */
		slot->in -= req_blks;
		slot->req -= req_blks;
	}

	seqno = rq_seqno(req);

	write_lock_irqsave(&lo->queue_lock, flags);
	/*
	 * PTB scan from the tail of the device queue towards the head
	 * for the first request with a bigger seqno than ours, and go
	 * in immediately before it. If there is none, pos ends up at
	 * the list anchor and we go on at the tail.
	 * NOTE(review): confirm that this placement matches the order
	 * in which requests are taken off the queue.
	 */
	list_for_each_prev (pos, &lo->queue) {
		xreq = list_entry (pos, struct request, queuelist);
		if (rq_seqno(xreq) > seqno) {
			break;
		}
	}
	list_add_tail (&req->queuelist, pos);
	write_unlock_irqrestore(&lo->queue_lock, flags);

}

/*
 * PTB - undo transactions by taking them out of the slot pending
 *     - position and replacing them on the generic device queue
 *     - NB we do not hold the io request lock or queue sem when
 *     -    calling this as we take it internall in enbd_rollback
 *
 *     @slot the nbd slot to scan
 */
static void
enbd_rollback_all (struct enbd_slot *slot)
{

	struct request *req;
	short count = 0;

	while (!list_empty (&slot->queue)) {

		if (count++ > 1000)
			break;

		req = list_head (&slot->queue, struct request, queuelist);

		if (!req)
			break;

		enbd_rollback (slot, req);
	}

}

/*
 * PTB error out all the requests on a slot
 *     
 *     We must be called without the io spinlock held, as we take it in
 *     enbd_error().
 *
 *     @slot the nbd slot to scan
 */
static void
enbd_error_all (struct enbd_slot *slot)
{

	struct request *req;
	short count = 0;

	while (!list_empty (&slot->queue)) {
		if (count++ > 1000)
			break;
		req = list_head (&slot->queue, struct request, queuelist);
		if (!req)
			break;
		enbd_error (slot, req);
	}
}

/*
 * PTB - let a request onto the slot pending position
 *     - Can be called without the spinlock and doesn't take the
 *       spinlock, as we only deal with our unique slot. If there
 *       were more than one client per slot this would be a problem,
 *       but there aren't so it isn't.
 *     - A request whose block count comes back negative is refused
 *       and nothing is touched.
 *
 *       @slot the nbd slot to let the request onto
 *       @req the request to move onto the slot queue
 */
void
enbd_accept (struct enbd_slot *slot, struct request *req)
{

	struct enbd_device *lo = slot->lo;
	struct enbd_acct * acct = &lo->acct;
	/*
	 * PTB signed on purpose. Held in an unsigned long the count could
	 * never test as negative and the guard below was dead code.
	 * Presumably nr_blks() reports a bad request as a negative
	 * value - confirm against its definition.
	 */
	long req_blks = nr_blks (req);
	int cmd;

	if (req_blks < 0)
		return;

	/* PTB accounting and nothing more */
	cmd = rq_data_dir (req);
	atomic_add (req_blks, &acct->requests_req[cmd]);

	/* PTB - Note that this really is slot and not lo. */
	list_add (&req->queuelist, &slot->queue);

	slot->req_age = jiffies;
	slot->in += req_blks;
	slot->req += req_blks;
}

/*
 * PTB - read from userspace to a request buffer. Do it piecewise
 *     - to cope with clustered requests.
 *     - return number of bytes read
 *
 *     We walk every segment of every bio on the request and fill each
 *     from consecutive bytes of the user buffer. The result is less
 *     than len either when the request has fewer bytes of buffer than
 *     were asked for, or when a copy_from_user faults part way, in
 *     which case we stop at once and report what did arrive.
 *
 *     @req  the request whose buffers are filled
 *     @user the userspace source address
 *     @len  the number of bytes the caller expects to be transferred
 */
static int
copy_from_user_to_req (struct request *req, char *user, int len)
{

	unsigned size = 0;
	struct bio *bio;

	/* PTB assume user verified */

	rq_for_each_bio(bio, req) {

		int i;
		struct bio_vec * bvl;

		bio_for_each_segment(bvl, bio, i) {

			struct page *page = bvl->bv_page;
			int offset = bvl->bv_offset;
			const unsigned current_size = bvl->bv_len;
			unsigned long missed;
			char *buffer;

			// NOTE(review): page_address assumes the page has a
			// kernel mapping - confirm no highmem pages get here
			buffer = page_address(page) + offset;

			// PTB copy_from_user returns the bytes it could NOT copy
			missed = copy_from_user (buffer, user + size,
						 current_size);
			size += current_size - missed;
			if (missed)
				goto out;
		}
	}

      out:
	if (size != len) {
		ENBD_ALERT ("requested %d and only read %d bytes to req %p\n",
		  len, size, req);
		ENBD_ALERT ("request %p wanted to read user space buffer %p\n",
		  req, user);
	}
	return size;
}

/*
 * PTB - andres' kernel half of the user-space network handshake, used
 *     - to complete a transaction.
 *     - return 0 for success and -ve for fail.
 *
 *     @slot the nbd slot being acted on
 *
 */
int
enbd_ack (struct enbd_slot *slot)
{
	struct enbd_reply reply;
	struct request *req, *xreq;
	int result = 0;

	void *user;
	unsigned long req_blks = 1;
	struct enbd_device *lo = slot->lo;
        struct enbd_acct * acct = &lo->acct;
	unsigned buflen = 0;
	unsigned reqlen;
        int cmd;
	struct list_head *pos;
	int count = 0;

	if (!(slot->flags & ENBD_SLOT_BUFFERED)) {
		return -EINVAL;
	}

	atomic_inc (&acct->cthreads);
	slot->flags |= ENBD_SLOT_RUNNING;
	slot->cli_age = jiffies;

	user = slot->buffer;
	copy_from_user ((char *) &reply, (char *) user,
			sizeof (struct enbd_reply));

	// PTB we keep tracking the write position in the input buffer
	buflen += ENBD_BUFFER_DATA_OFFSET;

	// PTB save the reply handle (which is an address) as our req
	memcpy (&req, &reply.handle, sizeof (req));

	xreq = NULL;
	list_for_each (pos, &slot->queue) {
		xreq = list_entry (pos, struct request, queuelist);
		if (count++ > 1000)
			break;
		if (xreq == req)
			/* PTB found it */
			break;
	}

	if (xreq != req) {

		if (slot->nerrs++ < 3)
			ENBD_ALERT ("fatal: Bad handle %p != %p!\n",
			  req, xreq);

                atomic_dec (&acct->cthreads);
                slot->flags &= ~ENBD_SLOT_RUNNING;

                ENBD_ALERT("ignoring ack of req %p which slot does not have\n", 
                    req);

                /*
                 * PTB we lie and say success because userspace got through to
                 * us OK and the req they missed has been rolled back and will
                 * be retransmitted by the kernel later and elsewhere
                 */
                return 0;
	}

	if (reply.magic != ENBD_REPLY_MAGIC) {

		if (slot->nerrs++ < 3)
			ENBD_ALERT ("Not enough reply magic in %s\n",
				   __FUNCTION__ );
                /*
                 * PTB returning -EAGAIN causes the client to pause 0.5s 
                 * and throw its reply away, then return to service. We leave
                 * any request we have to age and be rolled back.
                 */
	        return -EAGAIN;
	}

	if (reply.error > 0 || req->errors > 0) {
		/* PTB wasn't error++'ed before */
		req->errors++;
		if (slot->nerrs++ < 3)
			ENBD_ALERT ("exited with reply error\n");
	        /* PTB we handle this - it's a repmote error */
                ENBD_FAIL ("remote error on request\n");
	}

	req_blks = nr_blks (req);

	reqlen = req->nr_sectors;
	reqlen <<= 9;

	cmd = rq_type (req);

        switch (cmd) {

                unsigned long rcmd;
                char * arg;
		int size;

	  case READ:

		// PTB We have to copy the buffer bit by bit in
		// case the request is clustered.

		size =
		 copy_from_user_to_req (req, ((char *) user) + buflen, reqlen);
		if (size < reqlen) {
			ENBD_ALERT
			 ("(%d): copy %dB from user to req %p failed (%d)\n",
			  slot->i, reqlen, req, size);
			// PTB we could try again? We should investigate.
			ENBD_FAIL
			 ("exited because of bad copy from user\n");
                        // PTB FIXME - think we want to discard and retry
		}

		// PTB we keep tracking the write position in the buffer
		buflen += size;
		break;

	  case WRITE:
		/*
                 * PTB we want to know if the reply is md5summed, and if it is
		 *     whether the md5sum is the same as the one on the
		 *     request. But that's not something we can presently see
		 *     from here as we don't make an md5sum in the kernel.
		 *     So we have to rely on the reply flag from userspace.
		 *     We transmit the information to the slot, as we can't
		 *     keep it on the request.
                 */

		switch (reply.flags &
			(ENBD_REPLY_MD5SUM | ENBD_REPLY_MD5_OK)) {

		  case ENBD_REPLY_MD5SUM | ENBD_REPLY_MD5_OK:
			/*
                         * PTB we asked for an md5sum comparison and
			 * the two matched, so we skipped writing the request
                         */
			slot->flags |= (ENBD_SLOT_MD5SUM | ENBD_SLOT_MD5_OK); //11
			break;
		  case ENBD_REPLY_MD5SUM:
			// PTB the two differed, so we wrote the request
			slot->flags |= ENBD_SLOT_MD5SUM;
			slot->flags &= ~ENBD_SLOT_MD5_OK;	// 10
			break;
		  case ENBD_REPLY_MD5_OK:
			// PTB the server refused the md5 request
			slot->flags &= ~ENBD_SLOT_MD5SUM;
			slot->flags |= ENBD_SLOT_MD5_OK;	        // 01
			break;
		  default:
		  case 0:
			// PTB mobody asked for an md5sum comparison
			slot->flags &= ~(ENBD_SLOT_MD5SUM | ENBD_SLOT_MD5_OK);//00
			break;
		}
		// PTB now we are all set up to do the accounting in commit etc.
		break;

          case SPECIAL:
                // PTB FIXME. Just temporary. 
		ENBD_ALERT ("special req %p on slot %d\n", req, slot->i);
                req->errors = 0;
                goto success;
		break;


	  case IOCTL:

		if (!(reply.flags & ENBD_REPLY_IOCTL))
			ENBD_ALERT ("ioctl reply to req %p has no ioctl flag\n",
                                req);

		// PTB the commit should emit the request notification

		rcmd = (long) req->special;
		arg = req->buffer;

		if (cmd == -1l) {
			result = -EINVAL;
			ENBD_FAIL ("unauthorized remote ioctl\n");
		}

		if (!(_IOC_DIR (cmd) & _IOC_READ)) {
                        break;
                }

		/*
                 * PTB We saved ioctl size in req .. but only approximately,
                 * as nr_sectors.
                 */

		/*
                 * PTB if we are reading, it should be to the local
                 * buffer arg, which points at lo->ctldata or other buffer
                 */

                // PTB we are treating a saved local address or direct val
		if (req->nr_sectors > 0) {
			/*
                         * PTB sectors is an overestimate.  Should be
                         * OK as we are reading from the client
                         * buffer which has plenty of room to spare.
                         */
			int size = req->nr_sectors << 9;
			copy_from_user (arg, (char *) user + buflen, size);
			buflen += size;
                        break;
		}

		break;
	}			// PTB eswitch
        goto success;

      success:
	slot->nerrs = 0;
	/*
         * PTB - completion (or erroring) of transaction.
	 * note that enbd_commit will take a lock to do end_req
         */
	enbd_commit (slot, req);
	atomic_dec (&acct->cthreads);
	slot->flags &= ~ENBD_SLOT_RUNNING;
	return 0;

      error_out:
	/* PTB we will next do a client rollback on the slot from userspace.
	 *     Right here we just skip the request. 
         *     But .. don't error the request. We might have rolled it
         * back and be referencing it.
         */
        if (result != -EAGAIN && result != 0) {
	        req->errors += req_blks;
	        slot->err += req_blks;
        }
	result = result < 0 ? result : -ENODEV;
        // PTB one client thread leaves
	atomic_dec (&acct->cthreads);
	slot->flags &= ~ENBD_SLOT_RUNNING;
	return result;
}

/*
 * PTB - write to userspace from a request buffer. Do it piecewise
 *     - to cope with clustered requests.
 *     - return number of bytes written
 *
 *     Every segment of every bio on the request is copied out to
 *     consecutive bytes of the user buffer. If a copy_to_user faults
 *     we stop at once, so the result then falls short of the request
 *     size and the caller can see that the transfer is incomplete.
 *
 *     @req  the request whose buffers are copied out
 *     @user the userspace destination address
 *     @len  the number of bytes the caller expects (not used as a
 *           bound here; the request's own segments set the length)
 */
static int
copy_to_user_from_req (struct request *req, char *user, int len)
{

	unsigned size = 0;
	struct bio *bio;

	/* PTB assume user verified */

	rq_for_each_bio(bio, req) {

		int i;
		struct bio_vec * bvl;

		bio_for_each_segment(bvl, bio, i) {

			struct page *page = bvl->bv_page;
			int offset = bvl->bv_offset;
			const unsigned current_size = bvl->bv_len;
			unsigned long missed;
			char *buffer;

			// NOTE(review): page_address assumes the page has a
			// kernel mapping - confirm no highmem pages get here
			buffer = page_address(page) + offset;

			// PTB copy_to_user returns the bytes it could NOT copy
			missed = copy_to_user (user + size, buffer,
					       current_size);
			size += current_size - missed;
			if (missed)
				return size;
		}

	}
	return size;
}

/*
 * PTB refresh the three speed records of a device (write, read and
 * total) from the current counts of requests taken in. The total is
 * fed the sum of the write and read counts.
 *
 *  @lo  the nbd device to do the update on
 */
static void
enbd_set_speed (struct enbd_device *lo)
{
	struct enbd_acct *acct = &lo->acct;
	struct enbd_speed *write_speed = &lo->wspeed;
	struct enbd_speed *read_speed = &lo->rspeed;
	struct enbd_speed *total_speed = &lo->tspeed;
	int nwrite, nread;

	nwrite = atomic_read (&acct->requests_in[WRITE]);
	write_speed->update (write_speed, nwrite);

	nread = atomic_read (&acct->requests_in[READ]);
	read_speed->update (read_speed, nread);

	total_speed->update (total_speed, nwrite + nread);
}



/*
 * PTB - andres' kernel half of the userspace networking. This part
 *     - initiates the transaction by taking a request off the generic
 *     - device queue and placing it in the slots pending position.
 *     - I believe we return 0 for success and -ve for fail.
 *     - timeo is the number of jiffies we are prepared to wait
 *
 *     @slot the nbd slot to act on.
 */
int
enbd_get_req (struct enbd_slot *slot)
{
	struct enbd_request request;
	struct request *req;
	int result = 0;
	static atomic_t count;
	unsigned start_time = jiffies;
	struct enbd_device *lo = slot->lo;
        struct enbd_acct * acct = &lo->acct;
	unsigned timeout = lo->req_timeo * HZ;
	int islot = slot->i;
	// PTB for the new timezone field in requests 
	extern struct timezone sys_tz;
        struct timeval time;
	unsigned long flags;
        struct enbd_seqno * seqno_out = &lo->seqno_out;

	atomic_inc (&acct->cthreads);	// PTB - client thread enters
	slot->flags |= ENBD_SLOT_RUNNING;
	slot->cli_age = jiffies;

	if (!(slot->flags & ENBD_SLOT_BUFFERED)) {
		ENBD_FAIL ("Our slot has no buffer");
	}

	atomic_set (&lo->islot, islot);

	if (!list_empty (&slot->queue)) {
		ENBD_FAIL ("impossible! already treating one request");
		// PTB we do a nontrivial rollback from the user daemon 
	}
	if (!slot->file) {
		result = -EBADF;
		ENBD_FAIL ("Our slot has been nofiled");
	}
	if (!(atomic_read (&lo->flags) & ENBD_ENABLED)) {
		result = -ENODEV;
		ENBD_FAIL ("Our slot has been vamooshed");
	}

	atomic_inc (&acct->cwaiters);
	slot->flags |= ENBD_SLOT_WAITING;

	// PTB take spinlock in order to examine queue
	// we need to protect ourselves against the request fn too
	read_lock_irqsave (&lo->queue_lock, flags);
	atomic_dec (&acct->cwaiters);
	slot->flags &= ~ENBD_SLOT_WAITING;

	// PTB - now spin until request arrives to treat 
	while (slot->file && list_empty (&lo->queue)) {

		static int enbd_clr_sock (struct enbd_slot *slot); // forward decl
		int siz;
                int time_left = start_time + timeout - jiffies;

		read_unlock_irqrestore (&lo->queue_lock, flags);

                // PTB one client thread goes to sleep
		atomic_inc (&acct->cwaiters);
		slot->flags |= ENBD_SLOT_WAITING;

		interruptible_sleep_on_timeout (&lo->wq, time_left);

		slot->flags &= ~ENBD_SLOT_WAITING;
                // PTB one client thread reactivates
		atomic_dec (&acct->cwaiters);
		atomic_inc (&count);

		// PTB Have to take the spinlock again to check at the queue
		atomic_inc (&acct->cwaiters);
		slot->flags |= ENBD_SLOT_WAITING;
		// we need to protect ourselves against the request fn too
		read_lock_irqsave (&lo->queue_lock, flags);
		atomic_dec (&acct->cwaiters);
		slot->flags &= ~ENBD_SLOT_WAITING;

		// PTB fail for recheck if we are inactive too long 

                time_left = start_time + timeout - jiffies;
		if (time_left > 0 || !list_empty (&lo->queue))
                        continue;

                // PTB bad. timeout with nothing on queue. Error out.
		result = -ETIME;

		// PTB we will exit with fail, so up spinlock now
		read_unlock_irqrestore (&lo->queue_lock, flags);

		siz = lo->blksize + sizeof (struct enbd_request);
		// PTB verify the buffer is still OK - holds one block 
		if (access_ok(VERIFY_WRITE,slot->buffer,siz))
                        goto error_out;

                // PTB buffer is invalid
		result = -EINVAL;

		// PTB clr_sock takes both the io lock and the spinlock
		enbd_clr_sock (slot);
		ENBD_FAIL ("Our process has died or lost its buffer");

		/*
                 * PTB we may do a rollback from the user daemon here
	         * but it'll be trivial - without effect - as we don't
		 * have a request in our slot to treat.
                 */
		goto error_out;

	} // end while loop

	// PTB we still have the (read) spinlock here

	if (!(atomic_read (&lo->flags) & ENBD_ENABLED)) {
		read_unlock_irqrestore (&lo->queue_lock, flags);
		result = -ENODEV;
		ENBD_FAIL ("Our slot vaporized while we slept!");
	}
	if (!slot->file) {
		read_unlock_irqrestore (&lo->queue_lock, flags);
		result = -EBADF;
		ENBD_FAIL ("Our slot nofiled itself while we slept!");
	}
	if (!list_empty (&slot->queue)) {
		read_unlock_irqrestore (&lo->queue_lock, flags);
		result = -EINVAL;
		ENBD_FAIL ("impossible! already treating one request");
		// PTB we do a nontrivial rollback from the user daemon 
	}

	// PTB now relinquish the read lock and try for the write lock
	read_unlock_irqrestore (&lo->queue_lock, flags);

	write_lock_irqsave (&lo->queue_lock, flags);
	// PTB got the write lock

	if (list_empty (&lo->queue)) {
		write_unlock_irqrestore (&lo->queue_lock, flags);
		// PTB - somebody else did it while we waited on spinlock. OK 
		result = -EINVAL;
		ENBD_FAIL ("ho hum beaten to the punch");
		// PTB we may do a trivial rollback from the user daemon 
	}

	// PTB cli/sti here looks unnec. hardware interrupts return here 
	// AMARIN begin uninterruptible code 

	// PTB we have the (write) spinlock

	// PTB oldest=last element in queue 
	req = list_tail (&lo->queue, struct request, queuelist);

	// PTB this is where we free the req from our queue. We need to be
	// holding our spinlock at this point

	// PTB - must succeed as have the spinlock 
	result = enbd_remove (lo, req);
	// PTB now holding irqs off in enbd_remove 

	// AMARIN end uninterruptable code 
	// PTB uh - maybe cli/sti is needed? interrupts can muck the queue?
	//        - Nah! I have left them enabled so we can see any errors.

	write_unlock_irqrestore (&lo->queue_lock, flags);

	request.magic = ENBD_REQUEST_MAGIC;
	request.flags = 0;

	switch (rq_type (req)) {

		unsigned long cmd;
		char *arg;
		size_t size;

	  case IOCTL:

		request.type = IOCTL;

		// PTB this is our special ioctl kernel request

		cmd = (unsigned long) req->special;
		arg = req->buffer;
		size = req->nr_sectors << 9;

		// PTB the arg was a literal

		request.len = 0;
		// PTB we are in get_req, transferring stored ioctl
		if ((_IOC_DIR (cmd) & _IOC_READ) && size > 0) {
			// PTB if len is +ve we copy to the user buffer later
                        request.len = size;
		} 
		// PTB we store the weirded ioctl id.
		// PTB Yes, this composition is our private invention.
		request.from = (((__u64) cmd) << 32)
		     // PTB really want this to go to a 64 bit request.special
		     | ((__u64) (unsigned long) arg);
		break;

	  case READ:
	  case WRITE:

		request.type = rq_data_dir (req);
		request.from = req->sector;
		request.from <<= 9;
		request.len = req->nr_sectors;
		request.len <<= 9;
		if (atomic_read (&lo->flags) & ENBD_MD5SUM) {
			// PTB set the please do md5sum flag on the request
			request.flags |= ENBD_REQUEST_MD5SUM;
		}
		break;

          case MD5SUM:
                break;

          case SPECIAL:
		request.type = SPECIAL;
		request.len = req->nr_sectors;
		request.len <<= 9;
		request.from = req->sector;
		request.from <<= 9;
                if (rq_data_dir (req) == WRITE)
                        request.flags |= ENBD_REQUEST_SPECIALRW;
                request.special = (typeof(request.special))req->special;
                break;

	  default:
		ENBD_ALERT ("received unknown req %p type %#x\n",
			   req, rq_type (req));
		break;
	}

	request.seqno = seqno_out->calc(seqno_out, rq_seqno (req));

	/*
         * PTB we should here erase the extra seqno info in the request
         * so that on error or on ack the kernel can use the right internal
         * array, but I'll erase it in the ack function instead
         */

	do_gettimeofday (&time);
        request.time = time.tv_sec;
        request.time *= 1000000;
        request.time += time.tv_usec;
        request.zone = sys_tz.tz_minuteswest;

	// PTB tz_dsttime = 0 always in linux

	memcpy (&request.handle, &req, sizeof (request.handle));

	copy_to_user (slot->buffer, (char *) &request, sizeof (request));

	switch (request.type) {

		int err;
                char * arg;

	  case READ:
		break;

	  case IOCTL:
		if (request.len <= 0)
                        break; // PTB presumably nothing to do
		arg = (char *) slot->buffer + ENBD_BUFFER_DATA_OFFSET;
		copy_to_user (arg, req->buffer, request.len);
		break;

	  case WRITE:
		arg = (char *) slot->buffer + ENBD_BUFFER_DATA_OFFSET;
		err = copy_to_user_from_req (req, arg, request.len);
		if (err >= request.len)
                        break; // OK
		// PTB buffer had missing BHSs
		ENBD_ERROR ("req %p offered %d bytes of %d for copy to user\n",
		        req, result, request.len);
		// PTB this request is badly damaged. We had better shoot it.
		if (req && req->errors == 0) {
			req->errors++;
			enbd_end_request_lock (req);
                        blk_put_request (req);
		}
		ENBD_FAIL ("kernel failed to keep req while we copied from it");
		break;
          case MD5SUM:
                break;
          case SPECIAL:
                // PTB temporary. We do not treat specials at the moment.
		req->errors = 0;
                break;
          default:
		ENBD_ERROR ("req %p was type %#x\n", req, rq_type(req));
		ENBD_FAIL ("unknown req type");
                break;
	}

	/*
         * PTB enbd_accept does not take spinlock and does not need to as
	 * the req is already free of the shared queue and only needs
	 * to be placed on the unique slot queue.
         */

	enbd_accept (slot, req);

	atomic_dec (&acct->cthreads);	// PTB - client thread leaves normally 
	slot->flags &= ~ENBD_SLOT_RUNNING;

	return 0;

      error_out:
	// PTB accounting - a fail to get a request is not an errored request 
	atomic_dec (&acct->cthreads);	// PTB - client thread leaves abnormally 
	slot->flags &= ~ENBD_SLOT_RUNNING;
	result = result < 0 ? result : -ENODEV;

	return result;
}

/*
 * PTB error out the pending requests on the kernel queue
 * We have to be called WITHOUT the io request lock held, as we take
 * the queue lock ourselves.
 * We drop the lock and call schedule() in between clearing each
 * request, for "safety". At most 1000 requests are cleared per call.
 *
 *   @lo the nbd device to scan
 *
 * Returns the number of requests actually taken off the queue and
 * ended. (The tally used to be bumped inside the loop test, so it came
 * out one too high when the 1000 limit was hit or when the queue
 * yielded no request.)
 */
static int
enbd_clr_kernel_queue (struct enbd_device *lo)
{

	int count = 0;
	unsigned long flags;
	request_queue_t *q = lo->q;

	spin_lock_irqsave (q->queue_lock, flags);

	while (count < 1000 && ! blk_queue_empty(q)) {
		struct request *req;

		req = elv_next_request(q);
		if (!req) {	// PTB impossible
			spin_unlock_irqrestore (q->queue_lock, flags);
			ENBD_ALERT
			 ("impossible! kernel queue empty after tested nonemty!\n");
			goto out;
		}
		blkdev_dequeue_request (req);
		spin_unlock_irqrestore (q->queue_lock, flags);
		count++;

		// PTB make sure the request ends as an error
		if (!req->errors)
			req->errors++;
		schedule ();
		enbd_end_request_lock (req);
		blk_put_request (req);
		spin_lock_irqsave (q->queue_lock, flags);
	}
	spin_unlock_irqrestore (q->queue_lock, flags);

      out:
	ENBD_ALERT ("removed %d requests\n", count);
	return count;

}

/*
 * PTB error out the pending requests on the nbd device queue.
 * Note that we take the device queue lock for this, one request at a
 * time, and let it go again before ending each request. At most 1000
 * requests are treated per call.
 *
 *   @lo the nbd device to scan
 *
 * Returns the number of requests removed and ended.
 */
static int
enbd_clr_queue (struct enbd_device *lo)
{
	int count = 0;
	struct enbd_acct * acct = &lo->acct;

	while (count < 1000) {

		struct request *req;
		unsigned long req_blks = 1;
		unsigned long flags;

		/*
		 * PTB cannot allow new requests via interrupts, so take the
		 * lock with irqs off, the same way that enbd_rollback and
		 * enbd_get_req take it.
		 */
		write_lock_irqsave (&lo->queue_lock, flags);
		if (list_empty (&lo->queue)) {
			write_unlock_irqrestore (&lo->queue_lock, flags);
			break;
		}
		req = list_head (&lo->queue, struct request, queuelist);
		if (!req) {
			write_unlock_irqrestore (&lo->queue_lock, flags);
			break;
		}

		req_blks = nr_blks (req);

		// PTB the +1 makes the error count nonzero whatever the size
		req->errors += req_blks + 1;
		atomic_add (req_blks, &acct->requests_err);

		/* PTB - must succeed as have the spinlock */
		enbd_remove (lo, req);
		write_unlock_irqrestore (&lo->queue_lock, flags);
		count++;

		enbd_end_request_lock (req);
		blk_put_request (req);

	}
	ENBD_ALERT ("unqueued %d reqs\n", count);
	return count;
}

/*
 * PTB do under alt spinlock - we take the lo queue_lock oursekves.
 * We take all requests off the alt queue to which they've been
 * diverted and put them on the devices normal queue, where they will
 * then be treated in the normal course of events. They were diverted
 * to the alt queue after we received a SPECIAL, and they're being
 * released now that we've treated all the extant reqs.
 *
 *   @lo the nbd device being treated
 */ 
static int
enbd_requeue (struct enbd_device *lo)
{
	int count = 0;

	while (count < 1000) {

		struct request *req;

		// PTB cannot allow new requests via interrupts
		if (list_empty (&lo->altqueue)) {
			break;
		}
		req = list_tail (&lo->altqueue, struct request, queuelist);
		if (!req)
			break;

		// PTB heisenbug? without these list_del oopses on null deref
		if (req->queuelist.prev == NULL) {
			ENBD_ALERT ("req %p has 0 prev ptr! Abort\n", req);
			break;
		}
		if (req->queuelist.next == NULL) {
			ENBD_ALERT ("req %p has 0 next ptr! Abort\n", req);
			break;
		}
		/* PTB - must succeed as have the spinlock */
		list_del_init (&req->queuelist);
		/* PTB now hold irqs off in enbd_remove */
		count++;

		enbd_enqueue (lo, req);

	}
	return count;
}


/*
 * PTB from this point on ENBD_FAIL passes its arguments to ENBD_ERROR,
 * prints a newline with printk, and then jumps to the label error_out,
 * which any function using the macro must therefore provide.
 * Note that the expansion is a bare brace block and not a
 * do { } while (0), so take care when using it in an unbraced if/else.
 */
#undef ENBD_FAIL
#define ENBD_FAIL( s... ) { \
  ENBD_ERROR( s); printk("\n"); \
  goto error_out; \
}

#ifndef NO_BUFFERED_WRITES
  /*
   * Magic function from rd.c that we hope saves a buffer head
   * permanently somewhere in the kernel VM system.
   *
   * Copies the data of one buffer head into the page cache pages of
   * the device inode's mapping, creating and zeroing pages that are
   * not yet there, and marks every page it touches dirty.
   *
   *   @sbh the buffer head holding the data to be saved
   *   @nbd the index of the device in the enbd_dev array
   *
   * Returns 0 on success, -ENODEV if the device has no inode recorded,
   * or -ENOMEM if a new page cannot be obtained.
   */
static int
buffered_write_pagecache_IO (struct buffer_head *sbh, int nbd)
{
	struct address_space *mapping;
	unsigned long index;
	int offset, size, err;
	struct enbd_device *lo = &enbd_dev[nbd];
	err = 0;

	// PTB we need to save the /dev/nda inode
	if (!lo->inode) {
		err = -ENODEV;
		goto out;
	}
	mapping = lo->inode->i_mapping;

	// PTB index appears to be the page number
	index = sbh->b_rsector >> (PAGE_CACHE_SHIFT - 9);
	// PTB offset is in bytes, and says where in the page the sector starts
	offset = (sbh->b_rsector << 9) & ~PAGE_CACHE_MASK;
	// PTB well, an abbreviation for the buffer size, in bytes
	size = sbh->b_size;

	// PTB one pass per page touched by the buffer
	do {
		// PTB we mark each page that we should write to Uptodate

		int count;
		struct page **hash;
		struct page *page;
		char *src, *dst;

		// PTB set only when we had to grab a new (locked) page below
		int unlock = 0;

		// PTB how much of the page is left to traverse
		count = PAGE_CACHE_SIZE - offset;
		// PTB reduce it to how much we actually need to traverse
		if (count > size)
			count = size;
		// PTB account already for what this pass is going to copy
		size -= count;

		// PTB look for the page in the page cache first
		hash = page_hash (mapping, index);
		page = __find_get_page (mapping, index, hash);

		if (!page) {
			// PTB we get to make a new page
			page = grab_cache_page (mapping, index);
			if (!page) {
				// PTB failed to get new page
				err = -ENOMEM;
				goto out;
			}
			// PTB magic - a page with no valid contents is zeroed
			// and then declared up to date
			if (!Page_Uptodate (page)) {
				memset (kmap (page), 0, PAGE_CACHE_SIZE);
				kunmap (page);
				SetPageUptodate (page);
			}
			// PTB the new page is locked. We need to unlock it later
			unlock = 1;
		}

		// PTB prepare already for next page
		index++;

		// PTB set up for copy
		dst = kmap (page);
		dst += offset;
		// NOTE(review): src restarts at the beginning of the buffer
		// on every pass; presumably a buffer never straddles a page
		// boundary so there is only ever one pass - confirm
		src = bh_kmap (sbh);

		// PTB prepare for next round - later pages start at offset 0
		offset = 0;

		// PTB do a copy
		memcpy (dst, src, count);

		kunmap (page);
		bh_kunmap (sbh);

		if (unlock) {
			UnlockPage (page);
		}
		SetPageDirty (page);
		// PTB presumably this gives up the reference we got from
		// __find_get_page or grab_cache_page - confirm
		__free_page (page);

	} while (size > 0);

      out:
	return err;

}
/*
 * PTB save a copy of everything that a request writes. For each buffer
 * head on the request we get the buffer cache entry for the same
 * block, copy the data to it if it is a different buffer, mark it
 * protected, and then save the data in the page cache too via
 * buffered_write_pagecache_IO().
 *
 *   @req the (write) request whose buffers are to be saved
 *
 * Returns 0 on success, or the negative error code of the first
 * buffered_write_pagecache_IO() call to fail, at which point we stop.
 */
static int
buffered_write (struct request *req)
{

	struct buffer_head *bh;
	int dev = minor (req->rq_dev);
	int nbd = dev >> ENBD_SHIFT;
	int err = 0;

	// PTB go through and copy and protect the written buffers
	for (bh = req->bh; bh; bh = bh->b_reqnext) {
		struct buffer_head *rbh;
		rbh =
		 getblk (bh->b_rdev, bh->b_rsector / (bh->b_size >> 9),
			 bh->b_size);
		if (bh != rbh) {
			char *bdata = bh_kmap (bh);
			memcpy (rbh->b_data, bdata, rbh->b_size);
			// PTB unmap only what we mapped. This used to be done
			// outside the if, without a matching bh_kmap.
			bh_kunmap (bh);
			ENBD_ALERT ("got new bh sector %lu on write\n",
				   bh->b_rsector);
		}
		mark_buffer_protected (rbh);	// PTB equals dirty, uptodate
		err = buffered_write_pagecache_IO (bh, nbd);
		// PTB let go of our getblk reference on error too
		brelse (rbh);
		if (err < 0) {
			break;
		}
	}
	return err;
}

#endif		/* NO_BUFFERED_WRITES */

/*
 * PTB say whether the device is read only, going by the
 * ENBD_READ_ONLY bit in its flags
 *
 *   @lo the nbd device to be checked
 *
 * Returns 1 if the bit is set and 0 if it is not.
 */
static int
enbd_read_only(struct enbd_device *lo) {
	const int devflags = atomic_read(&lo->flags);

	if (devflags & ENBD_READ_ONLY)
		return 1;
	return 0;
}
/*
 * PTB make the device read only (or read write), both in our own
 * flags and in the disk structure via set_disk_ro
 *
 *   @lo the nbd device to be set up
 *   @ro nonzero for read only, 0 for read write.
 */
static void
enbd_set_read_only(struct enbd_device * lo, int ro) {

	const int readonly = (ro != 0);

	// PTB our own record of the state
	if (!readonly) {
		atomic_clear_mask (ENBD_READ_ONLY, &lo->flags);
	} else {
		atomic_set_mask (ENBD_READ_ONLY, &lo->flags);
	}

	// PTB which device really does not matter. We do the checking.
	set_disk_ro (lo->disk, readonly);
}

/*
 * PTB - kernel function to take reqs off the kernel queue. Runs with
 * io lock held. This is the "request function".
 *
 * Each request is checked against the state of its device and its
 * range. A good request is dequeued and, with the queue lock dropped
 * for the duration, is either put on the device queue or, if the
 * device is in the QBLOCKED state, diverted to the alt queue. A bad
 * request is dequeued and ended with an error. The queue lock is held
 * again when we return.
 *
 * ENBD_FAIL here logs its message and jumps to error_out.
 *
 *   @q the kernel request queue to drain
 */
static void
do_nbd_request (request_queue_t * q)
{
	struct request *req;
	unsigned long flags;

	while ((req = elv_next_request(q)) != NULL) {

		struct enbd_device *lo;
		struct enbd_acct * acct;
		int req_blks;

		lo = req->rq_disk->private_data;
		acct = &lo->acct;

		/* PTB - one kernel thread enters */
		atomic_inc (&acct->kthreads);
		// PTB keep the high water mark
		if (atomic_read (&acct->kthreads) > atomic_read (&acct->kmax))
			atomic_set (&acct->kmax, atomic_read (&acct->kthreads));

		if (!lo->inode || !lo->file) {
			ENBD_FAIL ("Request when device not ready.");
		}

		if (rq_data_dir (req) == WRITE && enbd_read_only(lo)) {
			ENBD_FAIL ("write on read-only device");
		}
		flags = atomic_read (&lo->flags);
		if (!(flags & ENBD_INITIALISED)) {
			ENBD_FAIL ("device not initialised.");
		}
		if (!(flags & ENBD_ENABLED)) {
			ENBD_FAIL ("device not enabled.");
		}
		if (flags & ENBD_REMOTE_INVALID) {
			ENBD_FAIL ("remote device invalidated.");
		}
		if (req->sector + req->nr_sectors > lo->sectors) {
			ENBD_FAIL ("overrange request");
		}
		if (req->sector < 0) {
			ENBD_FAIL ("underrange request");
		}
		if (req->rq_disk->major != major) {
			ENBD_FAIL ("request for wrong major");
		}
		req->errors = 0;
		blkdev_dequeue_request (req);

		// PTB in 2.5 we can release the iolock briefly here
		spin_unlock_irq(q->queue_lock);

		if (req->flags & REQ_SPECIAL) {
			// PTB temporary successful end here for SPECIALS

			// PTB we want to attach it to the device and ack later
			enbd_enqueue (lo, req);
			// PTB block further reqs until these have drained
			write_lock(&lo->altqueue_lock);
			// PTB do not touch this flag without this lock
			if (atomic_read(&acct->countq[READ])
			  + atomic_read(&acct->countq[WRITE]) > 0) {
				atomic_set_mask(ENBD_QBLOCKED, &lo->flags);
			}
			write_unlock(&lo->altqueue_lock);
			goto accounting;
		}

		// PTB we are the only reader and writer of lo->seqno
		if (rq_data_dir (req) == WRITE && rq_seqno (req) == 0) {
			// PTB it is a new request never seen before
			struct enbd_seqno * seqno_out = &lo->seqno_out;
			seqno_out->inc(seqno_out);
			/*
			 * PTB we have to be careful to change this back before
			 * giving it back to the kernel, as the kernel uses it.
			 * We patch it back again in enbd_end_request.
			 */
			rq_set_seqno (req, seqno_out->get(seqno_out));
		}

		// if BLOCK is set divert requests to alt queue
		write_lock(&lo->altqueue_lock);
		if (atomic_read(&lo->flags) & ENBD_QBLOCKED) {
			list_add (&req->queuelist, &lo->altqueue);
			write_unlock(&lo->altqueue_lock);
			goto accounting;
		}
		write_unlock(&lo->altqueue_lock);

		// PTB normal sequence is to queue request locally
		enbd_enqueue (lo, req);
		goto accounting;

	      accounting:
		atomic_dec (&acct->kthreads);
		// PTB regain the iolock for another turn
		spin_lock_irq(q->queue_lock);
		continue;	// PTB next request

	      error_out:
		// PTB can rely on req being nonnull here
		ENBD_ALERT ("ending req %p with prejudice\n", req);
		req->errors++;
		blkdev_dequeue_request (req);
		spin_unlock_irq(q->queue_lock);

		/*
		 * PTB size the request up for the accounting while it is
		 * still ours. This used to be done after the request had
		 * been ended and put back, when it may no longer be looked at.
		 */
		req_blks = nr_blks (req);

		enbd_end_request_lock (req);
		blk_put_request (req);

		// PTB more accounting. lo was already used above, so it
		// needs no test for null here.
		atomic_add (req_blks, &acct->requests_err);
		atomic_dec (&acct->kthreads);

		// PTB regain the queue lock for another turn
		spin_lock_irq(q->queue_lock);
		continue;
	}
	return;
}

/*
 * PTB test and set the bit that a mask selects in an atomic element.
 * Only good for 1 bit in the mask, however: the lowest set bit of the
 * mask is the one used. Modify if you want more.
 *
 * The counter is handed to test_and_set_bit as if it were an unsigned
 * long, so on bigendian machines the bit number is offset by however
 * many bits a long is wider than the counter.
 *
 *   @a the atomic element's address
 *   @mask the integer with one bit set in the position that we want to
 *         test and set
 *
 * Returns what test_and_set_bit returns for the bit, or -EINVAL if the
 * mask is empty.
 */
static int
atomic_test_and_set_mask (atomic_t * a, unsigned mask)
{
	int i = ffs (mask);
	if (!i)
		return -EINVAL;
                // PTB gahhhh ...
        #ifdef __LITTLE_ENDIAN
	        return test_and_set_bit (i - 1, (unsigned long *)&a->counter);
        #else
                // PTB the partner of __LITTLE_ENDIAN is __BIG_ENDIAN. The
                // test used to be on a misspelt name and so always failed.
                #ifndef __BIG_ENDIAN
                #error help, I only know about bigendian or littlendian machines
                #endif
	        return test_and_set_bit
                    (i - 1 + (sizeof(long)-sizeof(a->counter))*8,
                        (unsigned long *)&a->counter);
        #endif
}
/*
 * PTB atomically test and clear one flag bit held in an atomic_t.
 * Only the lowest set bit of the mask is treated.
 *
 *   @a the atomic element's address
 *   @mask the integer with one bit set in the position that we want to
 *         test and clear
 *
 * Returns whatever test_and_clear_bit() reports for that bit, or 0 if
 * the mask has no bit set at all.
 */
static int
atomic_test_and_clear_mask (atomic_t * a, unsigned mask)
{
	int bit = ffs (mask);	// 1-based index of the lowest set bit, or 0

	if (!bit)
		return 0;

	/*
	 * PTB gahhhh ... the bitops address an unsigned long, so on a
	 * bigendian machine we have to step over however many bits the
	 * long is wider than the counter.
	 */
#if defined(__LITTLE_ENDIAN)
	return test_and_clear_bit (bit - 1, (unsigned long *) &a->counter);
#elif defined(__BIG_ENDIAN) || defined(__BIGENDIAN)
	return test_and_clear_bit
	    (bit - 1 + (sizeof (long) - sizeof (a->counter)) * 8,
	     (unsigned long *) &a->counter);
#else
#error help, I only know about bigendian or littlendian machines
#endif
}


/*
 * PTB - set the enabled flag on a device. Call without the spinlock
 * held, as we take the meta lock ourselves.
 *
 * If the device was not enabled before, we also drop its VALIDATED
 * flag and count one more life, and then, once the lock has been
 * released, call the md notify hook for the device.
 *
 *   @lo the nbd device being treated
 */
static void
enbd_enable (struct enbd_device *lo)
{
	struct enbd_md *md = &enbd_md;
	unsigned long irqflags;
	int newly_enabled;

	write_lock_irqsave (&lo->meta_lock, irqflags);
	// PTB nonzero exactly when the flag was not set before
	newly_enabled = !atomic_test_and_set_mask (&lo->flags, ENBD_ENABLED);
	if (newly_enabled) {
		atomic_clear_mask (ENBD_VALIDATED, &lo->flags);
		lo->lives++;
	}
	write_unlock_irqrestore (&lo->meta_lock, irqflags);

	if (!newly_enabled) {
		return;
	}
	md->notify (md, mk_kdev (major, lo->nbd << ENBD_SHIFT));
}


/*
 * PTB roll back all requests on a given slot and then invalidate it
 * (so the requests can't go back until somebody reactivates the slot).
 * Rollback (which we call) takes both the io spinlock and our
 * spinlock, so we can hold neither when we are called. Soft_reset
 * (which we may call) also calls rollback, so has the same problem.
 *
 *   @slot the nbd slot being treated
 *
 * Always returns 0.
 */
static int
enbd_clr_sock (struct enbd_slot *slot)
{
	struct enbd_device *lo = slot->lo;
	int my_index = slot->i;
	unsigned long irqflags;
	int want_reset = 0;
	int want_enable = 0;
	int k;
	static int enbd_soft_reset (struct enbd_device*);

	enbd_rollback_all (slot);

	// PTB wipe the slot
	slot->file = NULL;
	slot->buffer = NULL;
	slot->bufsiz = 0;
	slot->flags = 0;

	write_lock_irqsave (&lo->meta_lock, irqflags);

	/* PTB recount lo->aslot the hard way, from the slots with a file */
	if (lo->aslot > 0) {
		int live = 0;

		for (k = 0; k < lo->nslot; k++) {
			if (lo->slots[k].file)
				live++;
		}
		lo->aslot = live;

		if (lo->aslot > 0) {
			// PTB must not call reenable as that clears the queue
			if (!(atomic_read (&lo->flags) & ENBD_ENABLED)) {
				want_enable = 1;
			}
		} else if (atomic_read (&lo->flags) & ENBD_SHOW_ERRS) {
			// PTB we were the last client alive, disable device.
			// PTB soft_reset will invalidate_buffers
			atomic_clear_mask (ENBD_ENABLED, &lo->flags);
			want_reset = 1;
		}
	}

	// PTB lift the lock temporarily while we call out
	write_unlock_irqrestore (&lo->meta_lock, irqflags);
	if (want_reset) {
		enbd_soft_reset (lo);
	}
	if (want_enable) {
		enbd_enable (lo);
		ENBD_ALERT ("enabled device nd%s\n", lo->devnam);
	}
	write_lock_irqsave (&lo->meta_lock, irqflags);

	/*
	 * PTB reset lo->islot, for no good reason: if it pointed at us,
	 * step it round (wrapping at nslot) until it lands on a slot
	 * with a file, giving up after nslot steps.
	 */
	if (atomic_read (&lo->islot) == my_index) {
		for (k = 0; k < lo->nslot; k++) {
			atomic_inc (&lo->islot);
			if (atomic_read (&lo->islot) >= lo->nslot)
				atomic_set (&lo->islot, 0);
			if (lo->slots[atomic_read (&lo->islot)].file)
				break;
		}
	}
	lo->harderror = 0;
	write_unlock_irqrestore (&lo->meta_lock, irqflags);

	/* PTB don't clear whole device queue as we might still be open */

	return 0;
}

/*
 * PTB - check all slots for old requests and roll them back.
 * Rollback (which we call) takes both the io spinlock and our
 * spinlock, so we can hold neither when we are called.
 *
 * A slot counts as old when its req_age is set (greater than zero) and
 * lies more than req_timeo * HZ jiffies in the past.
 *
 *   @lo the nbd device to scan
 */
static void
enbd_rollback_old (struct enbd_device *lo)
{
	int islot;

	for (islot = 0; islot < lo->nslot; islot++) {
		struct enbd_slot *slot = &lo->slots[islot];
		unsigned long cutoff;

		// PTB nothing to age on this slot
		if (slot->req_age <= 0)
			continue;

		/*
		 * Compare via the signed difference (the same test as
		 * time_before) so that the result stays right when the
		 * jiffies counter, or the subtraction below, wraps.
		 */
		cutoff = jiffies - lo->req_timeo * HZ;
		if ((long) ((unsigned long) slot->req_age - cutoff) < 0) {
			enbd_rollback_all (slot);
		}
	}
}

/*
 * PTB - register a socket to a slot.
 *     - Return 0 for success and -ve for failure.
 *       Nowadays this doesn't do very much! Just finalizes things.
 *
 *       @slot  the nbd slot being registered
 */
static int
enbd_set_sock (struct enbd_slot *slot, int arg)
{

	struct enbd_device *lo = slot->lo;
	int islot = slot->i;
	unsigned long flags;
        int do_enable = 0;

	if (!(atomic_read (&lo->flags) & ENBD_INITIALISED)) {
		ENBD_ALERT ("(%d) device nd%s not initialised yet!\n",
			   islot, lo->devnam);
		return -ENODEV;
	}
	if (!(atomic_read (&lo->flags) & ENBD_SIZED)) {
		ENBD_ALERT ("(%d) device nd%s not sized yet!\n", islot,
                        lo->devnam);
		return -EINVAL;
	}
	if (!(atomic_read (&lo->flags) & ENBD_BLKSIZED)) {
		ENBD_ALERT ("(%d) device nd%s not blksized yet!\n", islot,
                        lo->devnam);
		return -EINVAL;
	}
	if (!(atomic_read (&lo->flags) & ENBD_SIGNED)) {
		ENBD_ALERT ("(%d) setting unsigned device nd%s! But harmless.\n",
			   islot, lo->devnam);
		return -EINVAL;
	}

	down (&lo->pid_sem);

	if (slot->pid != current->pid) {
                if (jiffies > slot->cli_age + 2 * HZ * lo->req_timeo) {
		        ENBD_ALERT
		        ("(%d) dead client process %d has nd%s%d, erasing pid!\n",
                         islot, slot->pid, lo->devnam, islot + 1);
                        slot->pid = 0;
                } else {
		        ENBD_ALERT
		        ("(%d) other live client process %d has nd%s%d!\n",
                         islot, slot->pid, lo->devnam, islot + 1);
                }
		up (&lo->pid_sem);
		return -EINVAL;
	}
	up (&lo->pid_sem);

	slot = &lo->slots[islot];

	// PTB this is a queue critical code region for the flags business
	write_lock_irqsave (&lo->meta_lock, flags);

	// PTB file has to be nonzero to indicate we are all set up. 
        slot->file = (void *) (unsigned long) (arg+1 > 0 ? arg+1 : 1);

	if (islot >= lo->nslot) {
		lo->nslot = islot + 1;
		ENBD_INFO ("increased socket count to %d\n", lo->nslot);
	}

	lo->harderror = 0;

        if (lo->disk && !get_capacity(lo->disk)) {
                set_capacity(lo->disk, lo->sectors);
        }
	if (++lo->aslot > 0) {
                do_enable = 1;
        }
	// PTB end of queue critical region
	write_unlock_irqrestore (&lo->meta_lock, flags);

        /*
	 * PTB if this is the first slot, we might call reenable and
	 * thus clr queue too, but reenable takes the spinlock
         */
        if (do_enable)
                enbd_enable(lo);
        
	return 0;
}

/*
 * PTB - integer logarithm to base 2, rounded down: for an arg of the
 * form 2^i + j with 0 <= j < 2^i, return the index i. An arg of 0
 * gives 0 too.
 */
static inline unsigned
log2 (unsigned arg)
{
	unsigned halvings;

	// count how many times we can halve before reaching 1 (or 0)
	for (halvings = 0; arg > 1; arg >>= 1)
		halvings++;
	return halvings;
}

/*
 * PTB - set the blksize in bytes of the block device. Return 0 for
 *     - success and -ve for failure.
 *
 *       The size has to be a power of two, no less than 512 and no
 *       more than PAGE_SIZE, or we return -EINVAL and change nothing.
 *
 *       @lo   the nbd device being treated
 *       @arg  the new block size in bytes
 */
static int
enbd_set_blksize (struct enbd_device *lo, unsigned int arg)
{
	int nbd = lo->nbd;

	// PTB (arg & (arg - 1)) is nonzero unless arg is a power of two
	if (arg > PAGE_SIZE || arg < 512 || (arg & (arg - 1))) {
		ENBD_ERROR ("bad blksize (%u): need a power of 2 from 512 to PAGE_SIZE\n",
			   arg);
		return -EINVAL;
	}
	lo->blksize = enbd_blksizes[nbd << ENBD_SHIFT] = arg;
	lo->logblksize = log2 (lo->blksize);
	// PTB we may have no inode or bdev to tell, just as in set_size
	if (lo->inode && lo->inode->i_bdev) {
		set_blocksize (lo->inode->i_bdev, lo->blksize);
	}
	atomic_set_mask (ENBD_BLKSIZED, &lo->flags);
	return 0;
}

/*
 * PTB - set the size in bytes of the block device. Return 0 for
 *     - success and -ve for failure (at present we always succeed).
 *
 *       We record the size in bytes, in KB and in 512B sectors, and
 *       pass it on to the block device inode and to the disk when
 *       those exist.
 *
 *       @lo   the nbd device being treated
 *       @arg  the new size in bytes
 */
static int
enbd_set_size (struct enbd_device *lo, __u64 arg)
{
	int nbd = lo->nbd;

	lo->bytesize = enbd_bytesizes[nbd << ENBD_SHIFT] = arg;
	lo->size     = enbd_sizes[nbd << ENBD_SHIFT] = arg >> 10;
	/*
	 * Count sectors from the byte size and not from the KB count, so
	 * that an odd trailing sector is not lost and we agree with the
	 * capacity given to the disk below.
	 */
	lo->sectors  = arg >> 9;
	if (lo->inode && lo->inode->i_bdev && lo->inode->i_bdev->bd_inode)
		lo->inode->i_bdev->bd_inode->i_size = arg;
	if (lo->disk)
		set_capacity (lo->disk, arg >> 9);
	atomic_set_mask (ENBD_SIZED, &lo->flags);
	return 0;
}

/*
 * WG - set the pulse interval / request timeout of the device.
 *    - Return 0 for success and -EINVAL if the value is not positive.
 */
static int
enbd_set_intvl (struct enbd_device *lo, int arg)
{
	if (arg > 0) {
		lo->req_timeo = arg;
		return 0;
	}

	ENBD_ERROR ("bad pulse interval/req timeout value (%d)\n", arg);
	return -EINVAL;
}

/*
 * PTB - record the spid for a slot. The value must be non-negative and
 *     - fit in the bits of a short, or we return -EINVAL.
 *     - Return 0 for success.
 */
static int
enbd_set_spid (struct enbd_slot *slot, int arg)
{
	const int limit = 1 << (sizeof (short) * 8);

	if (arg < 0 || arg >= limit) {
		ENBD_ERROR ("bad spid value (%d)\n", arg);
		return -EINVAL;
	}

	slot->spid = (short) arg;
	return 0;
}

/*
 * PTB - set (arg nonzero) or clear (arg zero) the BUFFERWR flag on the
 *     - device. Always returns 0.
 */
static int
enbd_set_bufferwr (struct enbd_device *lo, int arg)
{
	if (!arg) {
		atomic_clear_mask (ENBD_BUFFERWR, &lo->flags);
		return 0;
	}

	atomic_set_mask (ENBD_BUFFERWR, &lo->flags);
	return 0;
}

/*
 * PTB - set (arg nonzero) or clear (arg zero) the REMOTE_INVALID flag
 *     - on the device. Always returns 0.
 *
 * We handle the event ourself exactly when it happens instead of
 * letting the kernel have check_media defined and doing it there
 * (and reporting 0 to the kernel).
 */
static int
enbd_set_remote_invalid (struct enbd_device *lo, int arg)
{
	kdev_t dev = mk_kdev (major, lo->nbd << ENBD_SHIFT);
	unsigned long irqflags;
	int newly_invalid;

	if (!arg) {
		atomic_clear_mask (ENBD_REMOTE_INVALID, &lo->flags);
		return 0;
	}

	write_lock_irqsave (&lo->meta_lock, irqflags);
	newly_invalid =
	    !atomic_test_and_set_mask (&lo->flags, ENBD_REMOTE_INVALID);
	if (newly_invalid) {
		/*
		 * PTB this tells the kernel that next open should cause
		 * recheck .. we'll agree not to say we're happy until
		 * VALID is set again
		 */
		atomic_clear_mask (ENBD_VALIDATED, &lo->flags);
	}
	write_unlock_irqrestore (&lo->meta_lock, irqflags);

	// PTB the flag was already up, so there is nothing more to do
	if (!newly_invalid) {
		return 0;
	}

	// PTB destroy buffers (test removing partitions)
	__invalidate_buffers (dev, 1);
	ENBD_ALERT ("invalidating remote on nd%s\n", lo->devnam);
	// PTB - clear buffers now instead of waiting for kernel
	// PTB that will cause requests to start being errored
	invalidate_device (dev, 0);

	return 0;
}
/*
 * Return the first slot index free when asking for n new ones.
 * If there s no such gap, then ENBD_MAXCONN will be returned.
 * The return is always in the same argument address.
 */
static int
enbd_get_nport (struct enbd_device *lo, int *arg)
{
	int err, nslot, i;

	if (arg == NULL) {
		return -EINVAL;
	}

	nslot = *arg;
	err = copy_from_user ((char *) &nslot, arg, sizeof (int));
	if (err < 0) {
		return err;
	}

	for (i = 0; i < ENBD_MAXCONN; i++) {
		struct enbd_slot *sloti = &lo->slots[i];
		int j;
		if (sloti->file) {
			continue;
		}

		for (j = i; j < ENBD_MAXCONN && j < i + nslot; j++) {
			if (sloti->file)
				break;
		}
		if (j == i + nslot) {

			break;
		}
	}

	err = copy_to_user (arg, (char *) &i, sizeof (int));
	return err;
}


/*
 * PTB - if we're not signed, accept new sig and return success.
 *     - if we are signed, compare the offer and return success if equal,
 *     - and -ve for failure.
 *
 *       @slot the slot we're working on
 *       @sig  the string of signature chars (accessed as int *)
 */
static int
enbd_set_sig (struct enbd_slot *slot, int *sig)
{
	int err = 0;
	int buf[ENBD_SIGLEN / sizeof (int)];
	int islot = slot->i;
	struct enbd_device *lo = slot->lo;

	if (!access_ok (VERIFY_READ, (char *) sig, ENBD_SIGLEN)) {
		ENBD_ALERT ("(%d): failed sigcheck with bad user address %p\n",
                          islot, sig);
		err = -EINVAL;
		return err;
	}
	down (&lo->pid_sem);

	if (slot->pid == 0) {
		slot->pid = current->pid;
	        slot->cli_age = jiffies;
	}
	if (slot->pid != current->pid) {
                if (jiffies > slot->cli_age + 2 * HZ * lo->req_timeo) {
		        ENBD_ALERT
		        ("(%d): dead process %d was setting sig, erasing pid\n",
		        islot, slot->pid);
                        slot->pid = 0;
                } else {
		        ENBD_ALERT
		        ("(%d): live process %d is trying to set sig\n",
		        islot, slot->pid);
                }
		up (&lo->pid_sem);
		return -EINVAL;
	}

	if (!(atomic_read (&lo->flags) & ENBD_SIGNED)) {
		/* PTB first time grab sig */
		copy_from_user ((char *) lo->signature, (char *) &sig[0],
				ENBD_SIGLEN);
		atomic_set_mask (ENBD_SIGNED, &lo->flags);
		up (&lo->pid_sem);
		return 0;
	}
	copy_from_user ((char *) buf, (char *) &sig[0], ENBD_SIGLEN);

	/* PTB test for equality */

	if (memcmp (&buf[0], &lo->signature[0], ENBD_SIGLEN / sizeof (int))
	    != 0) {
		err = -EINVAL;
		up (&lo->pid_sem);
		ENBD_ALERT ("(%d): failed sigcheck wth %d\n", islot, err);
		return err;
	}
	up (&lo->pid_sem);
	err = 0;
	return err;
}

/*
 * PTB - register a userspace buffer to a slot. Return 0 for success
 *     - and -ve for failure. Null arg acts as erase.
 *
 *       The buffer is taken to be max_sectors sectors long, and must
 *       pass a write access check over that length or we return
 *       -EINVAL.
 *
 *       @slot   the slot the buffer is for
 *       @buffer the user address of the buffer, or NULL to unregister
 */
static int
enbd_reg_buf (struct enbd_slot *slot, char *buffer)
{
	struct enbd_device *lo = slot->lo;
	int nbytes;

	if (buffer == NULL) {
		// PTB erase
		slot->buffer = NULL;
		slot->bufsiz = 0;
		slot->flags &= ~ENBD_SLOT_BUFFERED;
		return 0;
	}

	nbytes = lo->max_sectors << 9;

	/* verify the buffer is in the process space */
	if (!access_ok (VERIFY_WRITE, buffer, nbytes)) {
		return -EINVAL;
	}

	/* PTB hope the buffer is as big as it should be - FIXME */
	slot->buffer = buffer;
	slot->bufsiz = nbytes;

	/* PTB let the device bufsiz be min of registered nonzero bufsizes */
	if (!lo->bufsiz || lo->bufsiz > nbytes) {
		lo->bufsiz = nbytes;
	}

	// PTB just in case the buffer really is small, we reset all the
	//     kernels request maxima if we have to adjust the device max
	// NOTE(review): this only fires when max_sectors is BELOW what
	// bufsiz allows, which looks at odds with the "small buffer"
	// remark above - confirm the direction of the test
	if (lo->max_sectors < (lo->bufsiz >> 9)) {
		int k;

		lo->max_sectors = lo->bufsiz >> 9;
		for (k = 0; k < ENBD_MAXCONN; k++) {
			enbd_max_sectors[(lo->nbd << ENBD_SHIFT) + k] =
			 lo->max_sectors;
		}
	}

	slot->flags |= ENBD_SLOT_BUFFERED;
	return 0;
}

/*
 * PTB - this unsets the enabled flag on the device, calls the md
 *     - unnotify hook for it, and unsets the validated flag too, so
 *     - that partitions get rechecked on next open.
 *     - Call without spinlock. Always returns 0.
 *
 *       @lo the nbd device to disable
 */
static int
enbd_disable (struct enbd_device *lo)
{
	struct enbd_md *md = &enbd_md;
	int usable;

	usable = lo && (atomic_read (&lo->flags) & ENBD_INITIALISED);
	if (!usable) {
		ENBD_ALERT("enbd_disable called on bad device\n");
		return 0;
	}

	// PTB only make a noise if we really changed something
	if (atomic_test_and_clear_mask (&lo->flags, ENBD_ENABLED)) {
		ENBD_ALERT ("disabled device nd%s\n", lo->devnam);
	}

	md->unnotify (md, mk_kdev (major, lo->nbd << ENBD_SHIFT));

	// PTB have to recheck partitions on next open
	if (atomic_test_and_clear_mask (&lo->flags, ENBD_VALIDATED)) {
		ENBD_ALERT ("invalidated device nd%s\n", lo->devnam);
	}

	return 0;
}


/*
 * PTB - clear all queues on a disabled device and then enable it
 * again ( call without the spinlock held ). Does nothing if the
 * device is not initialised, has no active slots, or is already
 * enabled.
 */
static void
enbd_reenable (struct enbd_device *lo)
{
	if (!(atomic_read (&lo->flags) & ENBD_INITIALISED) || lo->aslot <= 0) {
		return;
	}
	if (atomic_read (&lo->flags) & ENBD_ENABLED) {
		return;
	}

	// PTB we have no use for the values these two return
	(void) enbd_clr_queue (lo);
	// PTB - have to call clr_kernel_queue without the io_spinlock held
	(void) enbd_clr_kernel_queue (lo);

	enbd_enable (lo);
}

/*
 *  This function books a delayed reenable of the device, by recording
 *  the time in jiffies at which it falls due. If a time is already
 *  recorded (nonzero) then it is left alone. Nothing is reenabled
 *  here; the MY_NBD_SYNC ioctl is what looks at the recorded time.
 *
 *    @lo    the nbd device to reenable later
 *    @delay how long to wait, multiplied by HZ to get jiffies
 *
 *  Always returns 0.
 */
static int
enbd_reenable_delay (struct enbd_device *lo, int delay)
{
	write_lock (&lo->meta_lock);
	if (!lo->reenable_time) {
		lo->reenable_time = jiffies + delay * HZ;
	}
	write_unlock (&lo->meta_lock);

	return 0;
}



/*
 * PTB - drains device queue. Disables device.
 * Rollback (which we call) takes both the io spinlock and our
 * spinlock, so we can hold neither when we are called. Also
 * invalidate buffers, on request of Rogier Wolff.
 *
 * Returns -EINVAL if the device is not initialised or has no slots,
 * and 0 otherwise.
 */
static int
enbd_soft_reset (struct enbd_device *lo)
{
	const int max_clrq_retries = 100;
	int islot;
	int tries;

	if (!(atomic_read (&lo->flags) & ENBD_INITIALISED) || lo->nslot <= 0) {
		return -EINVAL;
	}

	/*
	 * PTB We push back the requests in the slot, in order to be able to
	 * vamoosh them in a moment. This is a race, surely? We ought to
	 * do this atomically or disable the slots first.
	 */
	for (islot = 0; islot < lo->nslot; islot++) {
		enbd_rollback_all (&lo->slots[islot]);
	}

	// PTB disable unsets the enabled flag
	enbd_disable (lo);

	// PTB keep clearing the queue until it reports nothing more,
	// but do not try for ever
	for (tries = 0; tries < max_clrq_retries; tries++) {
		if (enbd_clr_queue (lo) <= 0)
			break;
	}
	// PTB this would unsign the device: lo->flags &= ~ENBD_SIGNED;

	/*
	 * PTB put back invalidate buffers for use when called from
	 * clr_sock from enbd_release on request of Rogier Wolff.
	 */
	for (islot = 0; islot < lo->nslot; islot++) {
		invalidate_buffers (mk_kdev(major, (lo->nbd << ENBD_SHIFT) + islot));
	}

	return 0;
}

/*
 * PTB - added a device/module reset for tidyness in face of rampant hacking
 *     - this does a soft_reset of every device that has a file and an
 *     - inode and is initialised, followed by a clr_sock on each of
 *     - its slots, and then clears the kernel queue for it.
 *       We have to be called without either the io spinlock or our
 *       spinlock held, as we call soft_reset which takes both, as
 *       does clr_sock
 *
 *       @lo not used: all the devices are treated, whichever is given
 *
 *       Always returns 0.
 */
int
enbd_hard_reset (struct enbd_device *lo)
{
	int idev;

	for (idev = 0; idev < MAX_NBD; idev++) {
		struct enbd_device *dev = &enbd_dev[idev];
		int islot;

		// PTB skip devices that are not open or not set up
		if (!dev->file || !dev->inode)
			continue;
		if (!(atomic_read (&dev->flags) & ENBD_INITIALISED))
			continue;

		enbd_soft_reset (dev);
		for (islot = 0; islot < dev->nslot; islot++) {
			//  PTB this takes the io spinlock and our spinlock.
			enbd_clr_sock (&dev->slots[islot]);
		}
		// PTB - call clr_kernel_queue without the io_spinlock held
		enbd_clr_kernel_queue (dev);
	}

	return 0;
}

/*
 * PTB - prepare a request to carry the data of an indirect ioctl.
 *
 * We ask the remote ioctl module how many bytes the ioctl uses, kmalloc
 * a buffer of that size rounded up to whole sectors, hang it on the
 * request, and, if the ioctl direction includes _IOC_WRITE, copy the
 * user data into it.
 *
 *   @req  the request to prepare (buffer and nr_sectors get set)
 *   @cmd  the ioctl command
 *   @buf  the user address given as the ioctl argument
 *
 * Returns the size in bytes (0 if there is no data, in which case the
 * request gets no buffer), or -ve for failure: -EINVAL if there is no
 * remote ioctl module or the size lookup fails, -ENOMEM if kmalloc
 * fails, or the error from the copy. On failure the request is left
 * with no buffer and no sectors.
 */
static int
indirect_ioctl_load (struct request *req, int cmd, char * buf)
{
	struct enbd_ioctl *remote_ioctl = enbd_remote_ioctl.remote;
	int size;
	int err;

	if (remote_ioctl == NULL)
		return -EINVAL;

	size = remote_ioctl->size_user (cmd, buf);
	if (size < 0) {
		// PTB unauthorized ioctl
		err = -EINVAL;
		goto fail;
	}
	if (size == 0) {
		// PTB we never use the nbd devices small buffer now
		req->nr_sectors = 0;
		req->buffer = NULL;
		return 0;
	}

	// PTB we have to use an extra buffer or else block
	// here and rendezvous directly with the get_req call
	req->nr_sectors = (size + 511) >> 9;
	req->buffer = kmalloc (req->nr_sectors << 9, GFP_KERNEL);
	if (req->buffer == NULL) {
		err = -ENOMEM;
		goto fail;
	}

	// PTB nothing to bring in unless the ioctl writes
	if (!(_IOC_DIR (cmd) & _IOC_WRITE))
		return size;

	err = remote_ioctl->cp_from_user (cmd, req->buffer, buf, size);
	if (err >= 0)
		return size;
	kfree (req->buffer);

fail:
	req->buffer = NULL;
	req->nr_sectors = 0;
	return err;
}

static int
indirect_ioctl_store (struct request *req, int cmd, char * buf,
		      int size)
{
        int err;
        struct enbd_ioctl * remote_ioctl = enbd_remote_ioctl.remote;

        if (!remote_ioctl)
                return -EINVAL;

        if (size <= 0)
                return size;

	// PTB if we are reading, it should be to the local buffer
	// PTB the buffer points at a kmalloced area
        
        if (!req->buffer)
                return -ENOMEM;
	err = remote_ioctl->cp_to_user (cmd, buf, req->buffer, size);
	kfree (req->buffer);
	if (err < size)
		return -ENOMEM;
	return size;
}

static int
do_nbd_remote_ioctl(struct enbd_device *lo, int minor, int cmd, unsigned long arg) {

	unsigned start_time, timeout;
	size_t size;
        int err;
        struct request * req;
        struct completion x;
        struct enbd_acct * acct = &lo->acct;

	/*
         * PTB here we have to treat remote ioctls. We should probably make
	 * a request and put it on the local queue, but where can we get
	 * the request from? We might have to keep one in reserve.
	 * That's not a bad idea, because
	 * we generate it here and we delete it here, and the daemon code
	 * is all set up to read that sort of thing. So that's what we do ...
         */

	timeout = lo->req_timeo * HZ;
	start_time = jiffies;

        while (!(req = blk_get_request(lo->q,WRITE,0))) {
		if (jiffies >= start_time + timeout) {
			// PTB it takes too long
			ENBD_ALERT
			 ("took too long to get a spare ioctl req: TIMEOUT\n");
			return -ETIME;
		}
		err = interruptible_sleep_on_timeout (&lo->req_wq,
						      start_time +
						      timeout - jiffies);
	}

	set_rq_type(req, IOCTL);

	req->errors = 0;

	// PTB this is the fixed-up command
	req->special = (void *) cmd;

	/*
         * PTB this is (arg if it is direct, else) the address of a local buffer
	 * PTB we need to store the arg or its dereference somewhere local
	 * for a while until the cnb-client thread can enter and pick it
	 * up. The alternative is to block the ioctl here until it is
	 * picked up, which IS possible.
         */
        
	if (_IOC_DIR (cmd) & _IOC_READ) {
		// PTB indirect
                size = indirect_ioctl_load (req, cmd, (char *)arg);
                if (size < 0) {
                    goto end;
                }
	} else {
		// PTB direct - we just need to remember the value
		size = 0;
		req->buffer = (char *) arg;
	}

	// PTB point the request buffer vaguely in the direction of where
	// the data is, but it does not matter.
	req->rq_disk = lo->disk;

	// PTB we queue the request for treatment and wait till treated
        init_completion(&x);
        req->waiting = &x;
	enbd_enqueue (lo, req);

        for (err = 0; err <= 0; err = wait_for_completion_timeout(&x, 1)) {

                /*
                 * PTB on slot or queue? Don't know.  Only want
                 * to vamoosh it if its on queue, not slot
                 */
	        struct list_head *pos;
                int time_left = start_time + timeout - jiffies;
                // PTB del_req will be run with queue_lock held
                static void delete_req(void) {

                        // PTB change countq only under this lock
                        if (! (req->flags & REQ_SPECIAL)) {
                                write_lock(&lo->altqueue_lock);
                                // PTB reverse inadvertent accounting in enqueue
                                atomic_dec (&acct->countq[rq_data_dir(req)]);
                                write_unlock(&lo->altqueue_lock);
                        }

			list_del_init (&req->queuelist);

			req->errors = -ETIME;
	                if (req->nr_sectors > 0 && req->buffer) {
			        kfree (req->buffer);
                                req->buffer = NULL;
			}
                };

		if (time_left > 0)
                        continue;

                // PTB find req on list and delete it
                write_lock (&lo->queue_lock);
	        list_for_each (pos, &lo->queue) {

			if (req != list_entry (pos, struct request, queuelist)) 
                                continue;

                        delete_req ();
			write_unlock (&lo->queue_lock);
			ENBD_ALERT
			  ("took too long to treat queued ioctl: TIMEOUT\n");
			err = -ETIME;
                        goto end;
                }
	        write_unlock (&lo->queue_lock);

	} // end while loop


	if (_IOC_DIR (cmd) & _IOC_READ) {
                err = indirect_ioctl_store(req, cmd, (char *)arg, size);
                if (err < 0) {
                    goto end;
                }
	}

	if (req->errors != 0) {
                err = req->errors;
		err = err < 0 ? err : -EINVAL;
	} else {
                err = 0;
        }
end:
        blk_put_request(req);
	return err;

}

/*
 * PTB - look for the slot that has the given pid recorded in it.
 *     - Return the index of the first such slot, or -1 if none of the
 *     - ENBD_MAXCONN slots on the device has it.
 */
static int
find_slot (struct enbd_device *lo, int pid)
{
	int islot;

	for (islot = 0; islot < ENBD_MAXCONN; islot++) {
		if (lo->slots[islot].pid == pid)
			return islot;	// found it
	}
	// not found
	return -1;
}

/*
 * PTB - work out which slot an ioctl is meant for when the minor does
 * not tell us. Depending on the ioctl we either match the calling
 * process against the pids recorded in the slots, or read a slot
 * number out of the user buffer passed as argument.
 *
 *   @lo    the nbd device
 *   @islot the slot index known so far (its value is not consulted)
 *   @cmd   the ioctl command
 *   @arg   address of the ioctl argument; it MAY BE CHANGED, to mask
 *          off the high bits (SET_SOCK, SET_SPID) or to step past the
 *          slot number at the start of a buffer (REG_BUF, SET_SIG)
 *
 * Returns the slot index, or -1 if we can't determine it or if the
 * ioctl is not one of those we know how to treat.
 */
static int
fixup_slot (struct enbd_device *lo, int islot, unsigned int cmd, unsigned long *arg)
{
	const int short_mask = (1 << (sizeof (short) * 8)) - 1;
	int found;
	int intval;

	switch (cmd) {

		// PTB get slot info from parameter if not given
	  case ENBD_CLEAR_SOCK:
	  case MY_NBD_CLR_REQ:
	  case MY_NBD_ERR_REQ:
		// see if we match a known slot pid
		if (arg && *arg == 0) {
			found = find_slot (lo, current->pid);
			if (found >= 0)
				return found;
		}
		ENBD_ALERT
		 ("failed to find slot for pid %d for ioctl %x arg %lx\n",
		  current->pid, cmd, *arg);
		return -1;

		// PTB get the slot from the 16 high bits
		// NOTE(review): as written, nonzero high bits are not used
		// as a slot number but lead straight to failure - confirm
		// that is what is wanted
	  case ENBD_SET_SOCK:
	  case MY_NBD_SET_SPID:
		intval = *arg >> ((sizeof (int) - sizeof (short)) * 8);
		intval &= short_mask;
		if (intval == 0) {
			// no clue in the pid high bits. Search
			found = find_slot (lo, current->pid);
			if (found >= 0) {
				// PTB change arg !!
				*arg &= short_mask;
				return found; // found it
			}
			// not found
		}
		ENBD_ALERT
		    ("failed to find slot for pid %d for ioctl %x arg %lx\n",
		    current->pid, cmd, *arg);
		return -1;

	  case MY_NBD_GET_REQ:
	  case MY_NBD_ACK:
		found = find_slot (lo, current->pid);
		if (found < 0) {
			ENBD_ALERT
			 ("failed to find slot for pid %d for ioctl %x arg %lx\n",
			  current->pid, cmd, *arg);
		}
		// PTB this is find_slot's failure value if it failed
		return found;

	  case MY_NBD_REG_BUF:
	  case MY_NBD_SET_SIG:
		found = find_slot (lo, current->pid);
		if (found >= 0)
			return found;
		/*
		 * PTB Otherwise they passed a buffer
		 * and the slot number is in the first 4B
		 * We need some magic here for safety!
		 * set sig is the only call that really needs
		 * to send its pid!
		 */

		intval = -1;
		if (!arg || !*arg || get_user (intval, (int *) *arg)
		|| intval <= 0
		|| intval > ENBD_MAXCONN) {
			ENBD_ALERT
			("failed to find slot for pid %d ioctl %x arg %lx\n",
				current->pid, cmd, *arg);
			return -1;
		}

		// PTB CHANGE ARG !!!! so that it points beyond the slot number
		*arg += sizeof (int);
		// PTB the number in the buffer counts from 1
		return intval - 1;
	}

	// PTB not an ioctl that we know how to find the slot for
	return -1;
}

/*
 * PTB - generic ioctl handling
 */
static int
enbd_ioctl (struct inode *inode, struct file *file,
	   unsigned int cmd, unsigned long arg)
{
	struct enbd_device *lo
                  = NULL;	// PTB device pointer
	int minor = -1;		// PTB minor on which we got the ioctl
	int islot = -1;		// PTB slot number 0, 1, ...
	int nbd   = -1;		// PTB the count for the device group
	struct enbd_slot *slot
                  = NULL;	// PTB slot pointer
        int err;
        struct enbd_acct *acct = &lo->acct;

	if (!capable(CAP_SYS_ADMIN)) {
		ENBD_ERROR ("caller must be root.\n");
		return -EPERM;
	}
	if (!inode) {
		ENBD_ERROR ("given bad inode.\n");
		return -EINVAL;
	}
	if (major (inode->i_rdev) != major) {
		ENBD_ERROR ("pseudo-major %d != %d\n",
			   major (inode->i_rdev), major);
		return -ENODEV;
	}
	minor = minor (inode->i_rdev);
	nbd = minor >> ENBD_SHIFT;
	if (nbd >= MAX_NBD) {
		ENBD_ERROR ("tried to open too many devices, %d\n", minor);
		return -ENODEV;
	}
	lo = &enbd_dev[nbd];
	lo->harderror = 0;
	islot = minor % ENBD_MAXCONN - 1;

        /*
         * PTB fixup breakage >= 2.5.44 caused by not being allowed to talk to
         * minors. We deduce the slot number from hints in the call.
         * Or we match against the known pids.
         */
        if (islot < 0) {
                islot = fixup_slot(lo, islot, cmd, &arg);
        }
        if (islot >= 0)
                slot = & lo->slots[islot];
      

	// PTB these are all always local ioctls
	switch (cmd) {
		int err;
		int intval;
                int do_reenable;

	  case ENBD_CLEAR_SOCK:
		if (islot < 0) {
		        ENBD_ALERT ("CLEAR_SOCK called on full device nd%s arg %lx\n",
				   lo->devnam, arg);
		        return -EINVAL;
                }
		err = enbd_clr_sock (slot);
		return err;

	  case ENBD_SET_SOCK:
		if (islot < 0) {
			ENBD_ALERT ("SET_SOCK called on full device nd%s arg %lx\n",
			        lo->devnam, arg);
			return -EINVAL;
                }
		err = enbd_set_sock (slot, arg);
		return err;

	  case BLKBSZGET:
                // PTB The kernel should intercept this
	        ENBD_ALERT ("attempted get_blksize with BLKBSZGET\n");
                return -EINVAL;

          case ENBD_GET_BLKSIZE:
		if (!(atomic_read (&lo->flags) & ENBD_BLKSIZED)) {
			return -EINVAL;
		}
		err = put_user (lo->blksize, (long *) arg);
		return err;

	  case BLKBSZSET:
                // PTB The kernel should have intercepted this
	        ENBD_ALERT ("attempted set_blksize with BLKBSZSET\n");
                return -EINVAL;

          case ENBD_SET_BLKSIZE:
                if (!arg)
                        return -EINVAL;
		intval = -1;
                if (get_user (intval, (int *)arg))
                        return -EFAULT;
		if (intval == -1) {
			ENBD_ALERT ("BLKBSZSET got %d from user\n", intval);
		}
	        err = enbd_set_blksize (lo, intval);
	        return err;

	  case ENBD_SET_SIZE:
		err = enbd_set_size (lo, (__u64) arg);
		return err;

	  case ENBD_SET_SECTORS:
		err = enbd_set_size (lo, ((__u64) arg) << 9);
		return err;

	  case MY_NBD_SET_INTVL:	/* WG */
		err = enbd_set_intvl (lo, arg);
		return err;

	  case MY_NBD_SET_SPID:
		if (islot < 0) {
			ENBD_ALERT ("SET_SPID called on full device nd%s\n",
				        lo->devnam);
			return -EINVAL;
		}
		err = enbd_set_spid (slot, arg);
		return err;

	  case MY_NBD_SET_BUFFERWR:
		err = enbd_set_bufferwr (lo, arg);
		return err;

	  case MY_NBD_REG_BUF:	/* PTB register your buffer per socket here */
		if (!arg) {
			/* PTB serves as existence check for this ioctl */
			return 0;
		}
		if (islot < 0) {
			ENBD_ALERT ("REG_BUF called on full device nd%s\n",
				        lo->devnam);
		        return -EINVAL;
		}
		err = enbd_reg_buf (slot, (char *) arg);
		return err;

	  case MY_NBD_SET_SIG:
		if (islot < 0) {
		       ENBD_ALERT ("SET_SIG called on full device nd%s\n",
		       	   lo->devnam);
		       return -EINVAL;
		}
		err = enbd_set_sig (slot, (int *) arg);
		return err;

	  case MY_NBD_GET_REQ:
		if (islot < 0) {
			ENBD_ALERT ("GET_REQ called on full device nd%s\n",
		            lo->devnam);
			return -EINVAL;
		}
                if (arg < 4096) {
                    arg = (unsigned)slot->buffer;
                    if (!arg)
                        return -EINVAL;
                }
		err = enbd_get_req (slot);
		return err;

	  case MY_NBD_GET_NPORT:
		err = enbd_get_nport (lo, (int *) arg);
		return err;

	  case MY_NBD_CLR_REQ:
		if (islot < 0) {
			ENBD_ALERT ("CLR_REQ called on full device nd%s\n",
				   lo->devnam);
			return -EINVAL;
		}
		enbd_rollback_all (slot);
		return 0;

	  case MY_NBD_ERR_REQ:
		if (islot < 0) {
			ENBD_ALERT ("ERR_REQ called on full device nd%s\n",
				   lo->devnam);
			return -EINVAL;
		}
		enbd_error_all (slot);
		return 0;

	  case MY_NBD_SYNC:

                // PTB maybe run the reenable function
                do_reenable = 0;
                write_lock(&lo->meta_lock);
                if (lo->reenable_time != 0
                        && time_before(lo->reenable_time,jiffies)) {
                        lo->reenable_time = 0;
                        do_reenable = 1;
                }
                write_unlock(&lo->meta_lock);
                if (do_reenable)
                        enbd_reenable(lo);

		// PTB error too old reqs if show_errs set, else roll them back
		enbd_rollback_old (lo);

                // PTB opportunity to calculate speed
		enbd_set_speed (lo);

		return 0;

	  case MY_NBD_ACK:
		if (islot < 0) {
			ENBD_ALERT ("ENBD_ACK called on full device nd%s\n",
				   lo->devnam);
			return -EINVAL;
		}
		err = enbd_ack (slot);
		return err;

		/* let this be compiled in always - it's useful. PTB */
	  case ENBD_PRINT_DEBUG:
		ENBD_INFO("device %d: hd = %p, tl = %p, in = %d, out = %d\n",
		  minor,
                  list_head (&lo->queue, struct request, queuelist),
		  list_tail (&lo->queue, struct request, queuelist),
		  atomic_read (&acct->requests_in[READ]) +
		    atomic_read (&acct->requests_in[WRITE]),
		  atomic_read (&acct->requests_out[READ]) +
		    atomic_read (&acct->requests_out[WRITE])
                  );
		err = 0;
		return err;
	  case ENBD_HARD_RESET:	/* PTB - debugging */
		err = enbd_hard_reset (lo);
		return err;

	  case ENBD_RESET:	/* PTB - debugging */
		err = enbd_soft_reset (lo);
		// PTB we reenable in 5s
                enbd_reenable_delay(lo, 5);
		return err;

	  case ENBD_SET_MD5SUM:	/* PTB - change to do/plead md5summing */
		if (arg) {
			atomic_set_mask (ENBD_MD5SUM, &lo->flags);
		} else {
			atomic_clear_mask (ENBD_MD5SUM, &lo->flags);
		}
		err = 0;
		return err;

	  case MY_NBD_SET_SHOW_ERRS:	/* PTB/WG - change show error status */
		if (arg) {
			atomic_set_mask (ENBD_SHOW_ERRS, &lo->flags);
		} else {
			atomic_clear_mask (ENBD_SHOW_ERRS, &lo->flags);
		}
		return 0;

          case MY_NBD_SET_DIRECT:	/* PTB - change o_direct status */
	        if (arg) {
	                atomic_set_mask (ENBD_DIRECT, &lo->flags);
	        } else {
	                atomic_clear_mask (ENBD_DIRECT, &lo->flags);
	        }
	        return 0;

	  case MY_NBD_INVALIDATE:
		err = enbd_set_remote_invalid (lo, (int) arg);
		return err;

	  case ENBD_SET_PF_MEMALLOC:
		if (arg) {
			current->flags |= PF_MEMALLOC;
		} else {
			current->flags &= ~PF_MEMALLOC;
		}
		return 0;
        } // PTB endsw

	// PTB these are the standard ioctls, and we might get them from
	// the other side

	switch (cmd) {
		int err;
                int intval;

          case BLKROSET:		/* PTB - change ro status */
                if (get_user(intval, (int*)arg))
                        return -EFAULT;
                // PTB local flags
                enbd_set_read_only(lo, intval);
	        return 0;

          case BLKROGET:
                intval =  enbd_read_only(lo);
                return put_user(intval, (int*)arg);

	  case BLKFLSBUF:
		enbd_maybe_sync_sync (lo);	// PTB normally fsync_dev
		// PTB device likely has buffers or caches in kernel
		invalidate_buffers (inode->i_rdev);
#ifndef NO_BUFFERED_WRITES
		if (atomic_read (&lo->flags) & ENBD_BUFFERWR) {
			// PTB got this from rd.c
                        // PTB destroy buffers
			__invalidate_buffers (inode->i_rdev, 1);
		}
#endif		/* NO_BUFFERED_WRITES */
		return 0;

	  case HDIO_GETGEO:
		if (!arg) {
			return -EINVAL;
		} else {
			struct hd_geometry *geo =
			 (struct hd_geometry *) arg;
			int sectors = enbd_sizes[nbd << ENBD_SHIFT] << 1;
			unsigned short c;
			unsigned char h, s;
			if (sectors < (1 << 22)) {
				h = 4;
				s = 16;
				c = sectors >> 6;
			} else {
				h = 255;
				s = 63;
				c = (sectors / h) / s;
			}
			err = 0;
			if ((err = put_user (c, &geo->cylinders), err < 0)
			    || (err = put_user (h, &geo->heads), err < 0)
			    || (err = put_user (s, &geo->sectors), err < 0)
			    || (err = put_user (h, &geo->start), err < 0)) {
				return err;
			}
		} 
		return 0;

#ifndef BLKMDNTFY
#define BLKMDNTFY _IOW(0x12,133,sizeof(int))
#endif
              case BLKMDNTFY:
                ENBD_INFO ("received BLKMDNTFY, am now in raid %x\n",
                        (unsigned) arg);
                enbd_md.inc(&enbd_md);
                return 0;

#ifndef BLKMDUNTFY
#define BLKMDUNTFY _IOW(0x12,134,sizeof(int))
#endif
              case BLKMDUNTFY:
                ENBD_INFO ("received BLKMDUNTFY, now out of raid %x\n",
                        (unsigned) arg);
                enbd_md.dec(&enbd_md);
                return 0;

#ifndef BLKMDRGTR
#define BLKMDRGTR _IOW(0x12,135,sizeof(unsigned long))
#endif
              case BLKMDRGTR:
                enbd_md.reg(&enbd_md, (int(*)(kdev_t, int))arg);
                return 0;

	} // PTB endsw

        if (enbd_remote_ioctl.remote != NULL) {
                struct enbd_ioctl *remote_ioctl = enbd_remote_ioctl.remote;

	        if (remote_ioctl->convert_inplace (&cmd) < 0) {
		        ENBD_ALERT ("unauthorized ioctl %#x\n", cmd);
		        return -EINVAL;
	        }

                err = do_nbd_remote_ioctl(lo, minor, cmd, arg);
                return err;
        } 
        return -EINVAL;
}

/*
 * PTB - release the device. Called when a process closes the device
 * or dies while holding it open.
 *
 * If the minor addresses a slot (or the closing process is found to
 * own one via find_slot) the slot refcount is dropped, and if the
 * closer is the slot owner its socket is cleared and the slot pid is
 * erased. The device refcount is then dropped; when it reaches zero
 * (or the module is going away) the buffer size and outgoing sequence
 * numbers are reset, and buffers are invalidated if ENBD_SHOW_ERRS is
 * set.
 *
 * Returns 0, or -ENODEV for a null inode or out-of-range device.
 */
static int
enbd_release (struct inode *inode, struct file *file)
{
	struct enbd_device *lo;
	int dev;
	int nbd;
	int islot;

	if (!inode) {
		ENBD_ALERT ("null inode.\n");
		return -ENODEV;
	}
	dev = minor (inode->i_rdev);
	nbd = dev >> ENBD_SHIFT;

	if (nbd >= MAX_NBD) {
		// PTB impossible
		ENBD_ALERT ("too many open devices.\n");
		return -ENODEV;
	}

	lo = &enbd_dev[nbd];

	// PTB minor 0 of a device is the whole disk, so it maps to slot -1
	islot = dev % ENBD_MAXCONN - 1;

	// PTB is it a daemon closing the slot?
	if (islot >= 0 || (islot = find_slot(lo, current->pid), islot >= 0)) {
		struct enbd_slot *slot = &lo->slots[islot];
		--slot->refcnt;
		if (slot->pid == current->pid) {
			// PTB remember the owner now: the pid is erased below
			// and the alerts still need to report it
			int owner = slot->pid;

			enbd_clr_sock (slot);
			ENBD_ALERT ("(%d): erasing slot pid %d\n", islot, owner);
			slot->pid = 0;
			if (slot->refcnt > 0) {
				ENBD_ALERT
				 ("slot owner process %d released slot nd%s%d while not last\n",
				  owner, lo->devnam, islot + 1);
			}
		}
	}

	/* POSSIBLE change socket here PTB */

	atomic_dec (&lo->refcnt);

	// PTB invalidate buffers on last close if show_err set
	if (atomic_read (&lo->refcnt) <= 0 || !module_is_live(THIS_MODULE)) {
		struct enbd_seqno * seqno_out = &lo->seqno_out;
		//invalidate_buffers (lo->inode->i_rdev);
		if (atomic_read (&lo->flags) & ENBD_SHOW_ERRS) {
			invalidate_buffers (mk_kdev (major, nbd << ENBD_SHIFT));
		}
		// PTB in any case the daemons are dead!
		lo->bufsiz = 0;
		seqno_out->reset(seqno_out);
	}

	if (file
	&& (file->f_flags & O_DIRECT)
	// PTB we set this to show we made iobuf
	&& (file->f_flags & O_NOFOLLOW))  {
		file->f_flags &= ~(O_DIRECT|O_NOFOLLOW);
	}

	return 0;
}

static int
enbd_media_changed(struct gendisk *disk) {
        struct enbd_device *lo = disk->private_data;
        if (!lo || lo->magic != ENBD_DEV_MAGIC)
                return 0;
        ENBD_ALERT("enbd_media_changed called on nd%s\n", lo->devnam);
        return (atomic_read (&lo->flags) & ENBD_VALIDATED) == 0;
}

static int
enbd_revalidate(struct gendisk *disk) {
        struct enbd_device *lo = disk->private_data;
	unsigned long flags;
        int err = -EINVAL;

        if (!lo || lo->magic != ENBD_DEV_MAGIC){
                return -EINVAL;
        }
	        // PTB reenable part
        ENBD_ALERT("revalidate called on nd%s\n", lo->devnam);
        write_lock_irqsave (&lo->meta_lock, flags);
        if (! (atomic_read (&lo->flags) & ENBD_REMOTE_INVALID)
                &&    (atomic_read (&lo->flags) & ENBD_ENABLED)) {
                atomic_set_mask (ENBD_VALIDATED, &lo->flags);
                err = 0;
        }
	write_unlock_irqrestore (&lo->meta_lock, flags);

        return err;
}

/*
 * PTB - the block device method table handed to register_blkdev, to
 * each gendisk and to the devfs entries.
 */
static struct block_device_operations enbd_blkops = {
	.owner           = THIS_MODULE,
	.open            = enbd_open,
	.release         = enbd_release,
	.ioctl           = enbd_ioctl,
	.media_changed   = enbd_media_changed,
	.revalidate_disk = enbd_revalidate,
};

/*
 * PTB - probe callback registered with blk_register_region. The data
 * pointer is the enbd device; we hand back a reference to its gendisk,
 * or NULL if the device is not valid, has no disk, or the requested
 * part lies outside 0 .. ENBD_MAXCONN-1.
 */
static struct gendisk *
enbd_find (dev_t dev, int *part, void *data)
{
	struct enbd_device *lo = data;

	if (!lo || lo->magic != ENBD_DEV_MAGIC || !lo->disk)
		return NULL;

	if (part) {
		int partno = *part;

		ENBD_ALERT("enbd_find called with part = %#x\n", (unsigned)partno);
		if (partno < 0 || partno >= ENBD_MAXCONN)
			return NULL;
	}

	return get_disk (lo->disk);
}


/*
 * PTB - fill in and publish the gendisk for a device, allocating one
 * with npart minors first if the device does not have one yet.
 *
 * The disk is added with capacity 0 and only afterwards given its
 * real capacity (lo->bytesize in 512 byte sectors).
 *
 * Returns 0 on success, -ENOMEM if no gendisk could be obtained.
 */
static int
enbd_set_disk (struct enbd_device *lo, unsigned first_minor, unsigned npart)
{
	struct gendisk *gd = lo->disk;

	if (!gd) {
		gd = alloc_disk (npart);
		lo->disk = gd;
	}

	if (!gd) {
		ENBD_ERROR ("Insufficient memory for partition structs\n");
		return -ENOMEM;
	}

	gd->major        = major;
	gd->first_minor  = first_minor;
	gd->fops         = &enbd_blkops;
	gd->private_data = lo;
	gd->queue        = lo->q;
	sprintf (gd->disk_name, "nd%s", lo->devnam);

	// have to set minors (or capacity) to 1 (0) to avoid check disk
	set_capacity (gd, 0);
	add_disk (gd);
	blk_register_region(MKDEV(major, first_minor),
		npart, THIS_MODULE, enbd_find, NULL, lo);
	set_capacity (gd, lo->bytesize >> 9);

	// we should rescan later. From userland?
	return 0;
}

/*
 * Pavel - And here should be modules and kernel interface 
 *  (Just smiley confuses emacs :-)
 */


/*
 * PTB - (re)initialise device number i to its default state: magic,
 * name, default sizes, method pointers, queues, locks, the per-slot
 * structs, the per-minor size arrays, and finally the flags selected
 * by the module parameters. Does nothing if i is out of range.
 */
static void
enbd_reset(struct enbd_device *lo, int i) {

	int k;
	int base;

	if (i < 0 || i >= MAX_NBD)
		return;

	// PTB first minor of this device in the per-minor arrays
	base = i * ENBD_MAXCONN;

	lo->magic = ENBD_DEV_MAGIC;
	lo->nbd = i;
	strncpy (lo->devnam, device_letter (i), 4);

	/*
	 * PTB default geometry.
	 * NOTE(review): bytesize is not size << 10 here; confirm which
	 * of the two defaults is the intended one.
	 */
	lo->blksize = 1024;
	lo->logblksize = 10;
	lo->bytesize = 0x7fffffff00000;
	lo->size = 0x7fffffff;
	lo->sectors = 0xfffffffe;
	lo->max_sectors = buf_sectors;
	lo->req_timeo = ENBD_REQ_TIMEO;	/* PTB default pulse intvl */

	/* PTB method table */
	lo->enable = enbd_enable;
	lo->reset = enbd_reset;
	lo->disable = enbd_disable;
	lo->read_only = enbd_read_only;
	lo->set_speed = enbd_set_speed;
	lo->hard_reset = enbd_hard_reset;
	lo->soft_reset = enbd_soft_reset;
	lo->reenable_delay = enbd_reenable_delay;

	/* PTB queues, wait queues and locks */
	INIT_LIST_HEAD (&lo->queue);
	INIT_LIST_HEAD (&lo->altqueue);
	init_waitqueue_head (&lo->wq);
	init_waitqueue_head (&lo->req_wq);
	init_MUTEX(&lo->pid_sem);
	rwlock_init (&lo->queue_lock);
	rwlock_init (&lo->altqueue_lock);
	rwlock_init (&lo->meta_lock);

	/* PTB one pass over the slots and their matching minors */
	for (k = 0; k < ENBD_MAXCONN; k++) {
		struct enbd_slot *slot = &lo->slots[k];

		slot->lo = lo;
		slot->i = k;
		INIT_LIST_HEAD (&slot->queue);

		enbd_blksizes[base + k] = lo->blksize;
		enbd_bytesizes[base + k] = lo->bytesize;
		enbd_sizes[base + k] = lo->size;
		enbd_max_sectors[base + k] = lo->max_sectors;
	}

	enbd_init_seqno(&lo->seqno_out);
	enbd_init_speed(&lo->rspeed);
	enbd_init_speed(&lo->wspeed);
	enbd_init_speed(&lo->tspeed);

	// PTB queue has already been initialized, or will be
	lo->q = enbd_queue;

	/* PTB flags are only ever switched on here, never cleared */
	if (md5sum) {
		atomic_set_mask (ENBD_MD5SUM, &lo->flags);
	}
	if (sync_intvl) {
		atomic_set_mask (ENBD_SYNC, &lo->flags);
	}
	if (show_errs) {
		atomic_set_mask (ENBD_SHOW_ERRS, &lo->flags);
	}
	if (direct) {
		atomic_set_mask (ENBD_DIRECT, &lo->flags);
	}
	if (buffer_writes) {
		atomic_set_mask (ENBD_BUFFERWR, &lo->flags);
	}
	if (merge_requests) {
		atomic_set(&lo->merge_requests, merge_requests);
	}
}

/* PTB - module metadata, only emitted when built as a loadable module */
#ifdef MODULE
MODULE_AUTHOR ("Peter T. Breuer, Andres Marin");
MODULE_DESCRIPTION ("Enhanced Network Block Device " ENBD_VERSION);
MODULE_LICENSE ("GPL");
#endif		/* MODULE */

// PTB we steal these from the queue struct at init: enbd_init saves the
// queue's default merge methods here before installing the enbd_*
// wrappers, which call through these pointers once their own size
// checks pass
static merge_requests_fn *ll_merge_requests_fn;
static merge_request_fn *ll_front_merge_fn;
static merge_request_fn *ll_back_merge_fn;

/* PTB -
 * These functions are needed when the kernel does request merging in
 * order to stop it making requests that are bigger than our buffer.
 *
 * To turn OFF merging (once these functions are in place), set
 * merge_requests=0.
 */
/*
 * PTB - decide whether two queued requests may be merged into one.
 * Refuses (returns 0) when merging is switched off for the device,
 * when no default method was saved, or when the combined size would
 * exceed either the device's max_sectors or merge_requests + 1 blocks.
 * Otherwise the decision is left to the saved default method.
 */
static int
enbd_merge_requests_fn (request_queue_t * q, struct request *req,
		       struct request *req2)
{
	struct enbd_device *lo = req->rq_disk->private_data;

	if (!atomic_read(&lo->merge_requests) || !ll_merge_requests_fn)
		return 0;

	if (req->nr_sectors + req2->nr_sectors > lo->max_sectors
	    || req->nr_sectors + req2->nr_sectors >
	       ((atomic_read(&lo->merge_requests) + 1) << (lo->logblksize - 9)))
		return 0;

	return ll_merge_requests_fn (q, req, req2);
}
/*
 * PTB - decide whether a bio may be merged onto the front of a
 * request. Refuses (returns 0) when merging is switched off for the
 * device, when no default method was saved, or when the request
 * together with the new bio would exceed either the device's
 * max_sectors or merge_requests + 1 blocks. Otherwise the decision is
 * left to the saved default method.
 *
 * The size tests count the incoming bio as well as the existing
 * request, like enbd_merge_requests_fn does for its two requests;
 * testing the request alone lets the merged result overshoot the
 * limits by up to one bio.
 */
static int
enbd_front_merge_fn (request_queue_t * q, struct request *req, struct bio * bio)
{
	struct enbd_device *lo = req->rq_disk->private_data;
	unsigned long merged_sectors;

	if (!atomic_read(&lo->merge_requests))
		return 0;

	if (!ll_front_merge_fn)
		return 0;

	// PTB size the request would have after taking the bio
	merged_sectors = req->nr_sectors + (bio->bi_size >> 9);

	if (merged_sectors > lo->max_sectors)
		return 0;

	if (merged_sectors >
	    ((atomic_read(&lo->merge_requests) + 1) << (lo->logblksize - 9)))
		return 0;

	return ll_front_merge_fn (q, req, bio);
}
/*
 * PTB - decide whether a bio may be merged onto the back of a
 * request. Refuses (returns 0) when merging is switched off for the
 * device, when no default method was saved, or when the request
 * together with the new bio would exceed either the device's
 * max_sectors or merge_requests + 1 blocks. Otherwise the decision is
 * left to the saved default method.
 *
 * The size tests count the incoming bio as well as the existing
 * request, like enbd_merge_requests_fn does for its two requests;
 * testing the request alone lets the merged result overshoot the
 * limits by up to one bio.
 */
static int
enbd_back_merge_fn (request_queue_t * q, struct request *req,
		   struct bio * bio)
{
	struct enbd_device *lo = req->rq_disk->private_data;
	unsigned long merged_sectors;

	if (!atomic_read(&lo->merge_requests))
		return 0;

	if (!ll_back_merge_fn)
		return 0;

	// PTB size the request would have after taking the bio
	merged_sectors = req->nr_sectors + (bio->bi_size >> 9);

	if (merged_sectors > lo->max_sectors)
		return 0;

	if (merged_sectors >
	    ((atomic_read(&lo->merge_requests) + 1) << (lo->logblksize - 9)))
		return 0;

	return ll_back_merge_fn (q, req, bio);
}

// PTB - and now to play with the sysctl interface ...
// PTB handle for the registered sysctl tree; needed to unregister it
static struct ctl_table_header *enbd_table_header;
// the above is set by the register_sysctl_table call on the root table
// in enbd_init

/*
 * PTB - leaf sysctl entries, one per tunable int. All are mode 0644
 * and handled by proc_dointvec. The initialisers are positional;
 * presumably {ctl_name, procname, data, maxlen, mode, child,
 * proc_handler} - confirm against <linux/sysctl.h>.
 * Ids 6 and 7 are not used in this table.
 */
static ctl_table enbd_table[] = {
	{1, "rahead",
	 &rahead, sizeof (int), 0644, NULL, &proc_dointvec},
	{2, "plug",
	 &plug, sizeof (int), 0644, NULL, &proc_dointvec},
	{3, "sync_intvl",
	 &sync_intvl, sizeof (int), 0644, NULL, &proc_dointvec},
	{4, "merge_requests",
	 &merge_requests, sizeof (int), 0644, NULL, &proc_dointvec},
	{5, "md5sum",
	 &md5sum, sizeof (int), 0644, NULL, &proc_dointvec},
	{8, "md5_on_threshold",
	 &md5_on_threshold, sizeof (int), 0644, NULL, &proc_dointvec},
	{9, "md5_off_threshold",
	 &md5_off_threshold, sizeof (int), 0644, NULL, &proc_dointvec},
	{0}	/* PTB terminator */
};
// PTB the "enbd" directory (mode 0555), whose children are enbd_table
static ctl_table enbd_dir_table[] = {
	{6, "enbd", NULL, 0, 0555, enbd_table},
	{0}
};
// PTB the root handed to register_sysctl_table: a "dev" directory
// (mode 0555) holding the "enbd" directory above
static ctl_table enbd_root_table[] = {
	{CTL_DEV, "dev", NULL, 0, 0555, enbd_dir_table},
	{0}
};

#ifdef CONFIG_DEVFS_FS
// PTB handle of the top level devfs "nd" directory made in enbd_init
static devfs_handle_t devfs_handle;
// PTB handles of the per-device subdirectories beneath it, by device
static devfs_handle_t devfs_handles[MAX_NBD];
#endif


/*
 * PTB - module/driver initialisation.
 *
 * Allocates the shared request queue and one gendisk per device,
 * registers the block major, initialises the queue and installs our
 * size-limiting merge methods, resets every device to defaults,
 * creates the "nbdinfo" proc entry, publishes the gendisks, makes the
 * devfs nodes (if devfs is configured) and registers the sysctl tree.
 *
 * Returns 0 on success. On failure everything acquired so far is
 * released again and -ENOMEM (no queue), -EIO (major registration
 * failed) or -EINVAL (proc entry creation failed) is returned.
 */
int __init
enbd_init (void)
{
	int i;
	int err = 0;
	struct proc_dir_entry *res;

	ENBD_INFO ("Network Block Device originally by pavel@elf.mj.gts.cz\n");
	ENBD_INFO ("Network Block Device port to 2.0 by ptb@it.uc3m.es\n");
	ENBD_INFO ("Network Block Device move networking to user space by "
		  "amarin@it.uc3m.es\n");
	ENBD_INFO ("Enhanced Network Block Device " ENBD_VERSION " by "
		  "ptb@it.uc3m.es\n");

	enbd_queue = kmalloc(sizeof(*enbd_queue), GFP_KERNEL);
	if (!enbd_queue)
		return -ENOMEM;

	for (i = 0; i < MAX_NBD; i++) {
		struct enbd_device *lo = &enbd_dev[i];
		struct gendisk *disk = alloc_disk(ENBD_MAXCONN);
		// PTB wipe the device first, then attach the disk (if any);
		// a missing disk is retried later in enbd_set_disk
		memset (lo, 0, sizeof (*lo));
		if (disk)
			lo->disk = disk;
	}

	if (register_blkdev (major, "nbd", &enbd_blkops)) {
		ENBD_ERROR ("Unable to register major number %d for NBD\n",
			   major);
		err = -EIO;
		goto fail_disks;
	}
#ifdef MODULE
	ENBD_INFO ("registered device at major %d\n", major);
#endif


// PTB - set up kernel queue struct with default methods
	blk_init_queue (enbd_queue, do_nbd_request, &enbd_lock);

	blk_queue_max_sectors(enbd_queue, buf_sectors);	/* max per request */

/*
 * PTB - I think that put:
 *     - q->plug_device_fn    = generic_plug_device    (static ll_rw_blk)
 *     - q->plug_tq.routine   = generic_unplug_device  (static ll_rw_blk)
 *     - q->back_merge_fn     = ll_back_merge_fn       (static ll_rw_blk)
 *     - q->front_merge_fn    = ll_front_merge_fn      (static ll_rw_blk)
 *     - q->merge_requests_fn = ll_merge_requests_fn   (static ll_rw_blk)
 *     - q->request_fn        = do_nbd_request         (param)
 */

/*
 * PTB - we have to do some more init magic in 2.4.*. This says that we
 *     - take all stuff off the kernel queue before processing it, so in
 *     - particular it is OK for kernel to do merges with the queue head.
 *       blk_queue_headactive (enbd_queue, 0);
 */

/*
 * LA - moved the next #if higher;
 *    - kernel 2.2.* doesn't know about plug_device_fn
 */

	// PTB control merge attempts so we do not overflow our buffer
	ll_merge_requests_fn = enbd_queue->merge_requests_fn;
	ll_front_merge_fn    = enbd_queue->front_merge_fn;
	ll_back_merge_fn     = enbd_queue->back_merge_fn;

// JSA - Add this line because under >=2.4.1, merge optimizations are in flux
/*
 * PTB - however it's not this which does damage, I believe. Data: plugging
 *     - simply has to be enabled in these kernels. Without it, requests just
 *     - sit on the kernel queue and never come off and into our request_fn.
 * PTB - commented the ifdef again after talks with Jens Axboe.
 *     - Apparently plug_fn will disappear in 2.4.4 and merge functions are
 *       the only way to control merges, so they MUST be included.
 */

/*
 * PTB - The functions below just impose our own stricter size limit before
 *     - calling the defaults if all seems OK sizewise.
 */
	enbd_queue->merge_requests_fn = &enbd_merge_requests_fn;
	enbd_queue->front_merge_fn    = &enbd_front_merge_fn;
	enbd_queue->back_merge_fn     = &enbd_back_merge_fn;

	enbd_init_md(&enbd_md);
	enbd_init_ioctl_stub(&enbd_remote_ioctl);

	for (i = 0; i < MAX_NBD; i++) {
		struct enbd_device *lo = &enbd_dev[i];
		enbd_reset(lo, i);
	}

	/*
	 * PTB we do the disk and partition stuff after we have
	 * contact, when enbd_open is called for the first time?
	 */

	res = create_proc_read_entry ("nbdinfo", 0, NULL, NULL, NULL);
	if (!res) {
		ENBD_ALERT ("creation of proc entry failed\n");
		err = -EINVAL;
		goto fail_queue;
	}
	// PTB additional write_proc entry in struct
	enbd_init_proc(res);

	// PTB make the gendisk structs very late.
	for (i = 0; i < MAX_NBD; i++) {
		struct enbd_device *lo = &enbd_dev[i];
		enbd_set_disk(lo, i * ENBD_MAXCONN, ENBD_MAXCONN);
	}

#ifdef CONFIG_DEVFS_FS

	devfs_handle = devfs_mk_dir (NULL, "nd", NULL);
	if (devfs_handle) {
		for (i = 0; i < MAX_NBD; i++) {
			struct enbd_device *lo = &enbd_dev[i];
			int j;
			// PTB make the subdirectory "a","b" etc.
			devfs_handles[i] =
			 devfs_mk_dir (devfs_handle, lo->devnam, NULL);
			// PTB add the blk specials, "0","1" to ENBD_MAXCONN-1
			if (!devfs_handles[i])
				continue;
			// PTB one node per minor of THIS device: the bound is
			// ENBD_MAXCONN (as in enbd_cleanup), not MAX_NBD, or
			// we would hand out minors belonging to other devices
			for (j = 0; j < ENBD_MAXCONN; j++) {
				char name[4];
				sprintf (name, "%u", j);
				devfs_register(devfs_handles[i], name,
					DEVFS_FL_DEFAULT,
					major, i * ENBD_MAXCONN + j,
					S_IFBLK | S_IRUSR | S_IWUSR,
					&enbd_blkops, NULL);
			}
			// PTB do the whole disk symlink ..
			devfs_mk_symlink (devfs_handles[i], "disk",
					  DEVFS_FL_DEFAULT, "0",
					  NULL, NULL);
			// PTB .. and the channel symlinks
			for (j = 1; j < ENBD_MAXCONN; j++) {
				char link[4];
				char name[8];
				sprintf (link, "%u", j);
				sprintf (name, "chan%u", j);
				devfs_mk_symlink (devfs_handles[i],
						  name,
						  DEVFS_FL_DEFAULT,
						  link, NULL, NULL);
			}
		}
	}
#endif		/* CONFIG_DEVFS_FS */

	// PTB - sysctl interface
	enbd_table_header = register_sysctl_table (enbd_root_table, 1);

	// PTB we have to wait for the open to complete init with inode val

	return err;

	/*
	 * PTB failure exits. Nothing has been add_disk'ed at these points,
	 * so dropping our reference to each gendisk is all that is needed.
	 */
fail_queue:
	blk_cleanup_queue (enbd_queue);
	unregister_blkdev (major, "nbd");
fail_disks:
	for (i = 0; i < MAX_NBD; i++) {
		struct enbd_device *lo = &enbd_dev[i];
		if (lo->disk) {
			put_disk (lo->disk);
			lo->disk = NULL;
		}
	}
	kfree (enbd_queue);
	enbd_queue = NULL;
	return err;
}

/*
 * PTB - module exit. Undoes enbd_init: invalidates and destroys the
 * buffers of every initialised device, unregisters the sysctl tree,
 * removes the devfs nodes and the proc entry, tears down each gendisk
 * and block map, cleans up and frees the shared queue and gives back
 * the block major.
 */
void __exit
enbd_cleanup (void)
{
	int i;

	for (i = 0; i < MAX_NBD; i++) {

		struct enbd_device *lo = &enbd_dev[i];
		int j;

		if (!(atomic_read (&lo->flags) & ENBD_INITIALISED))
			continue;

		ENBD_INFO ("invalidating buffers on device nd%s%d-%d\n",
			  lo->devnam, 0, ENBD_MAXCONN);

		for (j = 0; j < ENBD_MAXCONN; j++) {
			int minor = i * ENBD_MAXCONN + j;
			invalidate_buffers (mk_kdev (major, minor));
		}

		ENBD_INFO ("destroying buffers on device nd%s%d-%d\n",
			  lo->devnam, 0, ENBD_MAXCONN);

		for (j = 0; j < ENBD_MAXCONN; j++) {
			int minor = i * ENBD_MAXCONN + j;
			__invalidate_buffers (mk_kdev (major, minor), 1);
		}
	}

	// PTB the registration result is never checked at init, so the
	// header may be NULL here
	if (enbd_table_header) {
		unregister_sysctl_table (enbd_table_header);
		enbd_table_header = NULL;
	}

#ifdef CONFIG_DEVFS_FS
	if (devfs_handle) {
		for (i = 0; i < MAX_NBD; i++) {
			int j;
			if (!devfs_handles[i])
				continue;
			for (j = 0; j < ENBD_MAXCONN; j++) {
				// PTB the node itself, then the symlink to it
				devfs_remove("nd/%s/%u", device_letter(i), j);
				if (j == 0) {
					devfs_remove("nd/%s/disk", device_letter(i));
				} else {
					devfs_remove("nd/%s/chan%u",device_letter(i),j);
				}
			}
			devfs_remove("nd/%s", device_letter(i));
		}
		devfs_remove("nd");
	}
#endif

	remove_proc_entry ("nbdinfo", &proc_root);

	/*
	 * NOTE(review): enbd_set_disk calls blk_register_region for each
	 * device but no matching unregister is visible here - confirm
	 * whether one is needed. Also confirm that enbd_sync_sync is safe
	 * to call after the disk and block map have gone.
	 */
	for (i = 0; i < MAX_NBD; i++) {
		struct enbd_device *lo = &enbd_dev[i];
		atomic_clear_mask (ENBD_ENABLED, &lo->flags);
		if (lo->disk) {
			del_gendisk(lo->disk);
			put_disk(lo->disk);
		}
		if (lo->blockmap) {
			kfree (lo->blockmap);
			lo->blockmap = NULL;
		}
		enbd_sync_sync (lo);
	}

	blk_cleanup_queue (enbd_queue);

	if (unregister_blkdev (major, "nbd") != 0) {
		ENBD_ALERT ("cleanup_module failed\n");
	} else {
		ENBD_INFO ("module cleaned up.\n");
	}
	kfree(enbd_queue);

}

/* PTB - hook the init and cleanup routines into module load/unload */
module_init (enbd_init);
module_exit (enbd_cleanup);

/* PTB - exported for use outside this file */
EXPORT_SYMBOL(enbd_remote_ioctl);

/* Compile line:

 *  gcc -O2 -D__KERNEL__ -DMODULE -DEXPORT_SYMTAB -xc -c enbd.c -o enbd.o
 *
 *  (possibly with -DMODVERSIONS also). PTB
 *  (possibly with -I/usr/src/linux-x.y.z/include also). PTB
 */
