/* 
 * Copyright (C) 1999-2001 Peter T. Breuer <ptb@it.uc3m.es>
 */


#include <stdlib.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <syslog.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/param.h>
#include <fcntl.h>
#include <errno.h>
#include <netinet/in.h>
#include <asm/errno.h>
#include <linux/kdev_t.h>

#ifndef BLKGETSIZE
/* lifted from fs.h */
#define BLKGETSIZE _IO(0x12,96) 
#endif
#ifndef BLKROGET
/* lifted from fs.h */
#define BLKROGET   _IO(0x12,94) 
#endif
#ifndef BLKGETSIZE64
/* lifted from fs.h */
#define BLKGETSIZE64 _IOR(0x12,114,u64) 
#endif

# include "cliserv.h"
#ifdef HAVE_LIBPUB
#include <publib.h>
#endif

# include "file.h"
# include "select.h"

#ifndef MY_NAME
#define MY_NAME "file"
#endif

extern int debug_level;


   /*
    * Move descriptor `fildes` to `offset` relative to `whence`, using the
    * widest seek primitive chosen at build time: lseek64() when
    * HAVE_LSEEK64 is defined, llseek() when HAVE_LLSEEK is set, and plain
    * lseek() otherwise.
    *
    * In the plain lseek() build an offset for which (offset >> 31) is
    * non-zero is refused with -EINVAL before any system call is made.
    * In every other case the result of the chosen primitive is returned.
    */
   static inline s64 seek(int fildes, s64 offset, int whence)
   {
       DEBUG("seek to sector %Ld type %s\n", offset >> 9,
          (whence == SEEK_CUR) ? "CUR"
          : (whence == SEEK_SET) ? "SET"
          : "SEEK_END");

#if defined(HAVE_LSEEK64)
       return lseek64(fildes, (loff_t) offset, whence);
#elif HAVE_LLSEEK
       return llseek(fildes, (loff_t) offset, whence);
#else
       if ((offset >> 31) == 0)
         return (s64) lseek(fildes, (off_t) offset, whence);

       // does not fit the 31 bit range this build can address
       PERR("Cannot seek to more than 2^31-1 (0x%Lx) in one file!\n", offset);
       return -EINVAL;
#endif
   }


   /*
    * Result of mapping an offset within the whole device onto one of its
    * component files. Filled in by resolve_offset().
    */
   struct nbd_seek {
       int i;   // index of the chosen component
       int fd;  // descriptor of that component (copy of exports[i])
       u64 a; // local offset
   };

   /*
    * Translate an offset within the whole device into a component index,
    * that component's descriptor and the offset inside the component.
    *
    *   self - file object; size, mode, nfile, sizes[], blksize and
    *          exports[] are read
    *   a    - offset within the composite device
    *   s    - receives the result; may be NULL to only validate `a`
    *
    * LINEAR mode walks the components in order, subtracting each size
    * until the offset falls inside one.  MIRROR mode always selects
    * component 0 and leaves the offset unchanged.  STRIPE mode deals
    * blocks of blksize bytes round-robin over the components.
    *
    * Returns 0 on success, in which case *s (when non-NULL) is always
    * completely filled in.  Returns -EINVAL when `a` is out of range,
    * the mode is not one of the three above, or the geometry (nfile,
    * blksize) cannot be used.
    */
   static  int
   resolve_offset(struct nbd_file *self, u64 a, struct nbd_seek *s) {

      int i;
      u64 b;

      if (a >= self->size)
        return -EINVAL;

      switch (self->mode) {

           // PTB choose the fd to write to

           case NBD_LINEAR_MODE:
           for (i = 0; i < self->nfile; i++) {
             u64 size = self->sizes[i];
             if (a < size)
               break;
             a -= size;
           }
           if (i >= self->nfile) {
             return -EINVAL;
           }
           break;

           case NBD_MIRROR_MODE:
           if (self->nfile <= 0)
             return -EINVAL;  // no component 0 to point at
           i = 0;   // PTB silly, but what else can I say!
           break;

           case NBD_STRIPE_MODE:
           if (self->blksize <= 0 || self->nfile <= 0)
             return -EINVAL;  // would divide by zero below
           b = a % self->blksize;   // offset within block
           a/= self->blksize;       // block number
           i = a % self->nfile;     // file id holding that block number
           a/= self->nfile;         // block number within file
           a*= self->blksize;
           a+= b;             // plus offset within block
           break;

           default:
           // unknown layout: reporting success here would leave *s
           // untouched and the caller reading garbage
           return -EINVAL;
      }

      // every path reaching this point has 0 <= i < self->nfile
      if (s) {
        s->i = i;
        s->fd = self->exports[i];
        s->a = a;
      }

      return 0;
   }


   /*
    * Position the component file that holds device offset `a` and return
    * its descriptor.
    *
    * Side effects on success: self->lasti, self->lastpt[self->lasti] and
    * self->lastpoint are updated to describe the new position.
    *
    * Three paths are tried in turn:
    *   fast   - `a` equals self->lastpoint and self->changedfd is clear:
    *            the descriptor of component lasti is returned untouched.
    *   medium - the component chosen for `a` is already recorded at the
    *            wanted (non-zero) local offset: only the bookkeeping is
    *            refreshed.
    *   slow   - really seek.  In MIRROR mode every component is moved to
    *            `a`; in LINEAR and STRIPE mode only the chosen one is.
    *
    * Returns the descriptor on success, -EINVAL when `a` is out of range
    * or cannot be resolved to a component, and -EBADF when a seek fails.
    */
   static int maybeseek(struct nbd_file * self, u64 a)
   {
      int i = self->lasti;
      int j;
      struct nbd_seek s;

      if (a >= self->size) {
        return -EINVAL;
      }

      DEBUG("@%Lu",a);

      if (self->lastpoint == a && !self->changedfd) {    // PTB fast path
        DEBUG(":%d/%Lu/%Lu",
          self->lasti,
          self->lastpt[self->lasti]/self->blksize,
          self->lastpt[self->lasti]%self->blksize);
        return self->exports[i];
      }

      // Mark the result as unset, so that a resolver which reports
      // success without filling it in is detected instead of being read
      // as uninitialised memory.
      s.i = -1;
      s.fd = -1;
      s.a = 0;

      if (resolve_offset(self, a, &s) < 0
       || s.i < 0 || s.i >= self->nfile) {
             PERR("Cannot happen to me! Overstepped FD array\n");
             return -EINVAL;
      }

      i = s.i;

      if (self->lastpt[i] > 0 && self->lastpt[i] == s.a) {
                           // PTB medium fast path
          self->lasti = i;
          DEBUG(":%d/%Lu/%Lu", i, s.a/self->blksize, s.a%self->blksize);
          self->lastpoint = a;    // PTB setup fast path for next time
          return self->exports[i];
      }
                            // PTB slow path

      switch (self->mode) {

        case NBD_STRIPE_MODE:
        case NBD_LINEAR_MODE:
          if (seek(self->exports[i], s.a, SEEK_SET) < 0) {
             DEBUG("Cannot seek locally to offset %Ld!\n", s.a);
             // a failed seek is reported as BADF in every mode
             return -EBADF;
          }
        break;

        case NBD_MIRROR_MODE:
          for (j = 0; j < self->nfile; j++) {
            if (seek(self->exports[j], a, SEEK_SET) < 0) {
               DEBUG("Cannot seek locally to offset %Ld!\n", s.a);
               return -EBADF;
            }
          }
        break;

        default:
        break;

      }
      self->lasti = i;
      self->lastpt[i] = s.a;

      DEBUG(":%d/%Lu/%Lu",i,s.a/self->blksize,s.a%self->blksize);
      self->lastpoint = a;    // PTB setup fast path for next time
      return self->exports[i];
   }

  /*
   * Close every component of the device that is still open.
   *
   * On entry F_OPENWANTED is dropped and F_CLOSEWANTED raised.  If
   * F_CLOSED is already set nothing else happens apart from clearing
   * F_CLOSEWANTED again, and 0 is returned.
   *
   * Otherwise each component's cached position (lastpt[]) and the device
   * position (lastpoint) are reset to 0.  A component that closes
   * cleanly gets its exports[] slot set to -1; slots that already were
   * negative count as successes.  With at least one success F_OPEN is
   * cleared.  With no failure F_CLOSEWANTED is cleared and F_CLOSED set.
   *
   * Returns 0 when fully closed, otherwise minus the number of
   * components whose close() failed.
   */
   static int
   closefile(struct nbd_file * self) {
         int idx;
         int failed = 0;
         int done = 0;

         DEBUG("entered\n");
         // PTB a close request replaces any pending open request
         self->flags = (self->flags & ~F_OPENWANTED) | F_CLOSEWANTED;

         if (self->flags & F_CLOSED) {
             // nothing left to close
             self->flags &= ~F_CLOSEWANTED;
             DEBUG("exited 0\n");
             return 0;
         }

         for (idx = 0; idx < self->nfile; idx++) {
             int fd = self->exports[idx];
             char * name = self->names[idx];

             self->lastpt[idx] = 0;

             if (fd < 0) {
                 DEBUG("skipped already closed %s\n", name);
                 done++;
             } else if (close(fd) < 0) {
                 PERR("Cannot close file %s\n", name);
                 failed++;
             } else {
                 self->exports[idx] = -1;
                 done++;
                 DEBUG("closed %s\n", name);
             }
         }
         self->lastpoint = 0;

         if (done > 0)
           self->flags &= ~F_OPEN;        // PTB not fully open

         if (failed > 0) {
           DEBUG("exited %d and partially closed\n", -failed);
           return -failed;
         }

         self->flags &= ~F_CLOSEWANTED;
         self->flags |= F_CLOSED;         // PTB fully closed
         DEBUG("exited 0 and fully closed\n");
         return 0;
   }

   static u64
   getsize1(struct nbd_file * self, int fd) {
         u64 size = 0;

         if (size < self->blksize) {
               s64 es = 0;
               loff_t off;
               DEBUG("looking for export size with seek SEEK_END\n");
               off = seek(fd, 0, SEEK_CUR); 
               if (off >= 0) {
                 if ((es = seek(fd, 0, SEEK_END)) != -1 && es >= self->blksize){
                    size = es;
                 }
                 seek(fd,off,SEEK_SET);
               }
         }
         if (size < self->blksize) {
               off_t es = 0;
               struct stat stat_buf;
               int error;
               memset(&stat_buf, 0, sizeof(stat_buf));
               DEBUG("looking for export size with fstat\n");
               if ((error = fstat(fd, &stat_buf)) != -1
                && (es = stat_buf.st_size) >= self->blksize) { 
                  size = es;
               }
         }
         if (size < self->blksize) {
               s64 es = 0;
               DEBUG("looking for export size with ioctl BLKGETSIZE64\n");
               if( ! ioctl(fd, BLKGETSIZE64, &es) && es >= self->blksize) {
                  size = es;
               }
         }
         if (size < self->blksize) {
               off_t es = 0;
               DEBUG("looking for export size with ioctl BLKGETSIZE\n");
               if( ! ioctl(fd, BLKGETSIZE, &es) && es >= self->blksize) {
                  size = es;
                  size <<= 9; /* assume blocksize 512 */
               }
         }
         size = ( size / self->blksize ) * self->blksize;
         return size;
  }

  /*
   * Determine the size of every component and, from those, the size of
   * the whole device.
   *
   * A component whose size is already recorded in self->sizes[] keeps
   * that value; otherwise the size is measured with getsize1() and
   * stored.  A component that is not open, or whose size cannot be
   * measured, makes the overall result unknown.
   *
   * The device size depends on the mode:
   *   LINEAR - sum of the component sizes
   *   MIRROR - size of the smallest component
   *   STRIPE - smallest component size times the number of components
   *
   * Returns the device size and stores it in self->size when every
   * component size is known.  Returns -1 (leaving self->size alone)
   * when some size is unknown.  In builds without a 64 bit seek,
   * returns -EINVAL if a component does not fit in 31 bits.
   */
  static s64
  getsize(struct nbd_file * self) {
     int i = 0;
     u64 totalsize = 0;
     short unknown_size = 0;

     for (i = 0; i < self->nfile; i++) {

         u64 size = 0;
         int fd = self->exports[i];

         if (fd < 0) {
                unknown_size++;
                continue;
         }

         if (self->sizes[i] > 0) {
                // measured on an earlier pass: keep it rather than
                // overwriting the stored value with zero
                size = self->sizes[i];
         } else {
                size = getsize1(self, fd);
                if (size <= 0) {
                        unknown_size++;
                        continue;
                }
         }

#ifndef HAVE_LSEEK64
#ifndef HAVE_LLSEEK
        if ((size >> 31) != 0) {
            PERR("%s is too large to export!\n", self->names[i]);
            return -EINVAL;
        }
#endif
#endif

         self->sizes[i] = size;
         switch (self->mode) {
           case NBD_LINEAR_MODE: totalsize += size;
                   break;
           case NBD_MIRROR_MODE:
                   // a mirror is only as large as its smallest member.
                   // (A conditional with the middle operand omitted
                   // would yield the truth value 1, not a size.)
                   if (i <= 0 || totalsize <= 0 || size < totalsize)
                           totalsize = size;
                   break;
           case NBD_STRIPE_MODE: totalsize = (i<=0||totalsize<=0||size<totalsize/i)
                               ? size*(i+1) : (totalsize/i)*(i+1);
                   break;
           default:
                   break;
         }
     }

     if (!unknown_size) {
        self->size = totalsize;
        return totalsize;
     }

     return -1;
  }

#ifndef HAVE_LIBPUB
  /*
   * Replace, in place, every occurrence of `pat` in `str` by `sub`.
   *
   *   str - NUL-terminated string held in a buffer of `max` bytes
   *   pat - text to look for; an empty pattern matches nothing
   *   sub - replacement text
   *   max - size of the buffer holding str, terminator included
   *
   * Text inserted by a substitution is not scanned again.  The result
   * is always NUL-terminated.
   *
   * Returns the number of substitutions made, or -1 as soon as the next
   * substitution would not fit in `max` bytes; substitutions made before
   * that point stay in place.
   */
  static int
  strgsub (char *str, char *pat, char* sub, size_t max)
  {
    int instances = 0;
    size_t patlen = strlen(pat);
    size_t sublen = strlen(sub);
    size_t len = strlen(str);
    char *s;

    if (patlen == 0)
        // strstr() finds an empty pattern at every position, which
        // would insert sub until the buffer overflowed
        return 0;

    s = strstr(str, pat);

    while (s) {
        // found an instance of pat
        size_t newlen = len - patlen + sublen;
        size_t tail = len - (size_t) (s - str) - patlen;

        if (newlen >= max)
            // ran out of room
            return -1;
        // shift the rest of the string, terminator included, so the
        // result stays NUL-terminated whether it grows or shrinks
        memmove(s + sublen, s + patlen, tail + 1);
        // fill in the gap with sub
        memmove(s, sub, sublen);
        len = newlen;
        instances ++;
        s = strstr(s + sublen, pat);
    }
    // ran out of pattern repetitions
    return instances;
  }

#endif

   /*
    * Count the non-overlapping occurrences of `pat` in `str`, scanning
    * from left to right.  An empty pattern is reported as one occurrence.
    */
   static int nsubstr(char *str, char *pat) {
       size_t step = strlen(pat);
       int count = 0;
       char *hit;

       if (step == 0)
           return 1;

       // resume just past each hit so that matches never overlap
       for (hit = strstr(str, pat); hit != NULL; hit = strstr(hit + step, pat))
           count++;

       return count;
   }

  /*
   * Ask the block device `exportname` whether it is read-only.
   *
   * The device is opened read-only and non-blocking purely to issue the
   * BLKROGET ioctl, then closed again.  If the query fails or leaves the
   * answer untouched, the device is assumed to be writable.
   *
   * Sets F_READONLY in *flags when the device reports itself read-only.
   * Returns 0, or -EINVAL when the device cannot even be opened.
   */
  static int
  rootstatblkdev(char *exportname, unsigned long *flags) {

      int mode = O_NONBLOCK | O_RDONLY;   // just for IOCTL
      int fd;
      int readonly = -1;                  // -1 = no answer yet

#ifdef O_LARGEFILE
      mode |= O_LARGEFILE;
#endif
      fd = open (exportname, mode);

      if (fd < 0) {
          // definitely not even openable RO async
          PERR ("Could not open exported file %s: %m\n", exportname);
          return -EINVAL;
      }

      if (ioctl (fd, BLKROGET, &readonly) < 0 || readonly == -1) {
          PERR ("Warning, could not query %s with BLKROGET: %m\n",
                exportname);
          PERR ("Warning, assuming %s is readwrite:\n", exportname);
          readonly = 0;   // PTB assume rw
      }

      close (fd);

      if (readonly) {
          MSG ("set device ro because component %s is ro\n", exportname);
          *flags |= F_READONLY;
      }
      return 0;
  }

  /*
   * Decide whether the (non block device) resource `exportname` is
   * writable by simply trying to open it read-write.
   *
   * Sets F_READONLY in *flags when the open fails.  Always returns 0.
   */
  static int
  rootstatfile(char *exportname, unsigned long *flags) {

      int fd;

#ifdef O_LARGEFILE
      fd = open (exportname, O_RDWR | O_LARGEFILE);
#else
      fd = open (exportname, O_RDWR);
#endif

      if (fd >= 0) {
          // PTB was root and no negative decision made. Move on.
          close (fd);
          return 0;
      }

      PERR ("Could not open exported file %s rw: %m\n",
            exportname);
      MSG ("set device ro because component %s is ro\n",
           exportname);
      *flags |= F_READONLY;
      return 0;
  }

  /*
   * Inspect the permission bits in `buf` to see whether the current
   * effective user could write to `exportname`, and log the reason when
   * it looks writable.
   *
   * The checks are, in order: writable by others; group-writable and
   * owned by our effective group; owner-writable and owned by our
   * effective user; group-writable and owned by one of our
   * supplementary groups.
   *
   * Note that this routine only logs: it never modifies *flags and it
   * returns 0 whatever the outcome.
   */
  static int
  userstatfile(char *exportname, unsigned long *flags, struct stat *buf) {

    mode_t perms = buf->st_mode;

    if (perms & S_IWOTH) {
        DEBUG ("component %s is writable by everyone\n", exportname);
        return 0;
    }

    if ((perms & S_IWGRP) && buf->st_gid == getegid ()) {
        // writable by our group
        DEBUG ("component %s is writable by our group %d\n", exportname,
               getegid ());
        return 0;
    }

    if ((perms & S_IWUSR) && buf->st_uid == geteuid ()) {
        // writable by owner and owned by us
        DEBUG ("component %s is writable by owner me %d\n", exportname,
               geteuid ());
        return 0;
    }

    if (perms & S_IWGRP) {
        // writable by group and it's one of our groups
        gid_t mine[NGROUPS];
        int count = getgroups (NGROUPS, mine);
        int k = 0;

        while (k < count && buf->st_gid != mine[k])
            k++;

        if (k < count) {
            // the message is emitted twice: once for the match itself
            // and once for the decision taken on it
            DEBUG ("component %s is writable by my group %d\n",
                   exportname, mine[k]);
            DEBUG ("component %s is writable by my group %d\n",
                   exportname, mine[k]);
            return 0;
        }
    }

    return 0;

  }

  /*
   * Ask the block device `exportname` whether it is read-only, on behalf
   * of an unprivileged user.
   *
   * The device is opened read-only and non-blocking just to issue the
   * BLKROGET ioctl.  A failed or unanswered query is taken to mean
   * read-write.
   *
   * Sets F_READONLY in *flags when the device reports itself read-only.
   * Returns 0, or -EINVAL when the device cannot be opened.
   */
  static int
  userstatblkdev(char *exportname, unsigned long *flags) {

      int mode = O_NONBLOCK | O_RDONLY;
      int fd;
      int readonly = -1;   // -1 = no answer yet

#ifdef O_LARGEFILE
      mode |= O_LARGEFILE;
#endif
      fd = open (exportname, mode);   // for an ioctl

      if (fd < 0) {
          PERR ("Could not open exported file %s: %m\n", exportname);
          return -EINVAL;
      }

      if (ioctl (fd, BLKROGET, &readonly) < 0 || readonly == -1) {
          DEBUG ("Could not query exported file %s: %m\n", exportname);
          readonly = 0;   // assume r/w
      }

      close (fd);

      if (readonly != 0)
          *flags |= F_READONLY;

      return 0;
  }

  /*
   * Final read-only test: really try to open `exportname`.
   *
   * The resource is first opened read-only; failure there is an error.
   * It is then opened read-write; failure there marks the device
   * read-only.  Both opens are non-blocking and the descriptor is closed
   * again straight away.
   *
   * Sets F_READONLY in *flags when the read-write open fails.
   * Returns 0, or -EINVAL when even the read-only open fails.
   */
  static int
  rostat1(char *exportname, unsigned long *flags) {

      int common = O_NONBLOCK;   // shared by both attempts
      int fd;

#ifdef O_LARGEFILE
      common |= O_LARGEFILE;
#endif

      fd = open (exportname, common | O_RDONLY);
      if (fd < 0) {
          DEBUG ("Could not open exported file %s rw: %m\n", exportname);
          return -EINVAL;
      }
      close (fd);

      // PTB we know it opens RO, does it open RW?
      fd = open (exportname, common | O_RDWR);
      if (fd >= 0) {
          close (fd);
      } else {
          MSG ("set device ro because component %s is ro\n", exportname);
          *flags |= F_READONLY;
      }

      return 0;
  }


  /*
   * Read-only check for an unprivileged user: hand `exportname` to the
   * block device check or to the plain file check, depending on the file
   * type recorded in `buf`, and pass on that check's result.
   */
  static int
  userstat1(char *exportname, unsigned long *flags, struct stat *buf) {

    // PTB ordinary people have to look at perms
    if (S_ISBLK (buf->st_mode))
        return userstatblkdev(exportname, flags);

    return userstatfile(exportname, flags, buf);
  }

  /*
   * Read-only check used when running as root: hand `exportname` to the
   * block device check or to the plain file check, depending on the file
   * type recorded in `buf`, and pass on that check's result.
   */
  static int
  rootstat1(char *exportname, unsigned long *flags, struct stat *buf) {

    if (S_ISBLK (buf->st_mode))
        return rootstatblkdev(exportname, flags);

    return rootstatfile(exportname, flags);
  }
 
  /*
   * Examine one resource and adjust *flags to suit it.
   *
   *  - A block device with major number 2 gets F_SYNC, unless F_ASYNC
   *    is set.
   *  - The root or the ordinary-user read-only checks are run, chosen by
   *    the effective uid.
   *  - If those neither failed nor set F_READONLY, the resource is
   *    finally test-opened by rostat1().
   *
   * Returns -EINVAL when the resource cannot be stat()ed, otherwise the
   * result of the last check that ran.
   */
  static int
  statfile1 (char *exportname, unsigned long *flags)
  {

    struct stat st;
    int rc;

    if (stat (exportname, &st) < 0) {
        PERR ("Could not stat exported file %s: %m\n", exportname);
        return -EINVAL;
    }

    if (S_ISBLK (st.st_mode) && MAJOR (st.st_rdev) == 2) {
        if (!(*flags & F_ASYNC)) {
            // is floppy, make us sync
            *flags |= F_SYNC;
            MSG ("set sync flags on %s because it's a floppy\n", exportname);
        }
    }

    // PTB ro checks
    rc = (geteuid () == 0) ? rootstat1(exportname, flags, &st)
                           : userstat1(exportname, flags, &st);

    if (rc < 0 || (*flags & F_READONLY))
        return rc;

    // PTB final ro check
    return rostat1(exportname, flags);
  }

  /*
   * Build a copy of `exportname` in which every "%s" is replaced by the
   * dotted-quad form of the client address `caddr`.
   *
   *   exportname - resource name, possibly containing "%s"
   *   caddr      - client address; the four low-order bytes are printed
   *                from the most significant to the least significant
   *
   * Returns a newly malloc'ed string which the caller owns, or NULL when
   * caddr is 0 (address not known yet), memory runs out, or the
   * substitution fails.  Nothing is left allocated on failure.
   */
  static char *
  expandname (char *exportname, unsigned long caddr)
  {
    char *s;
    size_t maxlen;
    char s_caddr[16];   // "255.255.255.255" plus terminator
    int n;

    if (caddr == 0) {
	// PTB we need to substitute and we can't.
	PERR ("no information yet to expand filename %s: %m\n", exportname);
	return NULL;
    }

    // each 2 character "%s" grows to at most 15 characters
    n = nsubstr(exportname, "%s");
    maxlen = strlen (exportname) + 13 * (size_t) n;

    // zero-filled, so every byte beyond the copied name is already a
    // terminator while the string grows in place
    s = calloc (maxlen + 1, 1);
    if (!s) {
	PERR ("no memory to expand filename %s: %m\n", exportname);
	return NULL;
    }

    strcpy (s, exportname);
    snprintf (s_caddr, sizeof s_caddr, "%lu.%lu.%lu.%lu",
	     255 & (caddr >> 24), 255 & (caddr >> 16),
	     255 & (caddr >> 8), 255 & (caddr >> 0)
     );
    if (strgsub (s, "%s", s_caddr, maxlen + 1) < 0) {
        // do not leak the half-built name
        free (s);
        return NULL;
    }
    return s;
  }

  /*
   * Prepare every component name of the file object and adjust the
   * object's flags to suit the resources.
   *
   * A name containing "%s" is first replaced by its expansion with the
   * client address (*self->caddr); if that expansion fails the component
   * is skipped.  Each remaining name is passed to statfile1(), which may
   * set flag bits such as F_SYNC or F_READONLY in self->flags.
   *
   * Failures are counted locally but not reported to the caller.
   */
  static void
  statfile (struct nbd_file *self) {

    int idx;
    int failures = 0;

    for (idx = 0; idx < self->nfile; idx++) {

        // PTB look for instances of %s in the resource name
        if (strstr(self->names[idx], "%s") != NULL) {

            char *expanded = expandname (self->names[idx], *self->caddr);

            if (expanded == NULL) {
                // PTB can't test if we don't know its name
                failures++;
                continue;
            }
            self->names[idx] = expanded;
        }

        if (statfile1(self->names[idx], &self->flags) < 0)
            failures++;
    }

  }

  /*
   * Try to open the components of the device that are not open yet.
   *
   * On the first call (F_STAT clear) the resources are examined with
   * statfile(), which may expand names and set flag bits; F_STAT is then
   * set so this happens only once.  The open mode is derived from the
   * flags: read-only or read-write, plus O_SYNC, O_LARGEFILE and
   * O_DIRECT where applicable.
   *
   * Each newly opened component is probed by reading its first 512
   * bytes and is then rewound.  A component that fails the probe is
   * closed again and its exports[] slot put back to -1, so that it is
   * never mistaken for an open descriptor later on.
   *
   * If the device size is not known yet, getsize() is asked for it; the
   * outcome of that call is deliberately ignored.
   *
   * F_OPEN is set (and the generation count self->lives bumped) only
   * when no component failed.  F_CLOSED is cleared as soon as at least
   * one component is open.
   *
   * Returns 0 when every component is open (also when F_OPEN was already
   * set on entry), otherwise minus the number of components that could
   * not be opened or probed.
   */
  static int
  openfile(struct nbd_file * self) {

       int i;
       int errs = 0;
       int succ = 0;
       int openflags = 0;

       DEBUG("entered\n");
       if (self->flags & F_OPEN) {
           DEBUG("already open, exited %d\n", 0);
           return 0;
       }
       self->flags |= F_OPENWANTED;    // not fully open
       self->flags &= ~F_CLOSEWANTED;  // not closed


       // PTB first time through, check the file types and perms

       if (! (self->flags & F_STAT)) {
             // set ro/sync flags if we think we should
             unsigned long flags = self->flags;
             statfile(self);
             if (flags != self->flags) {
                 MSG("file flags changed from %#lx to %#lx\n", flags, self->flags);
             }
             // PTB we scanned once, don't do it again!
             self->flags |= F_STAT;
       }

       // now determine how we want to open
       openflags |= (self->flags&F_READONLY)? O_RDONLY : O_RDWR;
       if (self->flags &F_SYNC)
           openflags |= O_SYNC;

       #ifdef O_LARGEFILE
           openflags |= O_LARGEFILE;
       #endif

       #ifdef O_DIRECT
       if (self->flags & F_DIRECT)
           openflags |= O_DIRECT;
       #endif

       for (i = 0; i < self->nfile; i++) {

         char * exportname = self->names[i];
         int export = self->exports[i];

         if (export >= 0) {
             DEBUG("Skipped already open %s fd %d\n", exportname, export);
             succ++;
             continue;
         }

         export = open(exportname, openflags);

         if (export < 0) {
            DEBUG("Could not open exported file %s: %m\n", exportname);
            errs++;
            continue;
         }

         self->exports[i] = export;
         DEBUG("opened %s fd %d\n", exportname, export);
         // PTB checking
         if (1) {
           char c[8192], *d = c;
           // PTB align at 4096 for raw devices
           d = (char*)((((unsigned long)d + 4095) >> 12) << 12);
           if (read(export, d, 512) < 512) {
             DEBUG("Could not read sector 0 of file %s fd %d: %m\n",
               exportname, export);
             close(export);
             // forget the descriptor we just closed, otherwise the slot
             // would look open on the next pass
             self->exports[i] = -1;
             errs++;
             continue;
           }
           DEBUG("Successfully read sector 0 of file %s fd %d\n",
               exportname, export);
         }

         // PTB success
         seek(export,0,SEEK_SET);
         self->lastpt[i] = 0;
         succ++;
       }

       self->lastpoint = 0;
       if (self->size <= 0) {
          getsize(self);    // PTB changes SEEK point
          // don't know if that succeeded. Doesn't matter if not.
       }

       if (errs <= 0) {
         self->flags |= F_OPEN;          // fully open
         self->flags &= ~F_OPENWANTED;   // not fully closed
         self->lives++;                  // generation count
         DEBUG("opened file %0x for %dth time\n", (unsigned)self, self->lives);
       }

       if (succ > 0) {
         self->flags &= ~F_CLOSED;       // not fully closed
       }

       DEBUG("exited %d\n", -errs);
       return -errs;
  }

/*
 * Make sure the file object is open.
 *
 * Returns 0 at once when F_OPEN is set; otherwise calls the object's
 * open method and returns its result.  F_OPENWANTED is not consulted.
 */
   static int
   maybeopen (struct nbd_file *self) {

       return (self->flags & F_OPEN) ? 0 : self->open (self);
   }


  /*
   * Apply ioctl `nioctl` with argument `arg` to the components of the
   * file object, opening the object first if it is not open.
   *
   * Send ioctl to ALL components of file object. Silly! Probably we
   * should refuse if more than one component, or somehow just choose
   * one. FIXME
   *
   * Returns -ENODEV (after setting F_OPENWANTED) when the object cannot
   * be opened; otherwise minus the number of components that were
   * closed, could not be reopened, or on which the ioctl failed
   * (0 means every component succeeded).
   */
  static int ioctlfile(struct nbd_file *self, int nioctl, char *arg) {

       // succ is counted below but never reported
       int i, errs = 0, res, succ = 0;

       if (maybeopen(self) < 0) {
         self->flags |= F_OPENWANTED;
         return -ENODEV;
       }

       for (i = 0; i < self->nfile; i++) {

         char * exportname = self->names[i];
         int export = self->exports[i];
         long flags;

         // self-assignment; presumably silences an unused-variable
         // warning when DEBUG compiles to nothing
         exportname = exportname;

         if (export < 0) {
             DEBUG("Skipped closed %s fd %d\n", exportname, export);
             errs++;
             continue;
         }

         /*
          * To do some ioctls we need to have opened NONBLOCK, but using fcntl
          * to add the flag now does not seem to be enough, so we actually
          * reopen NONBLOCK here and then fcntl the flag back off again later
          */

         flags = fcntl(export, F_GETFL);
         if (flags != -1l && !(flags & O_NONBLOCK)) {
            // NOTE(review): a freshly opened descriptor starts at offset 0
            // while self->lastpt[i] and self->lastpoint are left alone
            // here - confirm the cached positions are invalidated
            // elsewhere.
            close(export);
            export = open(exportname, flags|O_NONBLOCK);
            self->exports[i] = export;
            if (export < 0) {
                // reopen failed: the object is no longer fully open
                self->flags &= ~F_OPEN;
                self->flags |= F_OPENWANTED;
                errs++;
                continue;
            }
         }

         res = ioctl(export, nioctl, arg);

         // put the original status flags back if we reopened above
         if (flags != -1l && !(flags & O_NONBLOCK)) {
            if (fcntl(export, F_SETFL, flags) == -1l) {
                // could not restore: drop this component.  This is not
                // counted in errs unless the ioctl itself failed too.
                close(export);
                self->exports[i] = -1;
                self->flags &= ~F_OPEN;
                self->flags |= F_OPENWANTED;
            }
         }
         if (res < 0) {
            DEBUG("Could not ioctl exported file %s: %m\n", exportname);
            errs++;
            continue;
         }
         succ++;
       }

       return -errs;
  }

  /*
   * Seek to the required point in the file object.
   *
   *   offset - displacement, interpreted according to `whence`
   *   whence - SEEK_SET: from the start of the device
   *            SEEK_CUR: from the last recorded position (self->lastpoint)
   *            SEEK_END: from the end of the device (self->size)
   *
   * If the object is not open it is opened first, but only when
   * F_OPENWANTED is set.
   *
   * Returns the resulting device offset on success.  Returns -ENODEV
   * when the object is not open and cannot (or must not) be opened,
   * -EINVAL for an unknown `whence`, and -EBADF when the target cannot
   * be reached.  In that last case the object, if open, is closed and
   * F_OPENWANTED is set so that it gets reopened later.
   */
  static s64 seekfile(struct nbd_file *self, s64 offset, int whence) {

      u64 a;

      if (!(self->flags & F_OPEN)) {
        if (!(self->flags & F_OPENWANTED))
                return -ENODEV;
        if (self->open(self) < 0) {
              return -ENODEV;
        }
      }

      switch(whence) {

        case SEEK_SET:
           a = offset;
           break;

        case SEEK_CUR:
           // relative to the current position, so the displacement has
           // to be applied rather than dropped
           a = self->lastpoint + offset;
           break;

        case SEEK_END:
           a = self->size + offset;
           break;

        default:
           return -EINVAL;
      }

      if (maybeseek(self, a) < 0) {
            // PTB close the device if we had a bad seek and RETRY set
        if (self->flags & F_OPEN) {
          self->close(self);
          self->flags |= F_OPENWANTED;
        }
        return -EBADF;
      }
      return a;
  }
  

#if DEBUG_DFLT > 0
  /*
   * Work out an absolute position within the component that owns
   * descriptor `fd`, from the bookkeeping kept in `self` (no system
   * call is made).
   *
   *   SEEK_SET - `start` itself
   *   SEEK_CUR - `start` plus the component's recorded position
   *   SEEK_END - `start` plus the component's recorded size
   *
   * Returns -1 when `fd` belongs to no component or `whence` is none of
   * the above.
   */
  static s64
  whereami(struct nbd_file *self, int fd, s64 start, int whence) {

      int slot = 0;

      if (whence == SEEK_SET)
          return start;

      // find the component that owns this descriptor
      while (slot < self->nfile && self->exports[slot] != fd)
          slot++;

      if (slot >= self->nfile)
          return -1L;

      if (whence == SEEK_CUR)
          return start + self->lastpt[slot];

      if (whence == SEEK_END)
          return start + self->sizes[slot];

      return -1L;
  }
#endif


  /*
   * Reliably send/recv data to/from file object
   *
   * Loops over read() or write() on fd until all len bytes of buf have
   * been transferred.  direction READ reads into buf; WRITE (and any
   * other value) writes from buf.  When F_SYNC is set each successful
   * write is followed by an fsync().
   *
   * Returns the number of bytes transferred (len) on success.  If a
   * read()/write() call fails, its negative result is returned; if it
   * transfers zero bytes, -EINVAL is returned.  In both cases any
   * partial progress is not reported in the return value.
   */
  static inline int
  fileit (struct nbd_file *self, int direction, int fd, void *buf, unsigned len) {

      char *cursor = buf;  // byte pointer: no arithmetic on void *
      int count = 0;       // signed, because it is also the return value
      int res;

      while (len > 0) {
         switch (direction) {

         case READ: // 0
           DEBUG("*[%u@%Lu]", len, self->lastpoint);

           res = read(fd, cursor, len);

           if (res <= 0) {
               #if DEBUG_DFLT > 0
               s64 off = whereami(self, fd, 0L, SEEK_CUR);
               #endif
               DEBUG("Read len %d from %Ld failed on fd %d: %m\n", len,off,fd);
               return (res < 0) ? res : -EINVAL;
           }
           DEBUG("Read %d bytes from fd %d in readfile\n", res, fd);
           break;

         default:
         case WRITE: // 1
           DEBUG("+[%u@%Lu]", len, self->lastpoint);

           res = write(fd, cursor, len);

           if (res <= 0) {
               #if DEBUG_DFLT > 0
               s64 off = whereami(self, fd, 0L, SEEK_CUR);
               #endif
               DEBUG("Write len %d at %Ld failed on fd %d: %m\n", len, off, fd);
               return (res < 0) ? res : -EINVAL;
           }
           DEBUG("Wrote %d bytes to fd %d in readfile\n", res, fd);
           if (self->flags & F_SYNC) {
               fsync(fd);
           }
           break;
         }

         // short transfer: advance and go round again for the rest
         len    -= res;
         cursor += res;
         count  += res;
      }

      return count;
   }


  /*
   * Release an advisory lock previously taken with lockfile().
   *
   * The lock region is expressed relative to the end (SEEK_END) of the
   * component file that holds the last byte of the object, i.e. the
   * descriptor resolve_offset() returns for self->size - 1.
   *
   * Returns the result of fcntl(F_SETLKW), or -ENODEV if the object
   * cannot be opened, or -EINVAL if no usable descriptor is found.
   */
  static int
  unlockfile(struct nbd_file * self, u64 from, s64 len) {
    struct nbd_seek where;
    struct flock request = {
        .l_type   = F_UNLCK,
        .l_whence = SEEK_END,
        .l_start  = from,
        .l_len    = len,
        .l_pid    = self->pid,
    };
    int status;

    // make sure the object is open before we touch its descriptors
    if (maybeopen(self) < 0) {
          self->flags |= F_OPENWANTED;
          return -ENODEV;
    }

    if (resolve_offset(self, self->size - 1, &where) < 0)
          return -EINVAL;

    if (where.fd < 0)
          return -EINVAL;

    status = fcntl (where.fd, F_SETLKW, &request);
    DEBUG("   lock %Ld-%Ld ->\n", from, from+len);
    return status;
  }

  /*
   * Take an advisory fcntl() lock of the given type (F_RDLCK / F_WRLCK).
   *
   * The region [from, from+len) is locked in a virtual area beyond the
   * last file component: it is expressed relative to SEEK_END on the
   * descriptor that holds the last byte of the object.
   *
   * A failed fcntl(F_SETLKW) is retried after a pause for EINTR, EACCES
   * and EAGAIN without limit, and for EDEADLK a bounded number of times.
   *
   * Returns 0 once the lock is held, -ENODEV if the object cannot be
   * opened, -EINVAL if no usable descriptor is found, or -errno from the
   * failing fcntl() call.
   */
  static int
  lockfile(struct nbd_file * self, u64 from, s64 len, int type) {

      const unsigned long pause = 100000;  // handed to microsleep() between tries
      int deadlock_budget = 30;            // EDEADLK retries we tolerate
      struct nbd_seek where;
      struct flock request = {
        .l_type   = type,
        .l_whence = SEEK_END,
        .l_start  = from,
        .l_len    = len,
        .l_pid    = self->pid,
      };
      int lockfd;

      // make sure the object is open before we touch its descriptors
      if (maybeopen(self) < 0) {
          self->flags |= F_OPENWANTED;
          return -ENODEV;
      }

      if (resolve_offset(self, self->size - 1, &where) < 0)
          return -EINVAL;

      lockfd = where.fd;
      if (lockfd < 0)
          return -EINVAL;

      // spin for lock
      for (;;) {
          int err;

          if (fcntl(lockfd, F_SETLKW, &request) >= 0)
              break;

          err = errno;

          if (err == EDEADLK) {
              if (--deadlock_budget < 0) {
                  DEBUG("resource %Ld-%Ld is already %slocked\n",
                          from, from+len, type == F_WRLCK ? "w" : "r");
                  return -err;
              }
          } else if (err != EINTR && err != EACCES && err != EAGAIN) {
              // anything else is not worth retrying
              return -err;
          }

          microsleep(pause);

          // rebuild the request from scratch before the next attempt
          request.l_type   = type;
          request.l_whence = SEEK_END;
          request.l_start  = from;
          request.l_len    = len;
          request.l_pid    = self->pid;
      }

      DEBUG("-> %slock %Ld-%Ld\n", type == F_WRLCK ? "w" : "r", from, from+len);
      return 0;
  }


/*
 * Close the file object unless it is already flagged F_CLOSED.
 * Returns 0 when nothing had to be done, else the result of the
 * object's close method.
 */
   static int
   maybeclose (struct nbd_file * self) {

       if (!(self->flags & F_CLOSED))
           return self->close(self);

       return 0;
   }

   static int
   maybereopen (struct nbd_file *self) {
       maybeclose(self);
       self->flags |= F_OPENWANTED;
       return maybeopen(self);
   }


   /*
    * Transfer len bytes between buf and the file object, starting at
    * object offset foffset.
    *
    * The request is cut into pieces that never cross a self->blksize
    * boundary.  For each piece maybeseek() positions the object and
    * returns the descriptor to use, then fileit() moves the data.  In
    * NBD_MIRROR_MODE a read is taken from the first export that select()
    * reports readable and a write is sent to every export in turn.
    *
    * direction is READ or WRITE.  When F_DOLOCK is set the whole range is
    * locked with lockfile() first (read or write lock to match) and
    * unlocked again on every exit path; any other direction then fails
    * with -EINVAL.
    *
    * Returns the number of bytes transferred.  Failure returns:
    *   -ENODEV   the object could not be opened
    *   -EINVAL   seeking failed even after the allowed reopen attempts
    *   < 0       whatever lockfile() returned, if locking failed
    *   tot       the bytes moved so far (possibly 0) if fileit() failed
    * On every failure taken via the fail label the object is closed and
    * flagged F_OPENWANTED.
    */
   static int
   opfile(struct nbd_file * self, int direction, char * buf, u64 foffset, unsigned len) {

            // split into sections of at most blocksize

            int boffset = 0;  // current offset within buf
            int tot     = 0;  // total bytes transferred so far
            const int retries
                        = 2;  // number of tries at an op allowed
            int tries   = 0;  // number of tries so far; shared by the seek
                              // and transfer retry paths and never reset
                              // between pieces
            short didlock = 0;         // set once lockfile() has succeeded
            unsigned locklen = len;    // original length, for unlock and logging
            u64      lockoff = foffset;// original offset, for unlock and logging

            DEBUG("opfile enters for type %d len %d from %Ld\n",
                  direction, len, foffset);

            if (maybeopen(self) < 0) {
              self->flags |= F_OPENWANTED;
              return -ENODEV;
            }

            if ( self->flags & F_DOLOCK) {
                int res;
                int l_type;
                switch(direction) {
                        case READ:
                        l_type = F_RDLCK;
                        break;

                        case WRITE:
                        l_type = F_WRLCK;
                        break;

                        default:
                        tot = -EINVAL;
                        goto fail;
                        break;
                }
                res = lockfile(self, foffset, len, l_type);
                if (res < 0)  {
                        tot = res;
                        goto fail;
                }
                didlock = 1;
            }


            while (len > 0) {

              int blockoffset = foffset % self->blksize; // offset within block
              unsigned sublen = 0;                 // transfer this much
              int export = -1;                     // descriptor chosen for this piece
              int count  = -1;                     // result of the transfer attempt
             
              // PTB cut sublen down to go up to block boundary only

              if ( len + blockoffset > self->blksize )
                sublen = self->blksize - blockoffset;
              else
                sublen = len;

           try_seek:

              // PTB this sets self->lasti, self->lastpoint, self->lastpt[lasti]
              export   = maybeseek(self, foffset);
              DEBUG("found offset %Ld in fd %d\n", foffset, export);

              // PTB returning INVAL this early is problematic, since we
              // may need to return "tot = 0" to properly indicate the
              // failure  type (local device).
              //   And when RETRY is set we
              // want to go through the rigmarole of closing the
              // device. But at this point I can't tell whether the
              // instruction is wrong (overlong, underzero) or the
              // device really failed.
              //  Maybe it needs maybeseek to return
              // the device index and relative offset, but not do the
              // seek, and we do the seek here? Or 

              if (export < 0) {
                // seek failed: reopen and seek again while tries remain
                if (tries++ < retries) {
                  if (maybereopen(self) >= 0)
                    goto try_seek;
                }
                maybeclose(self);
                self->flags |= F_OPENWANTED;
                tot = -EINVAL;
                goto fail;
              }

            try_fileit:

              switch(self->mode) {
              default:
              // NBD_LINEAR_MODE
              // NBD_STRIPE_MODE
                count = fileit(self, direction, export, buf + boffset, sublen);
                break;
              case NBD_MIRROR_MODE:
                switch(direction) {
                int j; 
                fd_set fds, * rfds;
                int fmax;

                case READ: // 0

                  // wait until some export is readable and read from the
                  // first one that is.
                  // NOTE(review): exports[j] is put into the fd_set without
                  // a check for -1; presumably every export is open at
                  // this point -- confirm.
                  rfds = & fds; 
                  fmax = 0;
                  FD_ZERO(rfds);
                  for (j = 0; j < self->nfile; j++) {
                    int fd = self->exports[j];
                    fmax = fmax < fd ? fd : fmax;
                    FD_SET(fd, rfds);
                  }

                  // on select failure count keeps its previous value and
                  // the check below decides whether to retry
                  if (select(fmax+1,rfds,NULL,NULL,NULL) <= 0)
                    break;

                  for (j = 0; j < self->nfile; j++) {
                    int fd = self->exports[j];
                    if (FD_ISSET(fd, rfds)){
                      export = fd;
                      break;
                    }
                  }

                  count = fileit(self, direction, export, buf + boffset, sublen);
                  break;

                case WRITE: // 1
                  // write the same piece to every export, stopping at the
                  // first one that fails
                  for (j = 0; j < self->nfile; j++) {
                    count = fileit(self, direction, self->exports[j], buf+boffset, sublen);
                    if (count <= 0)
                      break;
                  }
                  break;
                }
                break;
              }

              if (count <= 0) {
                // transfer failed: repeat it (without a new seek) while
                // tries remain, else give up returning what we have so far
                if (tries++ < retries) 
                  goto try_fileit;
                goto fail;
              }

              // account for the bytes just moved in the cached positions
              self->lastpoint += count;
              self->changedfd = 0;
              self->lastpt[self->lasti] += count;

              // PTB check if we can use the same seek point next time

              switch(self->mode) {
              case NBD_LINEAR_MODE:
                if (self->lastpt[self->lasti] >= self->sizes[self->lasti]) {
                  self->changedfd = 1;
                }
                break;
              case NBD_MIRROR_MODE:
                break;
              case NBD_STRIPE_MODE:
                if (count + blockoffset >= self->blksize) {
                  self->changedfd = 1;
                }
                break;
              }

              foffset += count;
              boffset += count;
              len     -= count;
              tot     += count;
            }

            // NOTE(review): the loop above only falls through with len == 0,
            // and tot is compared as unsigned here, so this test cannot
            // fire; the original request length is in locklen.
            if (tot < len) {
              // PTB we never get here. The fail was trapped earlier
              PERR("short fileop, %d instead of %d bytes\n", tot, len);
              goto fail;
            }

            if (didlock)
                unlockfile(self, lockoff, locklen);
            DEBUG("opfile exits with result %d for type %d len %d from %Ld\n",
                  tot, direction, locklen, lockoff);
            return tot;

         fail:

            // unlockfile() goes through maybeopen() itself, so release the
            // lock first and close the object afterwards
            if (didlock)
                unlockfile(self, lockoff, locklen);

            maybeclose(self);
            self->flags |= F_OPENWANTED;

            // here len and foffset are the remaining length and the offset
            // reached, not the values of the original request
            DEBUG("opfile exits with result %d for type %d len %d from %Ld\n",
                  tot, direction, len, foffset);
            return tot;
    }

   /*
    * Read len bytes from object offset foffset into buf.
    * Thin wrapper round opfile() with direction 0; returns its result.
    */
   static int
   readfile(struct nbd_file * self, char * buf, unsigned len, u64 foffset) {
           int nbytes;

           nbytes = opfile(self, 0, buf, foffset, len);
           DEBUG("readfile totalled %d bytes from %Ld\n", nbytes, foffset);

           return nbytes;
   }

   /*
    * Write len bytes from buf to the object at offset foffset.
    * Thin wrapper round opfile() with direction 1; returns its result.
    */
   static int
   writefile(struct nbd_file * self, char * buf, unsigned len, u64 foffset) {
           int nbytes;

           nbytes = opfile(self, 1, buf, foffset, len);
           DEBUG("writefile totalled %d bytes from %Ld\n", nbytes, foffset);

           return nbytes;
   }

   /*
    * fsync() every component file that currently has a descriptor.
    * Returns the sum of the fsync() results, so 0 means all succeeded
    * and a negative value means at least one failed.
    */
   static int
   syncfile(struct nbd_file * self) {
           int status = 0;
           int slot;

           for (slot = 0; slot < self->nfile; slot++) {
             int fd = self->exports[slot];

             // skip components that are not open
             if (fd < 0)
               continue;

             status += fsync(fd);
           }

           return status;
   }


   /*
    * Close the object (unless it is already flagged F_CLOSED) and open
    * it again.  A failing close aborts the reopen and its error is
    * returned; otherwise the result of the open method is returned.
    */
   static int
   reopenfile(struct nbd_file * self) {

         if (! (self->flags & F_CLOSED)) {
           int rc = self->close(self);

           if (rc < 0)
             return rc;
         }

         return self->open(self);
   }


  /*
   * Initialise a file object in the closed state.
   *
   * Records the component names and sizes, the layout mode and block
   * size, translates the option arguments into flag bits and installs
   * the method table.  No file is opened here: every export descriptor
   * starts out as -1 and F_CLOSED is set.
   *
   * The total size is the sum of the component sizes, or 0 if any
   * component size was given as <= 0 (i.e. is not known yet).
   * sync and async cancel each other out when both are given.
   *
   * Always returns 0.
   */
  int init_file(struct nbd_file * self, int nfile, int mode, s64 *sizes,
          int blksize, char* names[], int ro, int sync, int async, int lock,
          int direct, unsigned long *caddr) {

       short all_sizes_known = 1;
       int slot;

       // layout
       self->nfile   = nfile;
       self->mode    = mode;
       self->blksize = blksize;
       self->caddr   = caddr; // this is an alias back to the real caddr

       // cached position state
       self->lastpoint = 0;
       self->changedfd = 0;

       self->lives = 0;

       // per-component data and total size
       self->size = 0;
       for (slot = 0; slot < nfile; slot++) {
          self->names[slot]   = names[slot];
          self->exports[slot] = -1;

          if (sizes[slot] <= 0) {
              self->sizes[slot] = 0;
              all_sizes_known = 0;
              continue;
          }
          self->sizes[slot] = sizes[slot];
          self->size += self->sizes[slot];
       }
       // if one component size was not given, then we don't know total
       if (!all_sizes_known)
           self->size = 0;

       // option flags; the object starts out closed
       self->flags = 0;
       if (ro)
         self->flags |= F_READONLY;
       if (sync && ! async)
         self->flags |= F_SYNC;
       if (async && ! sync)
         self->flags |= F_ASYNC;
       if (lock)
         self->flags |= F_DOLOCK;
       if (direct)
         self->flags |= F_DIRECT;
       self->flags |= F_CLOSED;

       // method table
       self->open   = openfile;
       self->close  = closefile; // PTB !!! may cause problems
       self->reopen = reopenfile;
       self->read   = readfile;
       self->write  = writefile;
       self->seek   = seekfile;
       self->sync   = syncfile;
       self->ioctl  = ioctlfile;
       self->lock   = lockfile;
       self->unlock = unlockfile;

       return 0;
  }

