/* Time-stamp: <2006-04-17 19:06:02 poser>
 *
 * Convert text containing various 7-bit ASCII escapes to UTF-8 Unicode.
 *
 * Copyright (C) 2005-2006 William J. Poser (billposer@alum.mit.edu)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 * or go to the web page:  http://www.gnu.org/licenses/gpl.txt.
 */

#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#ifdef HAVE_LOCALE_H
#include <locale.h>
#endif
#ifdef HAVE_LIBINTL_H
#include <libintl.h>
#define _(String) gettext(String)
#else
#define _(x) (x)
#endif
#include "unicode.h"
#include "enttbl.h"
#include "exitcode.h"
/*
 * Build time-stamp string printed by ShowVersion(). It is left empty when
 * the compiler does not predefine both __DATE__ and __TIME__.
 */
#if defined(__DATE__) && defined(__TIME__)
#define HAVE_DATE_TIME
char compdate[]= "Compiled " __DATE__ " " __TIME__ ;
#else
char compdate[]= "";
#endif

/* Version string (from PACKAGE_VERSION) and the program name used in messages. */
char version[]=PACKAGE_VERSION;
char pgname[]="ascii2uni";

/* Size handed to fgets() when main() reads input lines; also sizes the line and entity-name buffers. */
#define LBUFSIZE 2048

/*
 * Write the version banner to stderr: program name and version, the
 * compilation time-stamp when one is available, the copyright line and
 * the licence notice.
 */
void
ShowVersion(void){
  FILE *out = stderr;

  fprintf(out,"\n%s  %s\n",pgname,version);
#ifdef HAVE_DATE_TIME
  fputs(compdate,out);
  fputc('\n',out);
#endif
  fputs("Copyright (C) 2005-2006 William J. Poser\n",out);
  fputs(_("Released under the terms of the GNU General Public License.\n\n"),out);
}

/*
 * Write the usage summary to stderr: a one-paragraph description, the
 * general flags, and one line (sometimes two) per conversion format.
 *
 * The descriptions of -I and -J name the formats that main() actually
 * implements: -I scans "=XX" (Quoted Printable, RFC 2045) and -J scans
 * "%XX" (URI escapes, RFC 2396).
 */
void
ShowUsage(void){
  fprintf(stderr,_("This program is a filter which converts 7-bit ASCII text\n"
"containing various representations for non-ASCII characters\nto UTF-8 Unicode.\n"));
  fprintf(stderr,_("Usage: %s [flags]\n"),pgname);
  fprintf(stderr,_("       -h Print this usage message.\n"));
  fprintf(stderr,_("       -q Quiet - don't chat.\n"));
  fprintf(stderr,_("       -v Print version information.\n"));
  fprintf(stderr,_("       -p Pure. The input consists of escapes separated by whitespace.\n"));
  fprintf(stderr,_("   Give at most one of the following conversion specifications:\n"));
  fprintf(stderr,
          _("       -A Convert hexadecimal numbers with prefix U in angle-brackets(<U00E9>)\n"));
  fprintf(stderr,
          _("       -B Convert \\x-escaped hexadecimal numbers (\\x00E9)\n"));
  fprintf(stderr,
          _("       -C Convert \\x-escaped hexadecimal numbers in braces (\\x{00E9})\n"));
  fprintf(stderr,
          _("       -D Convert decimal HTML numeric character references (&#0233;)\n"));
  fprintf(stderr,
          _("       -E Convert hexadecimal with prefix U (U00E9)\n"));
  fprintf(stderr,
          _("       -F Convert hexadecimal with prefix u (u00E9)\n"));
  fprintf(stderr,
          _("       -G Convert hexadecimal in single quotes with prefix X (X\'00E9\')\n"));
  fprintf(stderr,
          _("       -H Convert hexadecimal HTML numeric character references (&#x00E9;)\n"));
  /* "=XX" is Quoted Printable, not URI escaping. */
  fprintf(stderr,
          _("       -I Convert hexadecimal UTF-8 with each byte's hex preceded by an =-sign (=C3=A9)\n\t\tThis is the Quoted Printable format defined by RFC 2045.\n"));
  /* "%XX" is the URI escape; the doubled %% survives fprintf as a single %. */
  fprintf(stderr,
          _("       -J Convert hexadecimal UTF-8 with each byte's hex preceded by a %%-sign  (%%C3%%A9).\n\t\tThis is the URI escape format defined by RFC 2396.\n"));
  fprintf(stderr,
          _("       -K Convert octal UTF-8 with each byte escaped by a backslash (\\303\\251)\n"));
  fprintf(stderr,
          _("       -L Convert \\u-escaped hex (\\u00E9) within the BMP (U+0000-U+FFFF),\n\t\t\\U-escaped hex (\\U00010024) outside it.\n"));
  fprintf(stderr,
          _("       -M Convert hexadecimal SGML numeric character references (\\#x00E9;)\n"));
  fprintf(stderr,
          _("       -N Convert decimal SGML numeric character references (\\#0233;)\n"));
  fprintf(stderr,
          _("       -O Convert octal escapes for the three low bytes in big-endian order (\\000\\000\\351)\n"));
  fprintf(stderr,
          _("       -P Convert hexadecimal numbers with prefix U+ (U+00E9)\n"));
  fprintf(stderr,
          _("       -Q Convert HTML character entities (&eacute;)\n"));
  fprintf(stderr,
          _("       -R Convert raw hexadecimal numbers (00E9)\n"));
  fprintf(stderr,
          _("       -S Convert hexadecimal escapes for the three low bytes in big-endian order (\\x00\\x00\\xE9)\n"));
  fprintf(stderr,
          _("       -T Convert decimal escapes for the three low bytes in big-endian order (\\d000\\d000\\d233)\n"));
  fprintf(stderr,
          _("       -U Convert \\u-escaped hex (\\u00E9)\n"));
  fprintf(stderr,
          _("       -V Convert \\u-escaped decimal (\\u0233)\n"));
  fprintf(stderr,
          _("       -X Convert standard form hexadecimal numbers (0x00E9)\n"));
  fprintf(stderr,
          _("       -Y Convert all three HTML escape types:\n\t\thexadecimal numeric, decimal numeric, and character entity.\n"));
  fprintf(stderr,
          _("       -Z <format> Convert input using the supplied format.\n"));
  fprintf(stderr,_("Report bugs to: billposer@alum.mit.edu\n"));
}

/* Input line buffer: main() reads each line of stdin into it with fgets(). */
static char lbuf [LBUFSIZE+1];


/* The length of the longest character entity */
/* NOTE(review): MAXENTLEN is not referenced in the visible source - confirm it is still needed. */
#define MAXENTLEN 8

int main (int ac, char *av[])
{
  char *SplitFormat = "\\%1[uU]%X%n"; /* This is for BMPSplit */

  char *Afmt = "<U%lX>";
  char *Bfmt = "\\x%lX";
  char *Cfmt = "\\x{%lX}";
  char *Dfmt = "&#%ld;"; 
  char *Efmt = "U%lX";
  char *Ffmt = "u%lX";
  char *Gfmt = "X\'%lX\'";
  char *Hfmt = "&#x%lX;"; 
  char *Ifmt = "=%2lX"; 		/* UTF-8 */
  char *Jfmt = "%%%2lX"; 		/* UTF-8 */
  char *Kfmt = "\\%3lo"; 		/* UTF-8 */
  char *Mfmt = "\\#x%lX;"; 
  char *Nfmt = "\\#%ld;";
  char *Ofmt = "\\%03o\\%03o\\%03o";
  char *Pfmt = "U+%lX";
  char *Qfmt = "&%[abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789];";
  char *Rfmt = "%lX"; 
  char *Sfmt = "\\x%02x\\x%02x\\x%02x";	
  char *Tfmt = "\\d%03d\\d%03d\\d%03d";
  char *Ufmt = "\\u%8lX";
  char *Vfmt = "\\u%8ld";
  char *Xfmt = "0x%lX";

  char *fmt = Xfmt;		/* Default is plain hex format */
  char afmt [67+2+1+2];
  char aHfmt [8+2+1];
  char aDfmt [8+2+1];
  char cbuf[5];

  UTF32 num;
  int oc;			/* Command line option flag */
  int Converted;
  long TokenNumber;
  long ReplacedNumber;
  int BMPSplit = 0; 
  int VerboseP = 1;
  int UTF8ValueP = 0;		/* Are incoming values UTF-8? */
  int AllHTMLP = 0; 		/* Translate all three kinds of HTML escape */
  int PureP = 0;
  int Word_Length;
  int NConsumed;
  int LineNo;
  char *str;
  char *iptr;
  int eof;
  char SplitStr[3];
  char enam[LBUFSIZE];
  long MicrosoftStyle = 0L;
  unsigned char b1;		/* Used for byte-wise encoding */
  unsigned char b2;
  unsigned char b3;

  extern int optind;
  extern int opterr;
  extern int optopt;
  extern void putu8 (unsigned long);
  extern char * Get_Word(FILE *, int *, int *);

  opterr = 0;

#ifdef HAVE_SETLOCALE
   setlocale(LC_ALL,"");
#endif
#ifdef HAVE_LIBINTL_H
   bindtextdomain (PACKAGE,LOCALEDIR);
   textdomain (PACKAGE);
#endif

  /* Handle command line arguments */

   while( (oc = getopt(ac,av,"ABCDEFHIJKLMNOPQRSTUVXYZ:hpqv")) != EOF){
     switch(oc){
     case 'A':
       fmt = Afmt;
       break;
     case 'X':
       fmt = Xfmt;
       break;
     case 'O':
       fmt = Ofmt;
       break;
     case 'S':
       fmt = Sfmt;
       break;
     case 'T':
       fmt = Tfmt;
       break;
     case 'P':
       fmt = Pfmt;
       break;
     case 'Q':
       fmt = Qfmt;
       break;
     case 'R':
       fmt = Rfmt;
       break;
     case 'B':
       fmt = Bfmt;
       break;
     case 'C':
       fmt = Cfmt;
       break;
     case 'D':
       fmt = Dfmt;
       break;
     case 'E':
       fmt = Efmt;
       break;
     case 'F':
       fmt = Ffmt;
       break;
     case 'G':
       fmt = Gfmt;
       break;
     case 'H':
       fmt = Hfmt;
       break;
     case 'I':
       fmt = Ifmt;
       UTF8ValueP = 1;
       break;
     case 'J':
       fmt = Jfmt;
       UTF8ValueP = 1;
       cbuf[0] = '0';
       cbuf[1] = 'x';
       break;
     case 'K':
       fmt = Kfmt;
       cbuf[0] = '\\';
       UTF8ValueP = 1;
       break;
     case 'L':
       fmt = SplitFormat;
       BMPSplit =1;
       break;
     case 'M':
       fmt = Mfmt;
       break;
     case 'N':
       fmt = Nfmt;
       break;
     case 'U':
       fmt = Ufmt;
       break;
     case 'V':
       fmt = Vfmt;
       break;
     case 'Y':
       fmt = Qfmt;
       AllHTMLP = 1;
       break;
     case 'Z':
       fmt = optarg;
       break;
     case 'p':
       PureP = 1;
       break;
     case 'q':
       VerboseP = 0;
       break;
     case 'h':
       ShowUsage();
       exit(INFO);
       break; 			/* NOTREACHED */
     case 'v':
       ShowVersion();
       exit(INFO);
       break; 			/* NOTREACHED */
     case ':':
       fprintf(stderr,_("%s: missing argument to option flag %c.\n"),pgname,optopt);
       exit(BADOPTIONARG);
     default:
       fprintf(stderr,_("%1$s: invalid option flag %2$c\n"),pgname,optopt);
       ShowVersion();
       ShowUsage();
       exit(INFO);
     }
   }

   if( (fmt == Rfmt) && (!PureP) ) {
     fprintf(stderr,_("It isn't possible to parse raw hex unicode out of ASCII text.\n"));
     exit(BADOPTION);
   }

   if(AllHTMLP && PureP) {
     fprintf(stderr,_("Conversion of all three HTMl formats is not supported in pure mode.\n"));
     exit(BADOPTION);
   }

   if(AllHTMLP) {
     sprintf(aDfmt,"%s%%n",Dfmt);
     sprintf(aHfmt,"%s%%n",Hfmt);
   }

   sprintf(afmt,"%s%%n",fmt);	/* Add %n for NConsumed */
   ReplacedNumber = 0L;
   TokenNumber = 0L;
   /*
    * This is the case in which the input consists entirely of escapes
    * except for arbitrary (but non-null) amounts of intervening whitespace.
    */

   if(PureP) {
     while(1){
       str = Get_Word(stdin,&Word_Length,&eof);
       if(eof) break; 
       if(Word_Length == 0) continue;
       TokenNumber++;
       if(str == NULL){
	 fprintf(stderr,_("%1$s: failed to allocate storage for input token %2$ld.\n"),
		 pgname,TokenNumber);
	 exit(OUTOFMEMORY);
       }
       if(fmt == Qfmt) {
	 Converted = sscanf(str,afmt,&enam,&NConsumed);
	 num = LookupCodeForEntity(enam);
	 if(!num) {
	   num = UNI_REPLACEMENT_CHAR;
	   fprintf(stderr,"ascii2uni: unknown HTML character entity \"&%s;\"\n",
		   enam);
	   ReplacedNumber++;
	   Converted = (-1);
	 }
	 else Converted = 1;
       }
       else if( (Ofmt == fmt) || (Sfmt == fmt) || (Tfmt == fmt)) {
	 Converted = sscanf(str,afmt,&b1,&b2,&b3,&NConsumed);
	 switch(Converted)
	   {
	   case 3:
	     num = (((b1 * 256) + b2) * 256) + b3;
	     break;
	   case 2:
	     num = (b1 * 256) + b2;
	     break;
	   case 1:
	     num = b1;
	     break;
	   default:
	     break;
	     /* This case is handled below */
	 }
       }
       else {
	 Converted = sscanf(str,afmt,&num,&NConsumed);
       }

       if(Converted < 1) {
	 fprintf(stderr,_("Ill-formed input %1$s at token %2$lu\n"),str,TokenNumber);
	 exit(BADRECORD); 
       }
       else if(Converted > 3) {
	 fprintf(stderr,_("The character encoded as %1$s at token %2$lu is outside the Unicode range.\n\tEmitting Unicode replacement character.\n"),
		 str,TokenNumber);
	 putu8(UNI_REPLACEMENT_CHAR);
       } 
       else {
	 if (UTF8ValueP) putchar(num);
	 else putu8(num);
	 if( (fmt == Dfmt) || (fmt == Hfmt) || (fmt == Qfmt)) {
	   if(*(str+NConsumed-1) != ';') MicrosoftStyle++;
	 }
       }
       free((void *)str);
     }
     goto done;
   }

   /* This is the case in which the Unicode escapes are embedded in ASCII text */

   LineNo = 0;
   while(fgets(lbuf,LBUFSIZE,stdin) != NULL) {
     LineNo++;
     iptr = lbuf;
     if(fmt == Jfmt) {
       while(*iptr) {
	 if(*iptr == '%') {
	   if(*++iptr) {
	     if(isxdigit(*iptr++)) {
	       if(*iptr) {
		 if(isxdigit(*iptr)) { /* match */
		   cbuf[2] = *(iptr-1);
		   cbuf[3] = *iptr;
		   cbuf[4] = '\0';
   /*		   fprintf(stderr,"cbuf = %s\n",cbuf); */
		   num = strtoul(cbuf,NULL,16);
		   putchar(num);
		   TokenNumber++;
		   iptr++;
		 }
		 else {		/* We have % X foo */
		   putchar('%');
		   putchar(*(iptr-1));
		   if(*iptr != '%') putchar(*iptr++);
		   continue;
		 }
	       }
	       else {		/* We have % X EOL */
		 putchar('%');
		 putchar(*(iptr-1));
		 putchar('\n');
		 break;
	       }
	     }
	     else { 		/* We have % foo */
		 putchar('%');
		 if(*iptr != '%') putchar(*iptr++);
		 continue;
	     }
	   }	     
	   else {		/* We have % EOL */
	     putchar('%');
	     putchar('\n');
	     break;
	   }
	 }
	 else {
	   putchar(*iptr++);
	   continue;
	 }
       }
     } /* End of special case for J format */

     while (*iptr) { 
       if(BMPSplit) {
	 if(sscanf(iptr,SplitFormat,&SplitStr,&num,&NConsumed)) {
	   if( (num <= 0xFFFF) && (SplitStr[0] == 'U')) {
	     fprintf(stderr,_("Warning: the code \\U%1$08lX at line %2$d falls within the BMP.\n"),
		     num,LineNo);
	   }
	   if( (num > 0xFFFF) && (SplitStr[0] == 'u')) {
	     fprintf(stderr,_("Warning: the code \\u%1$08lX at line %2$d falls outside the BMP.\n"),
		     num,LineNo);
	   }
	   putu8(num);
	   iptr+=NConsumed;
	   TokenNumber++;
	 }
	 else putchar(*iptr++);
       }
       else if (fmt == Qfmt) {
	 if (AllHTMLP){
	   if(sscanf(iptr,aHfmt,&num,&NConsumed)) {
	     putu8(num);
	     iptr+=NConsumed;
	     if(*(iptr-1) != ';') MicrosoftStyle++;
	     TokenNumber++;
	     continue;
	   }
	   else if(sscanf(iptr,aDfmt,&num,&NConsumed)) {
	     putu8(num);
	     iptr+=NConsumed;
	     if(*(iptr-1) != ';') MicrosoftStyle++;
	     TokenNumber++;
	     continue;
	   }
	 }
	 if(sscanf(iptr,afmt,&enam,&NConsumed)) {
	   if( (num = LookupCodeForEntity(enam))) {
	     putu8(num);
	     iptr+=NConsumed;
	     if(*(iptr-1) != ';') MicrosoftStyle++;
	     TokenNumber++;
	   }
	   else {
	     fprintf(stderr,"ascii2uni: unknown HTML character entity \"&%s;\" at line %d\n",
		     enam,LineNo);
	     putu8(UNI_REPLACEMENT_CHAR);
	     iptr+=NConsumed;
	     ReplacedNumber++;
	   }
	 }
	 else putchar(*iptr++);
       } /* End of Qfmt case */
       else if( (Ofmt == fmt) || (Sfmt == fmt) || (Tfmt == fmt)) {
	 Converted=sscanf(iptr,afmt,&b1,&b2,&b3,&NConsumed);
	 switch(Converted)
	   {
	   case 3:
	     num = (((b1 * 256) + b2) * 256) + b3;
	     putu8(num);iptr+=NConsumed;
	     break;
	   case 2:
	     num = (b1 * 256) + b2;
	     putu8(num);iptr+=NConsumed;
	     break;
	   case 1:
	     num = b1;
	     putu8(num);iptr+=NConsumed;
	     break;
	   case 0:
	     putchar(*iptr++);
	     break;
	   default:
	     fprintf(stderr,_("The character encoded as %1$s at token %2$lu is outside the Unicode range.\n\tEmitting Unicode replacement character.\n"),
		     str,TokenNumber);
	     putu8(UNI_REPLACEMENT_CHAR);
	   }
	   TokenNumber++;
       }
       else {			/* Default - not BMPSplit, HTML, or byte format */
	  if(sscanf(iptr,afmt,&num,&NConsumed)) {
	   if (UTF8ValueP) putchar(num);
	   else putu8(num);
	   iptr+=NConsumed;
	   if(fmt == Hfmt) {
	     if(*(iptr-1) != ';') MicrosoftStyle++;
	   }
	   else if(fmt == Dfmt) {
	     if(*(iptr-1) != ';') MicrosoftStyle++;
	   }
	   TokenNumber++;
	 }
	 else putchar(*iptr++);
       }
     } /* Loop over current line */
   } /* Loop over input lines */

done:
   if(VerboseP) {
     fprintf(stderr,_("%ld tokens converted\n"),TokenNumber);
     fprintf(stderr,_("%ld tokens replaced with Unicode Replacement Character\n"),ReplacedNumber);
     if(MicrosoftStyle) {
       fprintf(stderr,
	       _("%ld Microsoft-style (lacking final semi-colon)\n"),MicrosoftStyle);
     }
   }
   exit(SUCCESS);
}

