/*
 * A C++ scanner. This example uses the new (as of 4.0) and still experimental
 * longest match construction method.
 *
 * << <= <<= >> >= >>= are left out since angle brackets are used in templates.
 */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/*
 * Token identifiers passed as the first argument of token().
 *
 * Single character symbols are reported with their own character code as the
 * identifier (see the punct rule in the scanner), so the named tokens start
 * at 256, presumably to stay clear of that range.
 *
 * No scanner rule in this file emits TK_ShiftLeft, TK_ShiftRight,
 * TK_Whitespace or TK_Comment: the shift operators are left out on purpose
 * (see the note at the top of the file) and the whitespace and comment rules
 * have empty actions.
 */
#define TK_Dlit 256
#define TK_Slit 257
#define TK_Float 258
#define TK_Id 259
#define TK_NameSep 260
#define TK_Arrow 261
#define TK_PlusPlus 262
#define TK_MinusMinus 263
#define TK_ArrowStar 264
#define TK_DotStar 265
#define TK_ShiftLeft 266
#define TK_ShiftRight 267
#define TK_IntegerDecimal 268
#define TK_IntegerOctal 269
#define TK_IntegerHex 270
#define TK_EqualsEquals 271
#define TK_NotEquals 272
#define TK_AndAnd 273
#define TK_OrOr 274
#define TK_MultAssign 275
#define TK_DivAssign 276
#define TK_PercentAssign 277
#define TK_PlusAssign 278
#define TK_MinusAssign 279
#define TK_AmpAssign 280
#define TK_CaretAssign 281
#define TK_BarAssign 282
#define TK_DotDotDot 283
#define TK_Whitespace 284
#define TK_Comment 285

#define BUFSIZE 16384

/* End-of-input character, fed to the scanner after the last byte of real
 * input in order to flush out the final real token. It must scan as
 * whitespace: in this implementation the last token is always ignored, so the
 * last token has to be one that can safely be dropped. */

#define LAST_CHAR 0

char buf[BUFSIZE];
/* Line and column counters, both starting at 1. They are advanced by token()
 * over the text it is handed. */
int line = 1;
int col = 1;

/*
 * Print one token to stdout as "<id> text" followed by a newline, then
 * advance the line and column counters over the token text.
 *
 *   tok   token identifier (a TK_ value or a plain character code)
 *   data  first character of the token text; it is not required to be NUL
 *         terminated, only len characters are read
 *   len   number of characters in the token
 */
void token( int tok, char *data, int len )
{
	int pos;

	printf( "<%i> ", tok );
	if ( len > 0 )
		fwrite( data, 1, (size_t)len, stdout );
	putchar( '\n' );

	/* Newline and column bookkeeping. It is kept separate from the output
	 * above so that the routine still does some work when the output is
	 * commented out for performance testing. */
	for ( pos = 0; pos < len; pos++ ) {
		if ( data[pos] != '\n' ) {
			col++;
		}
		else {
			line++;
			col = 1;
		}
	}
}

/*
 * Scanner state. main() sets tokstart before the first call, and rebases both
 * tokstart and tokend each time it shifts unconsumed input to the front of
 * the buffer. The scanner actions read the two pointers to locate the text of
 * the token they report.
 */
struct Scanner
{
	/* Reset to 0 by the init block of the machine. Not referenced by any
	 * hand-written code in this file; presumably used by the generated
	 * scanner code -- confirm against the Ragel output. */
	int act;

	/* tokstart: first character of the token currently being scanned.
	 * tokend: the actions compute the length as tokend-tokstart+1, which
	 * implies that it addresses the last character of the match rather than
	 * one past it. */
	char *tokstart, *tokend;

	/* Not referenced by any hand-written code in this file. */
	int curs;

	/* Ragel directive. main() calls init() and execute() on this struct and
	 * neither is declared by hand, so presumably this line expands to those
	 * declarations -- confirm against the Ragel output. */
	%% interface;
};

/*
 * Scanner machine definition: a longest-match scanner over C++ source text.
 * Every token-producing pattern hands the matched text to token() as the pair
 * (tokstart, tokend-tokstart+1). Comments and whitespace are matched as well,
 * but with empty actions, so nothing is reported for them.
 */
%% Scanner
{
	init { 
		act = 0;
		tokend = 0;
	}

	# Building blocks for floating literals: fractional part (digits are
	# required on at least one side of the dot), exponent and type suffix.
	fract_const = digit* '.' digit+ | digit+ '.';
	exponent = [eE] [+\-]? digit+;
	float_suffix = [flFL];

	# The scanner proper. Each alternative is a pattern followed by the action
	# to run for it.
	main := |*

	# Single and double quoted literals, optionally wide (L prefix). A backslash
	# escapes whatever character follows it. An unescaped newline is not
	# accepted inside a literal.
	( 'L'? "'" ( [^'\\\n] | /\\./ )* "'" ) {token( TK_Slit, tokstart, tokend-tokstart+1 );};
	( 'L'? '"' ( [^"\\\n] | /\\./ )* '"' ) {token( TK_Dlit, tokstart, tokend-tokstart+1 );};

	# Identifiers
	( [a-zA-Z_] [a-zA-Z0-9_]* ) {token( TK_Id, tokstart, tokend-tokstart+1 );};

	# Floating literals. Either a fractional constant with optional exponent,
	# or plain digits with a mandatory exponent. The suffix is optional in both.
	( fract_const exponent? float_suffix? |
		digit+ exponent float_suffix? ) {token( TK_Float, tokstart, tokend-tokstart+1 );};
	
	# Integer decimal. Leading part buffered by float.
	# Up to three of the suffix letters u, l, U, L may follow.
	( ( '0' | [1-9] [0-9]* ) [ulUL]{0,3} ) {token( TK_IntegerDecimal, tokstart, tokend-tokstart+1 );};

	# Integer octal. Leading part buffered by float.
	# NOTE(review): the digit class is 0-9, so the digits 8 and 9 are accepted
	# here, and only two suffix letters are allowed (decimal allows three).
	( '0' [0-9]+ [ulUL]{0,2} ) {token( TK_IntegerOctal, tokstart, tokend-tokstart+1 );};

	# Integer hex. Leading 0 buffered by float.
	# NOTE(review): only the lower case x prefix is matched.
	( '0' ( 'x' [0-9a-fA-F]+ [ulUL]{0,2} ) ) {token( TK_IntegerHex, tokstart, tokend-tokstart+1 );};

	# Compound symbols of two characters, plus the three character arrow-star.
	# Only buffer the second item, first buffered by symbol.
	'::' {token( TK_NameSep, tokstart, tokend-tokstart+1 );};
	'==' {token( TK_EqualsEquals, tokstart, tokend-tokstart+1 );};
	'!=' {token( TK_NotEquals, tokstart, tokend-tokstart+1 );};
	'&&' {token( TK_AndAnd, tokstart, tokend-tokstart+1 );};
	'||' {token( TK_OrOr, tokstart, tokend-tokstart+1 );};
	'*=' {token( TK_MultAssign, tokstart, tokend-tokstart+1 );};
	'/=' {token( TK_DivAssign, tokstart, tokend-tokstart+1 );};
	'%=' {token( TK_PercentAssign, tokstart, tokend-tokstart+1 );};
	'+=' {token( TK_PlusAssign, tokstart, tokend-tokstart+1 );};
	'-=' {token( TK_MinusAssign, tokstart, tokend-tokstart+1 );};
	'&=' {token( TK_AmpAssign, tokstart, tokend-tokstart+1 );};
	'^=' {token( TK_CaretAssign, tokstart, tokend-tokstart+1 );};
	'|=' {token( TK_BarAssign, tokstart, tokend-tokstart+1 );};
	'++' {token( TK_PlusPlus, tokstart, tokend-tokstart+1 );};
	'--' {token( TK_MinusMinus, tokstart, tokend-tokstart+1 );};
	'->' {token( TK_Arrow, tokstart, tokend-tokstart+1 );};
	'->*' {token( TK_ArrowStar, tokstart, tokend-tokstart+1 );};
	'.*' {token( TK_DotStar, tokstart, tokend-tokstart+1 );};

	# Three char compounds, first item already buffered.
	'...' {token( TK_DotDotDot, tokstart, tokend-tokstart+1 );};

	# Single char symbols: any punctuation character other than underscore and
	# the two quote characters. The character code itself is the token id.
	( punct - [_"'] ) {token( tokstart[0], tokstart, tokend-tokstart+1 );};

	# Comments and whitespace. The actions are empty, so none of these are
	# reported through token().
	# The $0 and @1 priorities rank the transition that completes the
	# terminator above the any* loop, so that a comment ends at the first
	# terminator instead of running on to a later one.
	# A line comment is only matched together with its terminating newline.
	# The last pattern takes any run of characters outside 33..126: spaces,
	# control characters, and also the LAST_CHAR value 0 pushed by main.
	'/*' ( any* $0 '*/' @1 ) {};
	'//' ( any* $0 '\n' @1 ) {};
	( any - 33..126 )+ {};

	*|;
}


/*
 * Scan standard input and print every token found.
 *
 * Input is read into the fixed size buffer buf in chunks. After each chunk
 * has been run through the scanner, the text from tokstart onwards is moved
 * to the front of the buffer so that the next read appends to it. Once input
 * is exhausted a single LAST_CHAR is fed to the scanner to flush out the
 * final token.
 *
 * Exits with status 1 after printing a message to stderr when the scanner
 * reports a failure or when a single token does not fit in the buffer.
 */
int main()
{
	Scanner scanner;
	scanner.init();

	/* Number of bytes carried over at the front of buf from the last pass. */
	int have = 0;
	bool sentLastChar = false;

	/* Tokstart must be valid before the first execute call. */
	scanner.tokstart = buf;

	for ( ;; ) {
		/* Append fresh input behind the carried-over bytes. */
		int newd = fread( buf+have, 1, BUFSIZE-have, stdin );
		if ( newd == 0 ) {
			/* Nothing more to read. Stop once the flush character has
			 * already been through the scanner. */
			if ( sentLastChar )
				break;

			/* Otherwise push the flush character. There is always at least
			 * one free spot for it. */
			sentLastChar = true;
			buf[have] = LAST_CHAR;
			newd = 1;
		}

		int len = have + newd;
		int rtn = scanner.execute( buf+have, newd );

		if ( rtn < 0 ) {
			/* Machine failed before finding a token. */
			fprintf(stderr, "PARSE ERROR\n" );
			exit(1);
		}

		if ( scanner.tokstart == buf && len == BUFSIZE ) {
			/* No failure yet, but the current token already fills the whole
			 * buffer, so there is no room to read the rest of it. */
			fprintf(stderr, "TOKEN TOO BIG\n" );
			exit(1);
		}

		/* No failure and room still left: drop everything in front of the
		 * current token, slide the remainder to the start of the buffer and
		 * rebase the scanner pointers by the same distance. */
		int consumed = scanner.tokstart - buf;
		have = len - consumed;
		memmove( buf, scanner.tokstart, have );
		scanner.tokend -= consumed;
		scanner.tokstart = buf;
	}

	return 0;
}
