/* Search/replace functions (with and without regular expressions).

	Copyright (C) 1993-1998 Sebastiano Vigna 
	Copyright (C) 1999-2004 Todd M. Lewis and Sebastiano Vigna

	This file is part of ne, the nice editor.

	This program is free software; you can redistribute it and/or modify it
	under the terms of the GNU General Public License as published by the
	Free Software Foundation; either version 2, or (at your option) any
	later version.
	
	This program is distributed in the hope that it will be useful, but
	WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
	General Public License for more details.
	
	You should have received a copy of the GNU General Public License along
	with this program; see the file COPYING.  If not, write to the Free
	Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
	02111-1307, USA.  */


#include "ne.h"
#include "regex.h"


/* This is the initial allocation size for regex.library. */

#define START_BUFFER_SIZE 4096


/* This array is used both by the Boyer-Moore algorithm and by the regex
library. It is updated if b->find_string_changed is TRUE (it should always
be the first time the string is searched for). */

static unsigned int d[256];

/* This macro upper cases a character or not, depending on the boolean
sense_case. It is used in find(). Note that the argument *MUST* be unsigned. */

#define CONV(c)	(sense_case ? c : up_case[c])


/* Performs a search for the given pattern with a simplified Boyer-Moore
   algorithm starting at the given position, in the given direction, skipping a
   possible match at the current cursor position if skip_first is TRUE. If dir
   is 0, it is fetched from b->opt.search_back. If pattern is NULL, it is
   fetched from b->find_string. In this case, b->find_string_changed is
   checked, and, if it is FALSE, the string is not recompiled. Please check to
   set b->find_string_changed whenever a new string is set in
   b->find_string. The cursor is moved on the occurrence position if a match is
   found. */

int find(buffer * const b, const unsigned char *pattern, int dir, int skip_first) {

	int sense_case = (b->opt.case_search != 0);
	unsigned char c, first_char;
	unsigned char *p;
	long i, m;
	int recompile_string;
	long y;
	line_desc *ld;

	if (!pattern) {
		pattern = b->find_string;
		recompile_string = b->find_string_changed || b->last_was_regexp;
	}
	else recompile_string = TRUE;

	if (!dir) dir = b->opt.search_back ? -1 : 1;

	if (!pattern || !(m = strlen(pattern))) return ERROR;

	if (recompile_string) for(i = 0; i < sizeof d / sizeof *d; i++) d[i] = m;

	ld = b->cur_line_desc;
	y = b->cur_line;

	if (dir > 0) {

		if (recompile_string) {
			for(i = 0; i < m - 1; i++) d[CONV(pattern[i])] = m - i-1;
			b->find_string_changed = 0;
		}

		p = ld->line + b->cur_pos + m - 1 + (skip_first ? 1 : 0);
		first_char = CONV(pattern[m - 1]);

		while(y < b->num_lines) {

			assert(ld->ld_node.next != NULL);

			if (ld->line_len >= m) {

				while((p - ld->line) < ld->line_len) {

					if ((c = CONV(*p)) != first_char) p+=d[c];
					else {
						for (i = 1; i < m; i++)
							if (CONV(*(p - i)) != CONV(pattern[m - i-1])) {
								p+=d[c];
								break;
							}
						if (i == m) {
							goto_line(b, y);
							goto_pos(b, (p - ld->line) - m + 1);
							return OK;
						}
					}
				}
			}

			ld = (line_desc *)ld->ld_node.next;
			if (ld->ld_node.next) p = ld->line + m-1;
			y++;
		}
	}
	else {

		if (recompile_string) {
			for(i = m - 1; i > 0; i--) d[(unsigned char)CONV(pattern[i])] = i;
			b->find_string_changed = 0;
		}

		p = ld->line + (b->cur_pos > ld->line_len - m ? ld->line_len - m : b->cur_pos + (skip_first ? -1 : 0));
		first_char = CONV(pattern[0]);

		while(y >= 0) {

			assert(ld->ld_node.prev != NULL);

			if (ld->line_len >= m) {

				while((p - ld->line) >= 0) {

					if ((c = CONV(*p)) != first_char) p-=d[c];
					else {
						for (i = 1; i < m; i++)
							if (CONV(*(p + i)) != CONV(pattern[i])) {
								p-=d[c];
								break;
							}
						if (i == m) {
							goto_line(b, y);
							goto_pos(b, p - ld->line);
							return OK;
						}
					}
				}
			}

			ld = (line_desc *)ld->ld_node.prev;
			if (ld->ld_node.prev) p = ld->line + ld->line_len - m;
			y--;
		}
	}

	return NOT_FOUND;
}



/* Replaces n characters with the given string at the current cursor position,
   and then moves it to the end of the string. */

int replace(buffer * const b, const int n, const char * const string) {

	int len;

	assert(string != NULL);

	len = strlen(string);

	start_undo_chain(b);

	delete_stream(b, b->cur_line_desc, b->cur_line, b->cur_pos, n);

	if (len) insert_stream(b, b->cur_line_desc, b->cur_line, b->cur_pos, string, len);

	end_undo_chain(b);

	goto_pos(b, b->cur_pos + len);

	return OK;
}



/* The following variables are used by regex. In particular, re_reg holds
the stard/end of the extended replacement registers. */

static struct re_pattern_buffer re_pb;
static struct re_registers re_reg;

/* This string is used to replace the dot in UTF-8 searches. It will match only
 whole UTF-8 sequences. */

#define UTF8DOT "([\x01-\x7F\xC0-\xFF][\x80-\xBF]*)"

/* This string is prefixed to a complemented character class to force matches
	against UTF-8 non-US-ASCII characters.  It will match any UTF-8 sequence of
	length at least two, besides all characters expressed by the character
	class. Note that a closing ] and a closing ) must be appended. */

#define UTF8COMP "([\xC0-\xFF][\x80-\xBF]+|[^"

/* This string is used to replace non-word-constituents (\W) in UTF-8
 searches. It will match only whole UTF-8 sequences of non-word-constituent
 characters. */

#define UTF8NONWORD "([\x01-\x1E\x20-\x2F\x3A-\x40\x5B-\x60\x7B-\x7F]|[\xC0-\xFF][\x80-\xBF]+)"

/* In UTF-8 text, the numbering of a parenthesised group may differ from the
   "official" one, due to the usage of parenthesis in UTF8DOT, UT8COMP and
   UTF8NONWORD. This array records for each user-invoked group the
   corresponding (usually larger) regex group. The group may be larger than
   RE_NREGS, in which case there is no way to recover it. */

static int map_group[RE_NREGS];

/* Works exactly like find(), but uses the regex library instead. */

int find_regexp(buffer * const b, const unsigned char *regex, int dir, int skip_first) {

	const unsigned char *p;
	int recompile_string, i, y, start_pos;
	line_desc *ld;

	if (!regex) {
		regex = b->find_string;
		recompile_string = b->find_string_changed || !b->last_was_regexp;
	}
	else recompile_string = TRUE;

	if (!dir) dir = b->opt.search_back ? -1 : 1;

	if (!regex || !strlen(regex)) return ERROR;

	if (re_pb.buffer == NULL) {
		if (re_pb.buffer = malloc(START_BUFFER_SIZE)) re_pb.allocated = START_BUFFER_SIZE;
		else return OUT_OF_MEMORY;
	}

	re_pb.fastmap = (void *)d;

	/* We have to be careful: even if the search string has not changed, it
	is possible that case sensitivity has. In this case, we force recompilation. */

	if (b->opt.case_search) {
		if (re_pb.translate != 0) recompile_string = TRUE;
		re_pb.translate = 0;
	}
	else {
		if (re_pb.translate != (char *)up_case) recompile_string = TRUE;
		re_pb.translate = (char *)up_case;
	}

	if (recompile_string) {
		const unsigned char *actual_regex = regex;

		/* If the buffer encoding is UTF-8, we need to replace dots with UTF8DOT,
			non-word-constituents (\W) with UTF8NONWORD, and embed complemented
			character classes in UTF8COMP, so that they do not match UTF-8
			subsequences. Moreover, we must compute the remapping from the virtual
			to the actual groups caused by the new groups thus introduced. */

		if (b->encoding == ENC_UTF8) {
			const unsigned char *s;
			unsigned char *q;
			int virtual_group = 0, real_group = 0, escape = FALSE, dots = 0, comps = 0, nonwords = 0;

			s = regex;

			/* We first scan regex to compute the exact number of characters of
				the actual (i.e., after substitutions) regex. */

			do {
				if (!escape) {
					if (*s == '.') dots++;
					else if (*s == '[') {
						if (*(s+1) == '^') {
							comps++;
							s++;
						}

						if (*(s+1) == ']') s++; /* A literal ]. */

						/* We scan the list up to ] and check that no non-US-ASCII characters appear. */
						do if (utf8len(*(++s)) != 1) return UTF8_REGEXP_CHARACTER_CLASS_NOT_SUPPORTED; while(*s && *s != ']');
					}
					else if (*s == '\\') {
						escape = TRUE;
						continue;
					}
				}
				else if (*s == 'W') nonwords++;
				escape = FALSE;
			} while(*(++s));

			actual_regex = q = malloc(strlen(regex) + 1 + (strlen(UTF8DOT) - 1) * dots + (strlen(UTF8NONWORD) - 2) * nonwords + (strlen(UTF8COMP) - 1) * comps);
			if (!actual_regex) return OUT_OF_MEMORY;
			s = regex;
			escape = FALSE;

			do {
				if (escape || *s != '.' && *s != '(' && *s != '[' && *s != '\\') {
					if (escape && *s == 'W') {
						q--;
						strcpy(q, UTF8NONWORD);
						q += strlen(UTF8NONWORD);
						real_group++;
					}
					else *(q++) = *s;
				}
				else {
					if (*s == '\\') {
						escape = TRUE;
						*(q++) = '\\';
						continue;
					}

					if (*s == '.') {
						strcpy(q, UTF8DOT);
						q += strlen(UTF8DOT);
						real_group++;
					}
					else if (*s == '(') {
						*(q++) = '(';
						if (virtual_group < RE_NREGS - 1) map_group[++virtual_group] = ++real_group;
					}
					else if (*s == '[') {
						if (*(s+1) == '^') {
							strcpy(q, UTF8COMP);
							q += strlen(UTF8COMP);
							s++;
							if (*(s+1) == ']') *(q++) = *(++s); /* A literal ]. */
							do	*(q++) = *(++s); while (*s && *s != ']');
							if (*s) *(q++) = ')';
							real_group++;
						}
						else {
							*(q++) = '[';
							if (*(s+1) == ']') *(q++) = *(++s); /* A literal ]. */
							do	*(q++) = *(++s); while (*s && *s != ']');
						}
					}
				}

				escape = FALSE;
			} while(*(s++));
			
			/* This assert may be false if a [ is not closed. */
			assert(strlen(actual_regex) == strlen(regex) + (strlen(UTF8DOT) - 1) * dots + (strlen(UTF8NONWORD) - 2) * nonwords + (strlen(UTF8COMP) - 1) * comps);
		}

		p = re_compile_pattern(actual_regex, strlen(actual_regex), &re_pb);

		if (b->encoding == ENC_UTF8) free((void*)actual_regex);

		if (p) {
			/* Here we have a very dirty hack: since we cannot return the error of
				regex, we print it here. Which means that we access term.c's
				functions. 8^( */
			print_message(p);
			ring_bell();
			return ERROR;
		}

	}

	b->find_string_changed = 0;

	ld = b->cur_line_desc;
	y = b->cur_line;

	if (dir > 0) {

		start_pos = b->cur_pos + (skip_first ? 1 : 0);

		while(y < b->num_lines) {

			assert(ld->ld_node.next != NULL);

			if (start_pos <= ld->line_len &&
				 (i = re_search(&re_pb, ld->line ? ld->line : (unsigned char *)"", ld->line_len, start_pos, ld->line_len - start_pos, &re_reg))>=0) {
				goto_line(b, y);
				goto_pos(b, i);
				return OK;
			}

			ld = (line_desc *)ld->ld_node.next;
			start_pos = 0;
			y++;
		}
	}
	else {

		start_pos = b->cur_pos + (skip_first ? -1 : 0);

		while(y >= 0) {

			assert(ld->ld_node.prev != NULL);

			if (start_pos >= 0 &&
				 (i =re_search(&re_pb, ld->line ? ld->line : (unsigned char *)"", ld->line_len, start_pos, -start_pos - 1, &re_reg))>=0) {
				goto_line(b, y);
				goto_pos(b, i);
				return OK;
			}

			ld = (line_desc *)ld->ld_node.prev;
			if (ld->ld_node.prev) start_pos = ld->line_len;
			y--;
		}
	}

	return NOT_FOUND;
}



/* Replaces a regular expression. The given string can contain \0, \1 etc. for
   the pattern matched by the i-th pair of brackets (\0 is the whole
   string). */

int replace_regexp(buffer * const b, const char * const string) {

	int i, pos, len, c, reg_used = FALSE;
	unsigned char *p, *q, *t = NULL;

	assert(string != NULL);

	if (q = p = str_dup(string)) {

		len = strlen(p);

		while(TRUE) {
			while(*q && *q != '\\') q++;

			if (!*q) break;

			i = *(q + 1) - '0';

			if (*(q + 1) == '\\') {
				memmove(q, q + 1, strlen(q + 1) + 1);
				q++;
				len--;
			}
			else if (i >= 0 && i < RE_NREGS && re_reg.start[i] >= 0) {
				if (b->encoding == ENC_UTF8) {
					/* In the UTF-8 case, the replacement group index must be
						mapped through map_group to recover the real group. */
					if ((i = map_group[i]) >= RE_NREGS) {
						free(p);
						return GROUP_NOT_AVAILABLE;
					}
				}
				*q++ = 0;
				*q++ = i;
				reg_used = TRUE;
			}
			else {
				free(p);
				return WRONG_CHAR_AFTER_BACKSLASH;
			}
		}

		if (reg_used) {
			if (t = malloc(re_reg.end[0] - re_reg.start[0] + 1)) {
				memcpy(t, b->cur_line_desc->line + re_reg.start[0], re_reg.end[0] - re_reg.start[0]);
				t[re_reg.end[0] - re_reg.start[0]] = 0;
			}
			else {
				free(p);
				return OUT_OF_MEMORY;
			}
		}

		for(i = RE_NREGS - 1; i >= 0; i--) {
			re_reg.end[i] -= re_reg.start[0];
			re_reg.start[i] -= re_reg.start[0];
		}

		start_undo_chain(b);

		delete_stream(b, b->cur_line_desc, b->cur_line, b->cur_pos, re_reg.end[0]);

		q = p;
		pos = 0;

		while(TRUE) {
			if (strlen(q)) {
				insert_stream(b, b->cur_line_desc, b->cur_line, b->cur_pos + pos, q, strlen(q));
				pos += strlen(q);
			}

			q += strlen(q) + 1;

			if (q - p > len) break;

			if (re_reg.end[*q] - re_reg.start[*q]) {

				c = t[re_reg.end[*q]];
				t[re_reg.end[*q]] = 0;

				insert_stream(b, b->cur_line_desc, b->cur_line, b->cur_pos + pos, t + re_reg.start[*q], re_reg.end[*q] - re_reg.start[*q]);

				t[re_reg.end[*q]] = c;

				pos += re_reg.end[*q] - re_reg.start[*q];
			}

			q++;
		}

		end_undo_chain(b);

		goto_pos(b, b->cur_pos + pos);
		if (pos == 0 && re_reg.end[0] == 0) char_right(b);

		free(t);
		free(p);
	}
	else return OUT_OF_MEMORY;

	return OK;
}
