/* Copyright (C) 2000-2002 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

#include "udm_config.h"
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <string.h>
#include <ctype.h>
#ifdef   HAVE_UNISTD_H
#include <unistd.h>
#endif

#ifdef HAVE_WINSOCK_H
#include <winsock.h>
#endif
#ifdef HAVE_SYS_SOCKET_H
#include <sys/socket.h>
#endif
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#ifdef HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif


#include "udm_common.h"
#include "udm_utils.h"
#include "udm_unicode.h"
#include "udm_unidata.h"
#include "udm_spell.h"
#include "udm_xmalloc.h"
#include "udm_word.h"
#include "udm_synonym.h"
#include "udm_hash.h"
#include "udm_vars.h"

#define MAXNORMLEN 56
#define MAX_NORM 512
#define ERRSTRSIZE 100

/*
 * Accumulator handed to UdmFindWord(): matching dictionary entries are
 * appended to cur[] and the array is kept NULL-terminated.
 */
typedef struct {
  UDM_SPELL **cur;	/* result array; the callers in this file allocate MAX_NORM slots */
  size_t nspell;	/* number of entries currently stored in cur[] */
} UDM_PSPELL;


/*#define DEBUG_UNIREG*/

/* Unicode regex lite BEGIN */

/*
 * strtok-like splitter for the "regex lite" pattern syntax.
 *
 * s    - pattern to start tokenizing, or NULL to continue from *last.
 * last - in/out cursor; on success it is set to the first code point
 *        after the returned token.
 *
 * Returns a pointer to the start of the next token, or NULL when the
 * pattern is exhausted (in which case *last is left untouched).
 * A token is one of: a "[...]" class (closing bracket included when
 * present), a single '^' or '$', or a run of ordinary code points.
 */
static const int * UdmUniRegTok(const int * s,const int **  last){
	const int *end;

	if(s == NULL){
		s = *last;
		if(s == NULL)
			return NULL;
	}

	if(*s == 0)
		return NULL;

	end = s + 1;
	if(*s == '['){
		/* Character class: consume up to and including the closing ']' */
		while(*end != 0 && *end != ']')
			end++;
		if(*end == ']')
			end++;
	}else if(*s != '$' && *s != '^'){
		/* Literal run: stops in front of any special code point */
		while(*end != 0 && *end != ']' && *end != '[' && *end != '^' && *end != '$')
			end++;
	}
	/* '^' and '$' are single-code-point tokens: end is already s+1 */

	*last = end;
	return s;
}

/*
 * Compile a pattern into reg: the pattern is split with UdmUniRegTok()
 * and every token is stored as its own zero-terminated copy.
 *
 * reg     - expression to fill; any previous content is overwritten,
 *           not released.
 * pattern - zero-terminated pattern; NULL yields an empty expression.
 *
 * Returns 0 on success, 1 when an allocation fails. On failure reg only
 * references fully built tokens, so it can be released with
 * UdmUniRegFree().
 * NOTE(review): assumes UdmMalloc/UdmRealloc report failure by
 * returning NULL like malloc/realloc - confirm.
 */
static int UdmUniRegComp(UDM_UNIREG_EXP *reg, const int *pattern){
	const int *tok;
	const int *lt = NULL;	/* tokenizer cursor; NULL keeps a NULL pattern harmless */

	reg->ntokens=0;
	reg->Token=NULL;

	for(tok=UdmUniRegTok(pattern,&lt); tok; tok=UdmUniRegTok(NULL,&lt)){
		size_t len=(size_t)(lt-tok);
		UDM_UNIREG_TOK *grown;
		int *copy;

		/* Grow through a temporary so the old array survives a failure */
		grown=(UDM_UNIREG_TOK*)UdmRealloc(reg->Token,sizeof(*reg->Token)*(reg->ntokens+1));
		if(grown==NULL)
			return 1;
		reg->Token=grown;

		copy=(int*)UdmMalloc((len+1)*sizeof(int));
		if(copy==NULL)
			return 1;
		memcpy(copy,tok,len*sizeof(int));
		copy[len]=0;

		/* Publish the token only once it is complete */
		reg->Token[reg->ntokens].str=copy;
		reg->ntokens++;
	}
	return 0;
}


/*
 * Run a compiled expression against a zero-terminated code point string.
 *
 * Every start position of the string is tried in turn; at each position
 * the tokens are applied left to right:
 *   '^'      matches only at the very beginning of the string,
 *   '$'      matches only at the terminating zero,
 *   "[...]"  matches one code point from the set ('^' inside the
 *            brackets turns the set into an exclusion list),
 *   other    literal run, where '.' stands for any code point.
 *
 * Returns 1 if some start position satisfies all tokens, 0 otherwise
 * (an empty string or an expression without tokens never matches).
 */
static int UdmUniRegExec(const UDM_UNIREG_EXP *reg, const int *string){
	const int * start=string;
	int match=0;
#ifdef DEBUG_UNIREG
	UDM_CHARSET *k = UdmGetCharSet("koi8-r");
	UDM_CHARSET *sy = UdmGetCharSet("sys-int");
	UDM_CONV fromuni;
	char sstr[1024];
	char rstr[1024];
	
	UdmConvInit(&fromuni,sy,k,0);
#endif
	
	for(start=string;*start;start++){
		const int * tstart=start;
		size_t i;
		
		for(i=0;i<reg->ntokens;i++){
			const int * s;
			int inc=UDM_UNIREG_INC;
#ifdef DEBUG_UNIREG

			UdmConv(&fromuni, sstr, 1024, (char*)tstart, 1024);
			UdmConv(&fromuni, rstr, 1024, (char*)reg->Token[i].str, 1024);
			printf("t:%d tstart='%s'\ttok='%s'\t", (int)i, sstr, rstr);
#endif
			switch(reg->Token[i].str[0]){
				case '^':
					/* Anchor: only the first position of the string qualifies */
					match=(string==tstart)?1:0;
					break;
				case '[':
					match=0;
					if(*tstart==0){
						/*
						 * A class has to consume one code point. At the
						 * terminator there is nothing to consume; advancing
						 * anyway would let the following tokens read past
						 * the end of the string.
						 */
						break;
					}
					for(s=reg->Token[i].str+1;*s;s++){
						if(*s==']'){
							/* closing bracket carries no information */
						}else
						if(*s=='^'){
							/* exclusion list: matches unless a member is hit */
							inc=UDM_UNIREG_EXC;
							match=1;
						}else{
							if((*tstart==*s)&&(inc==UDM_UNIREG_EXC)){
								match=0;
								break;
							}
							if((*tstart==*s)&&(inc==UDM_UNIREG_INC)){
								match=1;
								break;
							}
						}
					}
					tstart++;
					break;
				case '$':
					/* Anchor: only the end of the string qualifies */
					match=(*tstart==0)?1:0;
					break;
				default:
					match=1;
					for(s=reg->Token[i].str;(*s)&&(*tstart);s++,tstart++){
						if(*s=='.'){
							/* Any char */
						}else
						if((*s)!=(*tstart)){
							match=0;
							break;
						}
					}
					/* String ended before the literal run did */
					if((*s)&&(!*tstart))match=0;
					break;
			}
#ifdef DEBUG_UNIREG
			printf("match=%d\n",match);
#endif
			if(!match)break;
		}
		if(match)break;
	}

#ifdef DEBUG_UNIREG
	printf("return match=%d\n",match);
#endif
	return match;

}


/*
 * Release every token string of a compiled expression and the token
 * array itself. The expression is left empty (no tokens, NULL array),
 * so calling this function again on the same object is harmless.
 */
static void UdmUniRegFree(UDM_UNIREG_EXP *reg){
	size_t i;
	
	for(i=0;i<reg->ntokens;i++){
		if(reg->Token[i].str){
			UDM_FREE(reg->Token[i].str);
		}
	}

	UDM_FREE(reg->Token);
	/* Reset explicitly: a stale count would make a later call walk a released array */
	reg->Token=NULL;
	reg->ntokens=0;
}


/* Unicode regex lite END */


/*
 * Compile the mask of every loaded affix up front. An affix whose mask
 * compiled successfully gets its "compile" flag cleared, which tells the
 * lookup code that the expression is ready to use.
 */
void UdmUniRegCompileAll(UDM_ENV *Conf) {
	size_t idx = 0;

	while (idx < Conf->Affixes.naffixes) {
		int failed = UdmUniRegComp(&Conf->Affixes.Affix[idx].reg,
					   Conf->Affixes.Affix[idx].mask);

		if (failed == 0)
			Conf->Affixes.Affix[idx].compile = 0;
		idx++;
	}
}

/*
 * Compare two dictionary words after reducing the first code point of
 * each to its low 8 bits (the same 8-bit value is used as the bucket
 * index of the per-language lookup tables in this file).
 *
 * Returns whatever UdmUniStrCmp() yields for the adjusted copies.
 */
static int cmpspellword(int *w1, const int *w2) {
    int lhs[BUFSIZ];
    int rhs[BUFSIZ];
    int order;
#ifdef USE_PARANOIA
    void *paran = UdmViolationEnter();
#endif

    /* Work on copies: the inputs must stay untouched */
    UdmUniStrCpy(lhs, w1);
    lhs[0] &= 255;
    UdmUniStrCpy(rhs, w2);
    rhs[0] &= 255;

    order = UdmUniStrCmp(lhs, rhs);
#ifdef USE_PARANOIA
    UdmViolationExit(paran);
#endif
    return order;
}

static int cmpspell(const void *s1,const void *s2){
  int lc;
  lc = strcmp(((const UDM_SPELL*)s1)->lang,((const UDM_SPELL*)s2)->lang);
  if (lc == 0) {
    lc = cmpspellword(((const UDM_SPELL*)s1)->word, ((const UDM_SPELL*)s2)->word );
  }
  return lc;
}

static int cmpaffix(const void *s1,const void *s2){
  int lc;
  int res;
  int u1[BUFSIZ], u2[BUFSIZ];
#ifdef USE_PARANOIA
  void *paran = UdmViolationEnter();
#endif
  if (((const UDM_AFFIX*)s1)->type < ((const UDM_AFFIX*)s2)->type) {
#ifdef USE_PARANOIA
    UdmViolationExit(paran);
#endif
    return -1;
  }
  if (((const UDM_AFFIX*)s1)->type > ((const UDM_AFFIX*)s2)->type) {
#ifdef USE_PARANOIA
    UdmViolationExit(paran);
#endif
    return 1;
  }
  lc = strcmp(((const UDM_AFFIX*)s1)->lang,((const UDM_AFFIX*)s2)->lang);
  if (lc == 0) {
    if ( (((const UDM_AFFIX*)s1)->replen == 0) && (((const UDM_AFFIX*)s2)->replen == 0) ) {
#ifdef USE_PARANOIA
      UdmViolationExit(paran);
#endif
      return 0;
    }
    if (((const UDM_AFFIX*)s1)->replen == 0) {
#ifdef USE_PARANOIA
      UdmViolationExit(paran);
#endif
      return -1;
    }
    if (((const UDM_AFFIX*)s2)->replen == 0) {
#ifdef USE_PARANOIA
      UdmViolationExit(paran);
#endif
      return 1;
    }
    
    UdmUniStrCpy(u1,((const UDM_AFFIX*)s1)->repl); 
    UdmUniStrCpy(u2,((const UDM_AFFIX*)s2)->repl); 
    if (((const UDM_AFFIX*)s1)->type == 'p') {
      *u1 &= 255; *u2 &= 255;
      res = UdmUniStrCmp(u1, u2);
#ifdef USE_PARANOIA
      UdmViolationExit(paran);
#endif
      return res;
    } else {
      u1[((const UDM_AFFIX*)s1)->replen - 1] &= 255; u2[((const UDM_AFFIX*)s2)->replen -1] &= 255;
      res = UdmUniStrBCmp(u1, u2);
#ifdef USE_PARANOIA
      UdmViolationExit(paran);
#endif
      return res;
    }
  }
  return lc;
}

/*
 * Append one word to the spell list.
 *
 * List - list to extend; its storage grows in steps of 20*1024 entries.
 * word - zero-terminated word; a private copy is stored.
 * flag - affix flags of the word, at most 10 characters are kept.
 * lang - language tag, at most 32 characters are kept.
 *
 * Returns 0 on success, 1 if the word could not be duplicated (the list
 * is left unchanged in that case).
 */
int UdmSpellAdd(UDM_SPELLLIST *List,const int * word,const char *flag,const char *lang){
	UDM_SPELL *entry;
	int *copy;

	/* Duplicate first: a NULL word in the list would crash the comparators later */
	copy = UdmUniDup(word);
	if(copy == NULL)
		return(1);

	if(List->nspell>=List->mspell){
		List->mspell+=1024*20;
		List->Spell=(UDM_SPELL *)UdmXrealloc(List->Spell,List->mspell*sizeof(UDM_SPELL));
	}

	entry = &List->Spell[List->nspell];
	entry->word = copy;
	/* strncpy() does not terminate on truncation, so terminate by hand */
	strncpy(entry->flag,flag,10);
	entry->flag[10] = '\0';
	strncpy(entry->lang,lang,32);
	entry->lang[32] = '\0';

	List->nspell++;
	return(0);
}


/*
 * Load an ispell-style dictionary file ("word" or "word/FLAGS" per line)
 * into Conf->Spells.
 *
 * Conf          - environment whose spell list receives the words.
 * lang          - language tag stored with every word.
 * charset       - name of the character set the file is written in.
 * filename      - path of the dictionary file.
 * skip_noflag   - when non-zero, lines without a "/FLAGS" part are skipped.
 * first_letters - when non-empty, only words whose first byte (in the
 *                 dictionary charset) occurs in this string are loaded;
 *                 NULL is treated like an empty string.
 *
 * Words are lower-cased before they are stored. Only ASCII letters are
 * accepted as flags; the flag list ends at the first other character.
 *
 * Returns 0 on success, 1 on allocation failure, unknown charset or if
 * the file cannot be opened. All temporary buffers and the file are
 * released on every path.
 */
__C_LINK int __UDMCALL UdmImportDictionary(UDM_ENV * Conf, const char *lang, const char *charset,
				   const char *filename, int skip_noflag, const char *first_letters){
	enum { LINE_BYTES = 1024, LOCAL_BYTES = 2048, UNI_BYTES = 8192 };
	char *str = NULL;	/* raw line as read from the file */
	char *lstr = NULL;	/* lower-cased word converted back to the file charset */
	int *ustr = NULL;	/* word as code points */
	FILE *dict = NULL;
	UDM_CHARSET *sys_int;
	UDM_CHARSET *dict_charset;
	UDM_CONV touni;
	UDM_CONV fromuni;
	int rc = 1;

	if ((str = (char*) UdmMalloc(LINE_BYTES)) == NULL) goto done;
	if ((lstr = (char*) UdmMalloc(LOCAL_BYTES)) == NULL) goto done;
	if ((ustr = (int*) UdmMalloc(UNI_BYTES)) == NULL) goto done;

	dict_charset = UdmGetCharSet(charset);
	sys_int = UdmGetCharSet("sys-int");
	if ((dict_charset == NULL) || (sys_int == NULL)) goto done;

	UdmConvInit(&touni,dict_charset,sys_int,0);
	UdmConvInit(&fromuni,sys_int,dict_charset,0);

	if(!(dict=fopen(filename,"r"))) goto done;

	while(fgets(str, LINE_BYTES, dict)){
		char *s;
		const char *flag;
		int res;

		/* Cut the line at the first CR or LF */
		str[strcspn(str, "\r\n")] = '\0';

		if((s=strchr(str,'/'))){
			/* Split "word/FLAGS"; flags stop at the first non-letter */
			*s=0;
			s++;
			flag=s;
			while(((*s>='A')&&(*s<='Z'))||((*s>='a')&&(*s<='z')))
				s++;
			*s=0;
		}else{
			if(skip_noflag)	continue;
			flag="";
		}

		/*
		 * Convert just the word and its terminator. Passing the whole
		 * buffer would feed stale or uninitialized bytes to the converter.
		 */
		res = UdmConv(&touni, (char*)ustr, UNI_BYTES, str, strlen(str) + 1);
		UdmUniStrToLower(ustr);

		/* Dont load words if first letter is not required */
		/* It allows to optimize loading at  search time   */
		if(first_letters != NULL && *first_letters) {
			UdmConv(&fromuni, lstr, LOCAL_BYTES, ((const char*)ustr),(size_t)res);
			if(!strchr(first_letters,lstr[0]))
				continue;
		}
		UdmSpellAdd(&Conf->Spells,ustr,flag,lang);
	}
	rc = 0;

done:
	if(dict != NULL)
		fclose(dict);
	if(str != NULL){ UDM_FREE(str); }
	if(lstr != NULL){ UDM_FREE(lstr); }
	if(ustr != NULL){ UDM_FREE(ustr); }
	return(rc);
}




/*
 * Look a word up in the sorted spell list of every language.
 *
 * Indexer   - agent whose configuration holds the spell list.
 * word      - zero-terminated word to find.
 * affixflag - 0 to accept any entry, otherwise the entry must carry this
 *             flag character.
 * PS        - accumulator; a hit is appended to PS->cur (kept
 *             NULL-terminated) as long as fewer than MAX_NORM-1 entries
 *             are stored.
 *
 * For each language the range of entries sharing the word's first
 * code point (low 8 bits) is narrowed from both ends; in every round
 * the middle, the left and the right entry are probed, in this order.
 * The search of a language stops at the first accepted entry.
 *
 * Returns PS->cur.
 */
static UDM_SPELL ** UdmFindWord(UDM_AGENT * Indexer, const int *word, int affixflag, UDM_PSPELL *PS) {
  UDM_SPELLLIST *SpellList = &Indexer->Conf->Spells;
  int nlang = -1; /*Indexer->spellang FIXME: if Language limit issued in query */
  int li, li_from, li_to, bucket;

  if (nlang == -1) {
    li_from = 0;
    li_to = SpellList->nLang;
  } else {
    li_from = nlang;
    li_to = li_from + 1;
  }

  if (!Indexer->Conf->Spells.nspell)
    return PS->cur;

  bucket = (int)(*word) & 255;

  for (li = li_from; li < li_to; li++) {
    int lo = SpellList->SpellTree[li].Left[bucket];
    int hi = SpellList->SpellTree[li].Right[bucket];

    if (lo == -1)
      continue; /* this language has no word in the bucket */

    while (lo <= hi) {
      int mid = (lo + hi) >> 1;
      int probe[3];
      int mid_cmp = 0;
      int hit = 0;
      int k;

      probe[0] = mid;
      probe[1] = lo;
      probe[2] = hi;

      for (k = 0; k < 3; k++) {
	UDM_SPELL *cand = &SpellList->Spell[probe[k]];
	int cmp = cmpspellword(cand->word, word);

	if (k == 0)
	  mid_cmp = cmp; /* only the middle probe steers the narrowing */

	if (cmp == 0 &&
	    (affixflag == 0 || strchr(cand->flag, affixflag) != NULL)) {
	  if (PS->nspell < MAX_NORM - 1) {
	    PS->cur[PS->nspell] = cand;
	    PS->nspell++;
	    PS->cur[PS->nspell] = NULL;
	  }
	  hit = 1;
	  break;
	}
      }
      if (hit)
	break;

      if (mid_cmp < 0) {
	lo = mid + 1;
	hi--;
      } else if (mid_cmp > 0) {
	hi = mid - 1;
	lo++;
      } else {
	/* same word but the required flag is missing: shrink both ends */
	lo++;
	hi--;
      }
    }
  }

  return PS->cur;
}


/*
 * Append one affix rule to the list.
 *
 * List - list to extend; its storage grows in steps of 16 entries.
 * flag - flag character the rule belongs to.
 * lang - language tag, at most 32 characters are kept.
 * mask - condition pattern, at most 40 code points are copied.
 * find - text of the base form, at most 15 code points are copied.
 * repl - text of the derived form, at most 15 code points are copied.
 * type - 'p' for prefixes; the callers in this file pass 's' for suffixes.
 *
 * The stored lengths describe the stored (possibly truncated) copies,
 * never more than what was actually copied. The rule is marked as
 * "still to be compiled". Always returns 0.
 */
int UdmAffixAdd(UDM_AFFIXLIST *List,int flag, const char * lang, const int *mask,const int *find, const int *repl, int type) {
	enum { MASK_MAX = 40, PART_MAX = 15 };
	UDM_AFFIX *affix;
	size_t n;

	if(List->naffixes>=List->maffixes){
		List->maffixes+=16;
		List->Affix = UdmXrealloc(List->Affix,List->maffixes*sizeof(UDM_AFFIX));
	}
	affix = &List->Affix[List->naffixes];

	affix->compile = 1;
	affix->flag=flag;
	affix->type=type;
	strncpy(affix->lang, lang, 32);
	affix->lang[32] = 0;
	
	/*
	 * NOTE(review): whether UdmUniStrNCpy() terminates a truncated copy
	 * is not visible here - confirm that over-long inputs end up
	 * zero-terminated in the fixed-size members.
	 */
	UdmUniStrNCpy(affix->mask, mask, MASK_MAX);
	UdmUniStrNCpy(affix->find, find, PART_MAX);
	UdmUniStrNCpy(affix->repl, repl, PART_MAX);
	
	/* Clamp: lengths taken from the inputs may exceed what was copied */
	n = UdmUniLen(repl);
	affix->replen = (n > PART_MAX) ? PART_MAX : n;
	n = UdmUniLen(find);
	affix->findlen = (n > PART_MAX) ? PART_MAX : n;

	List->naffixes++;
	return(0);
}

/*
 * Copy src to dist, dropping every blank, tab and '-' on the way.
 * dist must be able to hold strlen(src)+1 bytes. Returns dist.
 */
static char * remove_spaces(char *dist,char *src){
	char *out = dist;
	const char *in;

	for(in = src; *in != '\0'; in++){
		switch(*in){
			case ' ':
			case '-':
			case '\t':
				/* filtered out */
				break;
			default:
				*out++ = *in;
				break;
		}
	}
	*out = '\0';
	return(dist);
}


/*
 * Load an ispell-style affix file into Conf->Affixes.
 *
 * Conf     - environment receiving the rules; its "IspellUsePrefixes"
 *            variable (default "no") decides whether prefix rules are
 *            loaded at all.
 * lang     - language tag stored with every rule.
 * charset  - name of the character set the file is written in.
 * filename - path of the affix file.
 *
 * Recognized lines: "suffixes" / "prefixes" switch the section,
 * "flag X" (optionally "flag *X") selects the current flag, and rule
 * lines have the form "mask > find,repl" or "mask > repl". Everything
 * after '#' is a comment. Blanks, tabs and '-' are removed from the
 * three rule parts, which are then lower-cased. The mask is anchored
 * with '$' (suffix) or '^' (prefix) before it is stored.
 *
 * Returns 0 on success, 1 if the file cannot be opened or a charset is
 * unknown. The file is closed on every path.
 */
__C_LINK int __UDMCALL UdmImportAffixes(UDM_ENV * Conf,const char *lang, const char*charset, 
				const char *filename) {
	char str[BUFSIZ];
	char flag=0;
	char mask[8*BUFSIZ]="";
	char find[8*BUFSIZ]="";
	char repl[8*BUFSIZ]="";
	char *s;
	int i;
	int suffixes=0;
	int prefixes=0;
	int IspellUsePrefixes;
	int rc = 1;
	FILE *affix = NULL;
	UDM_CHARSET *affix_charset = NULL;
	int u_mask[BUFSIZ];
	int ufind[BUFSIZ];
	int urepl[BUFSIZ];
	size_t len;
	UDM_CHARSET *sys_int;
	UDM_CONV touni;
#ifdef DEBUG_UNIREG
	UDM_CONV fromuni;
#endif
#ifdef USE_PARANOIA
	void *paran = UdmViolationEnter();
#endif

	if(!(affix=fopen(filename,"r")))
		goto done;

	affix_charset = UdmGetCharSet(charset);
	if (affix_charset == NULL)
		goto done;
	sys_int = UdmGetCharSet("sys-int");
	if (sys_int == NULL)
		goto done;

	UdmConvInit(&touni,affix_charset,sys_int,0);
#ifdef DEBUG_UNIREG
	UdmConvInit(&fromuni,sys_int,affix_charset,0);
#endif

	IspellUsePrefixes = strcasecmp(UdmVarListFindStr(&Conf->Vars,"IspellUsePrefixes","no"),"no");

	while(fgets(str,sizeof(str),affix)){
		if(!strncasecmp(str,"suffixes",8)){
			suffixes=1;
			prefixes=0;
			continue;
		}
		if(!strncasecmp(str,"prefixes",8)){
			suffixes=0;
			prefixes=1;
			continue;
		}
		if(!strncasecmp(str,"flag ",5)){
			s=str+5;
			/*
			 * strchr() also finds the terminator of its search set, so
			 * the end of the line has to be tested explicitly.
			 */
			while(*s && strchr("* ",*s))s++;
			flag=*s;
			continue;
		}
		if((!suffixes)&&(!prefixes))continue;
		if((prefixes)&&(!IspellUsePrefixes)) continue;

		if((s=strchr(str,'#')))*s=0;
		if(!*str)continue;

		strcpy(mask,"");
		strcpy(find,"");
		strcpy(repl,"");

		i=sscanf(str,"%[^>\n]>%[^,\n],%[^\n]",mask,find,repl);

		/* str is no longer needed and serves as scratch buffer from here on */
		remove_spaces(str,repl);strcpy(repl,str);
		remove_spaces(str,find);strcpy(find,str);
		remove_spaces(str,mask);strcpy(mask,str);

		switch(i){
		case 3:break;
		case 2:
			/* "mask > repl": the single part is the replacement */
			if(*find != '\0'){
				strcpy(repl,find);
				strcpy(find,"");
			}
			break;
		default:
			continue;
		}

		len=UdmConv(&touni,(char*)urepl,sizeof(urepl),repl,strlen(repl)+1);
		UdmUniStrToLower(urepl);
#ifdef DEBUG_UNIREG
		UdmConv(&fromuni,repl,sizeof(repl),(char *)urepl,len);
#endif

		len=UdmConv(&touni,(char*)ufind,sizeof(ufind),find,strlen(find)+1);
		UdmUniStrToLower(ufind);
#ifdef DEBUG_UNIREG
		UdmConv(&fromuni,find,sizeof(find),(char*)ufind,len);
#endif

		/*
		 * The mask was cut out of str in front of a '>', so it is at
		 * least one byte shorter than the line and the anchor fits.
		 */
		if (suffixes) {
			sprintf(str, "%s$", mask);
		} else {
			sprintf(str, "^%s", mask);
		}

		len = UdmConv(&touni, (char*)u_mask, sizeof(u_mask), str, strlen(str) + 1);
		UdmUniStrToLower(u_mask);
#ifdef DEBUG_UNIREG
		UdmConv(&fromuni,mask,sizeof(mask),(char*)u_mask,len);
#endif
		(void)len;	/* only consumed by the debug conversions */

		UdmAffixAdd(&Conf->Affixes,(int)flag,lang,u_mask,ufind,urepl,suffixes?'s':'p');
	}
	rc = 0;

done:
	if(affix != NULL)
		fclose(affix);
#ifdef USE_PARANOIA
	UdmViolationExit(paran);
#endif
	return rc;
}

/*
 * Sort the spell list (by language, then word) and build the lookup
 * index: for every language - identified by the first two characters of
 * the language tag - Left[c]/Right[c] receive the first and last list
 * position of the words whose first code point has the low byte c, or
 * -1 when there is no such word.
 *
 * New languages are appended at List->nLang, which is advanced.
 */
__C_LINK void __UDMCALL UdmSortDictionary(UDM_SPELLLIST * List){
  int j, CurLet = -1, Let;
  size_t i;
  char *CurLang = NULL;

  UdmSort((void*)List->Spell,List->nspell,sizeof(UDM_SPELL),cmpspell);

  for(i = 0; i < List->nspell; i++) {
    if (CurLang == NULL || strncmp(CurLang, List->Spell[i].lang, 2) != 0) {
      /* First word of a new language: open a fresh index entry */
      CurLang = List->Spell[i].lang;
      strncpy(List->SpellTree[List->nLang].lang, CurLang, 2);
      /* Two characters were copied, so the terminator belongs at index 2 */
      List->SpellTree[List->nLang].lang[2] = 0;
      for(j = 0; j < 256; j++)
	List->SpellTree[List->nLang].Left[j] =
	  List->SpellTree[List->nLang].Right[j] = -1;
      CurLet = -1;	/* force a new Left[] entry for the first word */
      List->nLang++;
    }
    Let = (int)(*(List->Spell[i].word)) & 255;
    if (CurLet != Let) {
      List->SpellTree[List->nLang-1].Left[Let] = (int)i;
      CurLet = Let;
    }
    List->SpellTree[List->nLang-1].Right[Let] = (int)i;
  }
}

/*
 * Sort the affix list and build the per-language prefix and suffix
 * indexes.
 *
 * List - affix list; PrefixTree/SuffixTree are rebuilt for the first
 *        SL->nLang languages.
 * SL   - spell list whose SpellTree defines the language slots.
 *
 * Prefixes are indexed by the low byte of the first code point of their
 * replacement, suffixes by the low byte of the last one (0 for an empty
 * replacement). Affixes of a language that has no dictionary are left
 * out of the indexes.
 *
 * NOTE(review): the language slot is located with strncmp(..., 2) for
 * prefixes but with strcmp() for suffixes - confirm that the difference
 * is intended.
 */
__C_LINK void __UDMCALL UdmSortAffixes(UDM_AFFIXLIST *List, UDM_SPELLLIST *SL) {
  int  CurLetP = -1, CurLetS = -1, Let, cl = -1;
  char *CurLangP = NULL, *CurLangS = NULL;
  UDM_AFFIX *Affix; size_t i, j;

  if (List->naffixes > 1)
    UdmSort((void*)List->Affix,List->naffixes,sizeof(UDM_AFFIX),cmpaffix);

  for(i = 0; i < SL->nLang; i++)
    for(j = 0; j < 256; j++) {
      List->PrefixTree[i].Left[j] = List->PrefixTree[i].Right[j] = -1;
      List->SuffixTree[i].Left[j] = List->SuffixTree[i].Right[j] = -1;
    }

  for(i = 0; i < List->naffixes; i++) {
    Affix = &(((UDM_AFFIX*)List->Affix)[i]);
    if(Affix->type == 'p') {
      if (CurLangP == NULL || strcmp(CurLangP, Affix->lang) != 0) {
	/* Language changed: find its slot among the dictionary languages */
	cl = -1;
	for (j = 0; j < SL->nLang; j++) {
	  if (strncmp(SL->SpellTree[j].lang, Affix->lang, 2) == 0) {
	    cl = j;
	    break;
	  }
	}
	CurLangP = Affix->lang;
	/* Without a slot there is no tree entry to label */
	if (cl >= 0)
	  strcpy(List->PrefixTree[cl].lang, CurLangP);
	CurLetP = -1;
      }
      if (cl < 0) continue; /* we have affixes without spell for this lang */
      Let = (int)(*(Affix->repl)) & 255;
      if (CurLetP != Let) {
	List->PrefixTree[cl].Left[Let] = i;
	CurLetP = Let;
      }
      List->PrefixTree[cl].Right[Let] = i;
    } else {
      if (CurLangS == NULL || strcmp(CurLangS, Affix->lang) != 0) {
	/* Language changed: find its slot among the dictionary languages */
	cl = -1;
	for (j = 0; j < SL->nLang; j++) {
	  if (strcmp(SL->SpellTree[j].lang, Affix->lang) == 0) {
	    cl = j;
	    break;
	  }
	}
	CurLangS = Affix->lang;
	/* Without a slot there is no tree entry to label */
	if (cl >= 0)
	  strcpy(List->SuffixTree[cl].lang, CurLangS);
	CurLetS = -1;
      }
      if (cl < 0) continue; /* we have affixes without spell for this lang */
      Let = (Affix->replen) ? (int)(Affix->repl[Affix->replen-1]) & 255 : 0;
      if (CurLetS != Let) {
	List->SuffixTree[cl].Left[Let] = i;
	CurLetS = Let;
      }
      List->SuffixTree[cl].Right[Let] = i;
    }
  }
}

/*
 * Try to undo one suffix rule on a word.
 *
 * word    - word to normalize.
 * len     - length of word in code points.
 * Affix   - suffix rule to try; its mask is compiled on first use.
 * res     - receives the result of comparing the end of word with the
 *           rule's replacement (0 means the rule applies).
 * Indexer - agent giving access to the dictionary.
 * PS      - accumulator for the dictionary entries that were found.
 *
 * When the word ends with the rule's replacement, the replacement is
 * swapped for the rule's "find" text; if the result satisfies the rule's
 * mask, it is looked up in the dictionary (restricted to entries that
 * carry the rule's flag).
 */
static void CheckSuffix(const int *word, size_t len, UDM_AFFIX *Affix, int *res, UDM_AGENT *Indexer, UDM_PSPELL *PS) {
  int candidate[2*MAXNORMLEN] = {0};
/*3.1  int curlang, curspellang;*/
#ifdef USE_PARANOIA
  void *paran = UdmViolationEnter();
#endif

  *res = UdmUniStrBNCmp(word, Affix->repl, Affix->replen);

  if (*res == 0) {
    int usable = 1;

    /* Rebuild the presumed base form: stem + "find" */
    UdmUniStrCpy(candidate, word);
    UdmUniStrCpy(candidate+len-Affix->replen, Affix->find);

    if (Affix->compile) {
      /* Lazy compilation of the rule's mask */
      if (UdmUniRegComp(&(Affix->reg), Affix->mask)) {
	UdmUniRegFree(&(Affix->reg));
	usable = 0;
      } else {
	Affix->compile = 0;
      }
    }

    if (usable && UdmUniRegExec(&(Affix->reg), candidate)) {
      /* Hits are collected in PS; the return value carries nothing new */
      UdmFindWord(Indexer, candidate, Affix->flag, PS);

/*3.1 FIXME: language statistics collection while normalizing       
      curlang = Indexer->curlang;
      curspellang = Indexer->spellang;
      UdmSelectLang(Indexer, curspell->lang);
      Indexer->lang[Indexer->curlang].count++;
      Indexer->curlang = curlang;
      Indexer->spellang = curspellang;
*/
    }
  }

#ifdef USE_PARANOIA
  UdmViolationExit(paran);
#endif
}


/*
 * Try to undo one prefix rule on a word and, on success, additionally
 * try the suffix rules of one index bucket on the result.
 *
 * word    - word to normalize.
 * Affix   - prefix rule to try; its mask is compiled on first use.
 * Indexer - agent giving access to dictionary and affix tables.
 * li      - language slot whose suffix index is used.
 * pi      - bucket of that suffix index.
 * PS      - accumulator for the dictionary entries that were found.
 *
 * Returns the result of comparing the start of word with the rule's
 * replacement; 0 means the rule was applicable.
 */
static int CheckPrefix(const int *word, UDM_AFFIX *Affix, UDM_AGENT *Indexer, int li, int pi, UDM_PSPELL *PS ) {
  int candidate[2*MAXNORMLEN] = {0};
  int status;
  UDM_AFFIX *all = Indexer->Conf->Affixes.Affix;
#ifdef USE_PARANOIA
  void *paran = UdmViolationEnter();
#endif

  status = UdmUniStrNCaseCmp(word, Affix->repl, Affix->replen);

  if (status == 0) {
    int usable = 1;

    /* Rebuild the presumed base form: "find" + rest of the word */
    UdmUniStrCpy(candidate, Affix->find);
    UdmUniStrCat(candidate, word+Affix->replen);

    if (Affix->compile) {
      /* Lazy compilation of the rule's mask */
      if (UdmUniRegComp(&(Affix->reg),Affix->mask)) {
	UdmUniRegFree(&(Affix->reg));
	usable = 0;
      } else {
	Affix->compile = 0;
      }
    }

    if (usable && UdmUniRegExec(&(Affix->reg),candidate)) {
      size_t newlen;
      int lo, hi;
      int lo_res, hi_res;

      /* Hits are collected in PS */
      UdmFindWord(Indexer, candidate, Affix->flag, PS);

      /* The word may carry a suffix too: walk the bucket from both ends */
      newlen = UdmUniLen(candidate);
      lo = Indexer->Conf->Affixes.SuffixTree[li].Left[pi];
      hi = Indexer->Conf->Affixes.SuffixTree[li].Right[pi];
      for (; lo >= 0 && lo <= hi; lo++, hi--) {
	CheckSuffix(candidate, newlen, &all[lo], &lo_res, Indexer, PS);
	if (hi > lo) {
	  CheckSuffix(candidate, newlen, &all[hi], &hi_res, Indexer, PS);
	}
      }
    }
  }

#ifdef USE_PARANOIA
  UdmViolationExit(paran);
#endif
  return status;
}


/*
 * Find the dictionary entries (normal forms) a word can be derived from.
 *
 * Indexer - agent giving access to dictionary, affix tables and the
 *           word length limits.
 * wword   - word to normalize; only its uword member is read.
 *
 * The word itself is looked up first; then, for every language, the
 * prefix rules of the bucket selected by the first code point and the
 * suffix rules of the buckets 0 and "last code point" are tried.
 *
 * Returns a NULL-terminated array with at most MAX_NORM-1 entries that
 * the caller has to release, or NULL when the word is empty, violates
 * the length limits, or no form was found.
 */
__C_LINK UDM_SPELL ** __UDMCALL UdmNormalizeWord(UDM_AGENT * Indexer, UDM_WIDEWORD *wword){
	size_t len;
	UDM_SPELL **forms;
	UDM_AFFIX * Affix;
	int ri, pi, ipi, lp, rp, cp, ls, rs, nlang = -1 /*Indexer->spellang  FIXME: search form limit by lang  */ ;
	int li, li_from, li_to, lres, rres, cres = 0;
	int *uword = wword->uword;
	UDM_PSPELL PS;
	
	len=UdmUniLen(uword);
	if (len == 0	/* uword[len-1] below needs at least one code point */
		|| len < Indexer->Conf->WordParam.min_word_len 
		|| len > MAXNORMLEN
		|| len > Indexer->Conf->WordParam.max_word_len
		)
		return(NULL);
	
	forms = (UDM_SPELL **) UdmXmalloc(MAX_NORM*sizeof(UDM_SPELL *));
	PS.cur = forms;
	PS.cur[0] = NULL;
	PS.nspell = 0;
	
	ri = (int)(*uword) & 255;		/* prefix bucket: first code point */
	pi = (int)(uword[len-1]) & 255;		/* suffix bucket: last code point */
	if (nlang == -1) {
	  li_from = 0; li_to = Indexer->Conf->Spells.nLang;
	} else {
	  li_from  = nlang;
	  li_to = nlang + 1;
	}
	Affix=(UDM_AFFIX*)Indexer->Conf->Affixes.Affix;
	
	/* Check that the word itself is normal form */
	UdmFindWord(Indexer, uword, 0, &PS);
		
	/* Find all other NORMAL forms of the 'word' */
	
	/* ipi takes the values 0 and pi (only 0 when pi is 0) */
	for (ipi = 0; ipi <= pi; ipi += pi ? pi : 1) {

	  for (li = li_from; li < li_to; li++) {
	    /* check prefix */
	    lp = Indexer->Conf->Affixes.PrefixTree[li].Left[ri];
	    rp = Indexer->Conf->Affixes.PrefixTree[li].Right[ri];
	    while (lp >= 0 && lp <= rp) {
	      cp = (lp + rp) >> 1;
	      cres = 0;
	      /* Nothing more can be stored once the result array is full */
	      if (PS.nspell < (MAX_NORM-1)) {
		cres = CheckPrefix(uword, &Affix[cp], Indexer, li, ipi, &PS);
	      }
	      if ((lp < cp) && (PS.nspell < (MAX_NORM-1)) ) {
		(void)CheckPrefix(uword, &Affix[lp], Indexer, li, ipi, &PS);
	      }
	      if ( (rp > cp) && (PS.nspell < (MAX_NORM-1)) ) {
		(void)CheckPrefix(uword, &Affix[rp], Indexer, li, ipi, &PS);
	      }
	      /* Only the middle probe steers the narrowing of the range */
	      if (cres < 0) {
		rp = cp - 1;
		lp++;
	      } else if (cres > 0) {
		lp = cp + 1;
		rp--;
	      } else {
		lp++;
		rp--;
	      }
	    }

	    /* check suffix */
	    ls = Indexer->Conf->Affixes.SuffixTree[li].Left[ipi];
	    rs = Indexer->Conf->Affixes.SuffixTree[li].Right[ipi];
	    while (ls >= 0 && ls <= rs) {
	      CheckSuffix(uword, len, &Affix[ls], &lres, Indexer, &PS);
	      if ( rs > ls ) {
		CheckSuffix(uword, len, &Affix[rs], &rres, Indexer, &PS);
	      }
	      ls++;
	      rs--;
	    } /* end while */
	  
	  } /* for li */
	} /* for ipi */

	if(PS.nspell == 0) {
		UDM_FREE(forms);
		return NULL;
	}

	return forms;
}



/*
 * Release all words of the spell list and the list storage itself.
 * Counters and capacity are reset as well, so the list can be filled
 * and sorted again afterwards.
 */
void UdmSpellListFree(UDM_SPELLLIST *List){
	size_t i;

	for ( i = 0; i < List->nspell; i++) {
		UDM_FREE(List->Spell[i].word);
	}
	UDM_FREE(List->Spell);
	List->Spell = NULL;
	List->nspell = 0;
	/* A stale capacity would let UdmSpellAdd() write into the released array */
	List->mspell = 0;
	/* UdmSortDictionary() appends languages at nLang */
	List->nLang = 0;
}

/*
 * Release the compiled masks of all affixes and the list storage
 * itself. Counter and capacity are reset, so the list can be filled
 * again afterwards.
 */
void UdmAffixListFree (UDM_AFFIXLIST *List) {
	size_t i;

	for (i = 0; i < List->naffixes; i++) {
		/* compile == 0 marks an affix whose mask has been compiled */
		if (List->Affix[i].compile == 0)
			UdmUniRegFree(&(List->Affix[i].reg));
	}

	UDM_FREE(List->Affix);
	List->Affix = NULL;
	List->naffixes = 0;
	/* A stale capacity would let UdmAffixAdd() write into the released array */
	List->maffixes = 0;
}


/*
 * Generate the derived forms of one dictionary entry and append them to
 * result.
 *
 * Indexer - agent giving access to the affix table and the local charset.
 * word    - dictionary entry (normal form).
 * result  - list receiving the generated forms.
 * order   - value stored in the "order" member of every generated form.
 *
 * An affix rule is applied when the entry carries the rule's flag, both
 * have the same language, the entry satisfies the rule's mask and starts
 * (prefix) or ends (suffix) with the rule's "find" text.
 *
 * Nothing is generated when the local charset or "sys-int" is
 * unavailable. Generation stops early when a mask cannot be compiled or
 * memory runs out. The scratch buffers are released before returning.
 * NOTE(review): this relies on UdmWideWordListAdd() storing its own
 * copies of the word buffers, which the buffer reuse inside the loop
 * already implies - confirm.
 */
static void UdmAllFormsWord (UDM_AGENT *Indexer, UDM_SPELL *word, UDM_WIDEWORDLIST *result, size_t order) {
  UDM_WIDEWORD w;
  size_t i;
  size_t naffixes = Indexer->Conf->Affixes.naffixes;
  UDM_AFFIX *Affix = (UDM_AFFIX *)Indexer->Conf->Affixes.Affix;
  UDM_CHARSET *local_charset;
  UDM_CHARSET *sys_int;
  UDM_CONV fromuni;
  
  local_charset = Indexer->Conf->lcs;
  if (local_charset == NULL) return;
  if (NULL==(sys_int=UdmGetCharSet("sys-int"))) return;
  UdmConvInit(&fromuni,sys_int,local_charset,UDM_RECODE_HTML);

#ifdef DEBUG_UNIREG
  printf("start AllFormsFord\n");
#endif
  
  w.word = NULL;
  w.uword = NULL;

  for (i = 0; i < naffixes; i++) {
    int applies;
    void *grown;

    if ( (word->flag == NULL)
	 || (strchr(word->flag, Affix[i].flag) == NULL)
	 || (strcmp(word->lang, Affix[i].lang) != 0) )
      continue;

    if (Affix[i].compile) {
      /* Lazy compilation of the rule's mask */
      if (UdmUniRegComp(&(Affix[i].reg), Affix[i].mask)) {
	UdmUniRegFree(&(Affix[i].reg));
	break;
      }
      Affix[i].compile = 0;
    }

    if (!UdmUniRegExec(&(Affix[i].reg), word->word))
      continue;

    if (Affix[i].type == 'p')
      applies = (UdmUniStrNCaseCmp(word->word, Affix[i].find, Affix[i].findlen) == 0);
    else
      applies = (UdmUniStrBNCmp(word->word, Affix[i].find, Affix[i].findlen) == 0);
    if (!applies)
      continue;

    w.len = UdmUniLen(word->word) - Affix[i].findlen + Affix[i].replen;

    /* Grow through a temporary so the old buffer is still released on failure */
    grown = UdmRealloc(w.word, 8 * w.len + 1);
    if (grown == NULL) break;
    w.word = grown;
    grown = UdmRealloc(w.uword, (w.len + 1) * sizeof(int));
    if (grown == NULL) break;
    w.uword = grown;

    memset(w.uword, 0, (w.len + 1) * sizeof(int));

    if (Affix[i].type == 'p') {
      /* replacement + word without its "find" head */
      UdmUniStrCpy(w.uword, Affix[i].repl);
      UdmUniStrCat(w.uword, &(word->word[Affix[i].findlen]));
    } else {
      /* word without its "find" tail + replacement */
      UdmUniStrNCpy(w.uword, word->word, UdmUniLen(word->word) - Affix[i].findlen);
      UdmUniStrCat(w.uword, Affix[i].repl);
    }
	  
    UdmConv(&fromuni,w.word,8*w.len+1,(char*)w.uword,sizeof(int)*(w.len+1));
    w.crcword = UdmStrHash32(w.word);
    w.order = order;
    w.count = 0;
    w.origin = UDM_WORD_ORIGIN_SPELL;
    UdmWideWordListAdd(result, &w);
  }

  if (w.word != NULL) { UDM_FREE(w.word); }
  if (w.uword != NULL) { UDM_FREE(w.uword); }
}


__C_LINK UDM_WIDEWORDLIST * __UDMCALL UdmAllForms (UDM_AGENT *Indexer, UDM_WIDEWORD *wword) {
  UDM_SPELL **norm, **cur;
  UDM_WIDEWORDLIST *result, *syn;
  UDM_WIDEWORD w;
  size_t i, j;
  UDM_CHARSET *local_charset;
  UDM_CHARSET *sys_int;
  UDM_CONV fromuni;
  int sy   = UdmVarListFindInt(&Indexer->Conf->Vars, "sy", 1);
  int sp   = UdmVarListFindInt(&Indexer->Conf->Vars, "sp", 1);
  UDM_PSPELL PS;
  
  PS.cur = NULL;
  local_charset = Indexer->Conf->lcs;
  if (local_charset == NULL) return NULL;
  if (NULL==(sys_int=UdmGetCharSet("sys-int"))) return NULL;
  UdmConvInit(&fromuni,sys_int,local_charset,UDM_RECODE_HTML);

  if ((result = UdmXmalloc(sizeof(UDM_WIDEWORDLIST))) == NULL) {
    return NULL;
  }
  w.word = NULL;
  w.uword = NULL;

  if ((PS.cur = (UDM_SPELL **) UdmXmalloc(MAX_NORM*sizeof(UDM_SPELL *))) == NULL) return NULL;
  PS.nspell = 0;
  UdmWideWordListInit(result);
  cur = norm = UdmNormalizeWord(Indexer, wword);

  if (cur != NULL) {
    while (*cur != NULL) {

      w.len = UdmUniLen((*cur)->word);
      if ( ( (w.word = UdmRealloc(w.word, 8 * w.len + 1)) == NULL) ||
	   ( (w.uword = UdmRealloc(w.uword, (w.len + 1) * sizeof(int))) == NULL)) 
	return NULL;
      UdmUniStrCpy(w.uword,(*cur)->word); 
      UdmConv(&fromuni,w.word,8*w.len+1,(char*)w.uword,sizeof(w.uword[0])*(w.len+1));
      w.crcword = UdmStrHash32(w.word);
      w.order = wword->order;
      w.count = 0;
      w.origin = UDM_WORD_ORIGIN_SPELL;
      if (sp) UdmWideWordListAdd(result, &w);

      syn = UdmSynonymListFind(&(Indexer->Conf->Synonyms), &w);

      if ((syn != NULL) && (sy))
	for(i = 0; i < syn->nwords; i++) {
	  UdmWideWordListAdd(result, &(syn->Word[i]));
	}
    
      if (sp) UdmAllFormsWord(Indexer, *cur, result, wword->order);
      if ((syn != NULL) && (sy)) {
	for(i = 0; i < syn->nwords; i++) {
	  PS.nspell = 0;
	  UdmFindWord(Indexer, syn->Word[i].uword, 0, &PS);
	  for (j = 0; PS.cur[j] != NULL; j++) 
	    UdmAllFormsWord(Indexer, PS.cur[j], result, wword->order);
	}
      }
      cur++;
    }
  } else {
    /*UdmWideWordListAdd(result, wword);*/
    syn = UdmSynonymListFind(&(Indexer->Conf->Synonyms), wword);

    if ((syn != NULL) && (sy)) {
      for(i = 0; i < syn->nwords; i++) {
	UdmWideWordListAdd(result, &(syn->Word[i]));
      }
    
      for(i = 0; i < syn->nwords; i++) {
	PS.nspell = 0;
	UdmFindWord(Indexer, syn->Word[i].uword, 0, &PS);
	for (j = 0; PS.cur[j] != NULL; j++) 
	  UdmAllFormsWord(Indexer, PS.cur[j], result, wword->order);
      }
    }
  }
  UDM_FREE(norm);
  UDM_FREE(PS.cur);

  return result;
}
