/* Copyright (C) 2000-2002 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

#include "udm_config.h"

#include <stdlib.h>
#include <fcntl.h>
#include <string.h>
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_IO_H
#include <io.h>
#endif
#include <sys/stat.h>
#include <stdio.h>
#include <errno.h>
#include <math.h>

#include "udm_common.h"
#include "udm_utils.h"
#include "udm_unicode.h"
#include "udm_unidata.h"
#include "udm_uniconv.h"
#include "udm_searchtool.h"
#include "udm_boolean.h"
#include "udm_xmalloc.h"
#include "udm_spell.h"
#include "udm_stopwords.h"
#include "udm_word.h"
#include "udm_vars.h"
#include "udm_db.h"
#include "udm_db_int.h"
#include "udm_url.h"
#include "udm_hash.h"
#include "udm_parsehtml.h"
#include "udm_store.h"
#include "udm_doc.h"
#include "udm_conf.h"
#include "udm_result.h"
#include "udm_log.h"
#include "udm_sgml.h"
#include "udm_mutex.h"
#include "udm_chinese.h"
#include "udm_synonym.h"

#ifdef CHASEN
#include <chasen.h>
#endif

#ifdef MECAB
#include <mecab.h>
#endif

/*
#define DEBUG_CACHE
*/

/*
  Position/order pair consumed by the phrase checkers and by
  CalcAverageWordDistance() further down in this file.
*/
typedef struct {
  size_t position;  /* sort key used by cmpphr(); distances are differences of it */
  size_t order;     /* compared against UDM_STACK_ITEM.arg by the phrase checkers */
} UDM_PHR_DAT;


/********** QSORT functions *******************************/

/*
  Sort callback: entries with a larger coord come first; entries with the
  same coord are ordered by ascending url_id.
*/
static int cmpword(UDM_URL_CRD *s1,UDM_URL_CRD *s2)
{
  int by_coord= (s1->coord < s2->coord) - (s1->coord > s2->coord);

  if (by_coord != 0)
    return by_coord;

  return (s1->url_id > s2->url_id) - (s1->url_id < s2->url_id);
}

/*
  Sort callback: ascending url_id; entries of the same URL are ordered by
  the difference of their UDM_WRDPOS() values.
*/
static int cmpurlid (UDM_URL_CRD *s1, UDM_URL_CRD *s2)
{
  if (s1->url_id > s2->url_id)
    return 1;
  else if (s1->url_id < s2->url_id)
    return -1;
  else
    return UDM_WRDPOS(s1->coord) - UDM_WRDPOS(s2->coord);
}

/*
  Compare the detached entry (C, D) with the entry stored at index j of L.
  Keys, in order of precedence: site_id, coord, pop_rank.
  Returns 1 / -1 / 0 when the detached entry is greater / smaller / equal.
*/
static int cmpsiteid(UDM_URLCRDLIST *L, UDM_URL_CRD *C, UDM_URLDATA *D, long j)
{
  int rc;

  rc= (D->site_id > L->Data[j].site_id) - (D->site_id < L->Data[j].site_id);
  if (rc == 0)
    rc= (C->coord > L->Coords[j].coord) - (C->coord < L->Coords[j].coord);
  if (rc == 0)
    rc= (D->pop_rank > L->Data[j].pop_rank) - (D->pop_rank < L->Data[j].pop_rank);
  return rc;
}

/*
  Compare the detached entry (C, D) with the entry at index j of L using a
  sort pattern. Each pattern character names one key and is tried in turn
  until a key differs:
    R/r  coord            P/p  pop_rank        D/d  last_mod_time
    U/u  url (strcmp)     S/s  section (strcmp)
  For R, P and D the upper-case letter returns 1 when the detached entry is
  greater and the lower-case letter inverts that. For U and S the upper-case
  letter returns the negated strcmp() result and the lower-case letter
  returns it unchanged. Unknown characters are ignored.
  Returns 0 when no key in the pattern distinguishes the entries.
*/
static int cmppattern(UDM_URLCRDLIST *L, UDM_URL_CRD *C, UDM_URLDATA *D,
                      long j, const char *pattern)
{
  const char *key;

  for (key= pattern; *key != '\0'; key++)
  {
    int rc= 0;

    switch (*key)
    {
      case 'R':
      case 'r':
        rc= (C->coord > L->Coords[j].coord) - (C->coord < L->Coords[j].coord);
        if (rc)
          return (*key == 'R') ? rc : -rc;
        break;
      case 'P':
      case 'p':
        rc= (D->pop_rank > L->Data[j].pop_rank) - (D->pop_rank < L->Data[j].pop_rank);
        if (rc)
          return (*key == 'P') ? rc : -rc;
        break;
      case 'D':
      case 'd':
        rc= (D->last_mod_time > L->Data[j].last_mod_time) -
            (D->last_mod_time < L->Data[j].last_mod_time);
        if (rc)
          return (*key == 'D') ? rc : -rc;
        break;
      case 'U':
      case 'u':
        rc= strcmp(UDM_NULL2EMPTY(D->url), UDM_NULL2EMPTY(L->Data[j].url));
        if (rc)
          return (*key == 'U') ? -rc : rc;
        break;
      case 'S':
      case 's':
        rc= strcmp(UDM_NULL2EMPTY(D->section),
                   UDM_NULL2EMPTY(L->Data[j].section));
        if (rc)
          return (*key == 'S') ? -rc : rc;
        break;
    }
  }
  return 0;
}

/* Sort callback: ascending UDM_PHR_DAT.position. */
static int cmpphr(UDM_PHR_DAT *p1, UDM_PHR_DAT *p2)
{
  return (p1->position > p2->position) - (p1->position < p2->position);
}

/****************************************************/

/* Sort num entries of wrd with cmpword(); does nothing when wrd is NULL. */
void UdmSortSearchWordsByWeight(UDM_URL_CRD *wrd,size_t num)
{
  if (wrd == NULL)
    return;

  UdmSort((void*)wrd,num,sizeof(*wrd),(qsort_cmp)cmpword);
}

/* Sort num entries of wrd with cmpurlid(); does nothing when wrd is NULL. */
void UdmSortSearchWordsByURL(UDM_URL_CRD *wrd,size_t num)
{
  if (wrd == NULL)
    return;

  UdmSort((void*)wrd,num,sizeof(*wrd),(qsort_cmp)cmpurlid);
}

/*
  Gap table for the gapped insertion sorts in UdmSortSearchWordsBySite()
  and UdmSortSearchWordsByPattern(). Both start at index 13, step down while
  num/3 is smaller than the gap, and then run one pass per remaining entry
  down to UdmH[0] == 1.
*/
static size_t UdmH[] = {1, 5, 19, 41, 109, 209, 505, 929, 2161,
                        3905, 8929, 16001, 36289, 64769};

/*
  Sort the first num entries of L (Coords[] and Data[] are moved together)
  with a gapped insertion sort driven by the UdmH[] gap table.
  Ordering is decided by cmpsiteid(): an entry is moved towards the front
  while it compares greater than the entry one gap before it.
*/
void UdmSortSearchWordsBySite(UDM_URLCRDLIST *L, size_t num)
{
  int gap_idx;
  UDM_URL_CRD held_crd;
  UDM_URLDATA held_dat;

  /* Pick the largest gap that does not exceed num / 3 (never below UdmH[0]) */
  for (gap_idx= 13; gap_idx > 0 && (num / 3) < UdmH[gap_idx]; gap_idx--)
    ;

  for ( ; gap_idx >= 0; gap_idx--)
  {
    const ssize_t gap= (ssize_t) UdmH[gap_idx];
    ssize_t cur;

    for (cur= gap; cur < (ssize_t) num; cur++)
    {
      ssize_t slot= cur - gap;

      held_crd= L->Coords[cur];
      held_dat= L->Data[cur];

      /* Shift entries one gap to the right until the held entry fits */
      while (slot >= 0 && cmpsiteid(L, &held_crd, &held_dat, slot) > 0)
      {
        L->Coords[slot + gap]= L->Coords[slot];
        L->Data[slot + gap]= L->Data[slot];
        slot-= gap;
      }

      L->Coords[slot + gap]= held_crd;
      L->Data[slot + gap]= held_dat;
    }
  }
}


/*
  Sort the first num entries of L according to a sort pattern (see
  cmppattern()), using a gapped insertion sort driven by the UdmH[] table.
  Coords[] and Data[] are moved together; when Res->PerSite is not NULL its
  entries are moved in step with them.
*/
void UdmSortSearchWordsByPattern(UDM_RESULT *Res, UDM_URLCRDLIST *L,
                                 size_t num, const char *pattern)
{
  int gap_idx;
  UDM_URL_CRD held_crd;
  UDM_URLDATA held_dat;
  size_t held_cnt= 1;

  /* Pick the largest gap that does not exceed num / 3 (never below UdmH[0]) */
  for (gap_idx= 13; gap_idx > 0 && (num / 3) < UdmH[gap_idx]; gap_idx--)
    ;

  for ( ; gap_idx >= 0; gap_idx--)
  {
    const ssize_t gap= (ssize_t) UdmH[gap_idx];
    ssize_t cur;

    for (cur= gap; cur < (ssize_t) num; cur++)
    {
      ssize_t slot= cur - gap;

      held_crd= L->Coords[cur];
      held_dat= L->Data[cur];
      if (Res->PerSite)
        held_cnt= Res->PerSite[cur];

      /* Shift entries one gap to the right until the held entry fits */
      while (slot >= 0 &&
             cmppattern(L, &held_crd, &held_dat, slot, pattern) > 0)
      {
        L->Coords[slot + gap]= L->Coords[slot];
        L->Data[slot + gap]= L->Data[slot];
        if (Res->PerSite)
          Res->PerSite[slot + gap]= Res->PerSite[slot];
        slot-= gap;
      }

      L->Coords[slot + gap]= held_crd;
      L->Data[slot + gap]= held_dat;
      if (Res->PerSite)
        Res->PerSite[slot + gap]= held_cnt;
    }
  }
}


/*#define DEBUG_TOP_SORT*/

/*
  Find topcount best results.

  On return wrd[0..topcount-1] hold the best entries in cmpword() order
  (larger coord first, then smaller url_id). The remaining entries are left
  in unspecified order.

  wrd      - array of nwrd entries, reordered in place
  nwrd     - number of entries in wrd
  topcount - number of leading entries that must end up sorted

  When topcount >= nwrd the whole array is simply sorted; the partial
  algorithm below needs wrd[topcount] to exist.
*/
void UdmWrdTopSort(UDM_URL_CRD *wrd, size_t nwrd,size_t topcount)
{
  size_t j;
  UDM_URL_CRD w;
  
#ifdef DEBUG_TOP_SORT
  fprintf(stderr,"top+1=%d nwrd=%d\n",(int)(topcount+1),(int)nwrd);
#endif

  if (topcount >= nwrd)
  {
    /* Nothing to cut off: sorting topcount+1 entries would overrun wrd */
    UdmSortSearchWordsByWeight(wrd, nwrd);
    return;
  }
  
  UdmSortSearchWordsByWeight(wrd,topcount+1);
  for(j=topcount;j<nwrd;j++)
  {
    size_t l,c,r;

#ifdef DEBUG_TOP_SORT
fprintf(stderr,"(%d,%d) %d (%d,%d) %d\n",
    wrd[topcount].coord,wrd[topcount].url_id,(int)topcount,
    wrd[j].coord,wrd[j].url_id,(int)j);
#endif

    /*
      Does the candidate rank strictly before the boundary entry?
      The url_id tie-break uses comparisons rather than a subtraction so it
      cannot overflow or wrap.
    */
    if (!(wrd[j].coord > wrd[topcount].coord ||
          (wrd[j].coord == wrd[topcount].coord &&
           wrd[j].url_id < wrd[topcount].url_id)))
      continue;

    /* Binary search for the insertion point inside the sorted head */
    l=0;r=topcount;
    while(l<r)
    {
      c= l + (r - l) / 2;
      if (wrd[c].coord > wrd[j].coord ||
          (wrd[c].coord == wrd[j].coord &&
           wrd[c].url_id < wrd[j].url_id))
      {
        /* wrd[c] ranks before the candidate */
        l=c+1;
      }
      else
      {
        r=c;
      }
    }

    /*
      Insert the candidate at r, pushing the old boundary entry out of the
      head and into the candidate's former slot.
    */
    w=wrd[topcount];
    memmove(&wrd[r+1],&wrd[r],(topcount-r)*sizeof(*wrd));
    wrd[r]=wrd[j];
    wrd[j]=w;
  }
}


/* Capacity of the Res[] array of generated word forms in UdmAllForms1() */
#define UDM_MAX_FORMS 256
/* Capacity of the Norm[] array of normal forms in UdmAllForms1() */
#define UDM_MAX_NORMS 64

/*
  All the following combinations should
  work and get as many uword forms as possible:

  1. uword doesn't exist in ispell, its synonym doesn't exist in ispell.
     This last combination should also work if no ispell dictionaries loaded.
     Just copy all synonyms into result.
  2. DONE: both norm(uword) and its synonym exist in ispell
  3. norm(uword) exists in ispell, its synonym doesn't exist in ispell.
  4. uword doesn't exist in ispell, its synonym exists in ispell.
*/

/*
  Generate spelling forms of uword->word and append them to result.

  For every affix list / spell list pair whose lang and cset match
  (case-insensitively) the word is normalized with UdmSpellNormalize(),
  optionally extended with normal forms of its synonyms, and each normal
  form is then expanded with UdmSpellDenormalize(). Every produced string is
  added to result with origin UDM_WORD_ORIGIN_SPELL and uword's order.

  Configuration variables read:
    "sp" - when 0 nothing is generated and NULL is returned
    "sy" - when non-zero (and synonyms are loaded) synonyms of the normal
           forms are normalized as well

  Returns result, or NULL when "sp" is 0.
*/
static UDM_WIDEWORDLIST *UdmAllForms1(UDM_AGENT *Indexer,
                                      UDM_WIDEWORDLIST *result,
                                      const UDM_WIDEWORD *uword)
{
  UDM_SPELLLISTLIST *SLL= &Indexer->Conf->Spells;
  UDM_AFFIXLISTLIST *ALL= &Indexer->Conf->Affixes;
  UDM_SYNONYMLIST   *SYN= &Indexer->Conf->Synonyms;
  char *Res[UDM_MAX_FORMS];           /* generated forms, each UdmStrdup'ed */
  char **ResCur= Res;                 /* next free slot in Res */
  char **ResEnd= Res + UDM_MAX_FORMS;
  char **R;
  UDM_AFFIXLIST *Al;  
  UDM_WIDEWORD w;
  UDM_CONV lcs_uni;
  UDM_CHARSET *lcs= Indexer->Conf->lcs;
  int sy= UdmVarListFindInt(&Indexer->Conf->Vars, "sy", 1);
  int sp= UdmVarListFindInt(&Indexer->Conf->Vars, "sp", 1);

  if (!sp)
    return NULL;

  for (Al= ALL->Item; Al < &ALL->Item[ALL->nitems]; Al++)
  {
    UDM_SPELLLIST *Sl;
    for (Sl= SLL->Item; Sl < &SLL->Item[SLL->nitems]; Sl++)
    {
      /* Only combine affix and spell lists of the same language and charset */
      if (!strcasecmp(Al->lang, Sl->lang) && !strcasecmp(Al->cset, Sl->cset))
      {
        UDM_SPELL Norm[UDM_MAX_NORMS];
        UDM_SPELL *NormEnd= Norm + UDM_MAX_NORMS;
        UDM_SPELL *NormCur= Norm;
        UDM_SPELL *N;
        char tmp[256];
        char *word= uword->word;
        
        /* Recode the word into the spell list charset when it differs */
        if (lcs != Sl->cs)
        {
          UDM_CONV lcs_scs;
          size_t len= strlen(word);
          UdmConvInit(&lcs_scs, lcs, Sl->cs, UDM_RECODE_HTML);
          UdmConv(&lcs_scs, tmp, sizeof(tmp), word, len + 1);
          word= tmp;
        }
        
        NormCur+= UdmSpellNormalize(Sl, Al, word, NormCur, NormEnd-NormCur);
        
        if (sy && SYN->nsynonyms)
        {
          UDM_CONV scs_uni, uni_scs;
          UDM_WIDEWORD ww;
          UDM_WIDEWORDLIST *syn;
          int u[128];
          /* Only ww.uword is initialized before the synonym lookup */
          ww.uword= u;
          UdmConvInit(&scs_uni, Sl->cs, &udm_charset_sys_int, UDM_RECODE_HTML);
          UdmConvInit(&uni_scs, &udm_charset_sys_int, Sl->cs, UDM_RECODE_HTML);
          /* 
            Find synonyms for each normal form
            and add the found synonyms into normalized
            list for further denormalization.
            NormCur may grow inside the loop, so normal forms added for
            synonyms are visited by this loop too.
          */
          for (N= Norm; N < NormCur; N++)
          {
            UdmConv(&scs_uni,(char*)&u,sizeof(u),N->word,strlen(N->word)+1);
            if ((syn= UdmSynonymListFind(SYN, &ww)))
            {
              UDM_WIDEWORD *W;
              for (W= syn->Word; W < syn->Word + syn->nwords; W++)
              {
                size_t ubytes= (W->ulen + 1) * sizeof(int);
                UdmConv(&uni_scs, tmp, sizeof(tmp), (char*) W->uword, ubytes);
                if (NormCur < NormEnd)
                {
                  NormCur+= UdmSpellNormalize(Sl, Al, tmp, NormCur, NormEnd-NormCur);
                }
              }
              UdmWideWordListFree(syn);
            }
          }
        }
        
        /* Expand every normal form; the normal form itself is kept as well */
        for (N= Norm ; N < NormCur; N++)
        {
          if (ResCur < ResEnd)
          {
            size_t cres= 1;   /* forms produced for this normal form */
            *ResCur= UdmStrdup(N->word);
            cres+= UdmSpellDenormalize(Sl, Al, N, ResCur+1, ResEnd-ResCur-1);
            /* Recode the produced forms back into the local charset */
            if (lcs != Sl->cs)
            {
              size_t i;
              UDM_CONV scs_lcs;
              UdmConvInit(&scs_lcs, Sl->cs, lcs, UDM_RECODE_HTML);
              for (i=0; i < cres; i++)
              {
                UdmConv(&scs_lcs, tmp, sizeof(tmp),
                        ResCur[i], strlen(ResCur[i])+1);
                UdmFree(ResCur[i]);
                ResCur[i]= UdmStrdup(tmp);
              }
            }
            ResCur+= cres;
          }
        }
      }
    }
  }
  
  UdmConvInit(&lcs_uni, lcs, &udm_charset_sys_int, UDM_RECODE_HTML);
  
  /* Hand every collected form over to result and release the local copy */
  bzero((void*)&w, sizeof(w));
  for (R=Res; R < ResCur; R++)
  {
    size_t nbytes;
    w.order= uword->order;
    w.count= 0;
    w.origin= UDM_WORD_ORIGIN_SPELL;
    w.word= *R;
    w.len= strlen(w.word);
    nbytes= (w.len + 1) * sizeof(int);
    /* w.uword is a scratch buffer reused across iterations, freed below */
    w.uword= UdmRealloc(w.uword, nbytes);
    w.ulen= UdmConv(&lcs_uni, (char*) w.uword, nbytes, w.word, w.len + 1);
    UdmWideWordListAdd(result, &w);
    UdmFree(*R);
  }
  UdmFree(w.uword);


  return result;
}


/*
  Collect into result all forms of uword that the search should also match.

  First the spelling forms of uword itself are generated with
  UdmAllForms1(). Then, unless the "sy" variable is 0, every synonym of
  uword that differs from uword is added to result together with its own
  spelling forms.

  Returns result.
*/
static UDM_WIDEWORDLIST *UdmAllForms(UDM_AGENT *Indexer,
                                     UDM_WIDEWORDLIST *result,
                                     UDM_WIDEWORD *uword)
{
  UDM_WIDEWORDLIST *uwordsyn;
  
  /*
    Generate all possible word forms for uword.
  */
  UdmAllForms1(Indexer, result, uword);
  
  if (!UdmVarListFindInt(&Indexer->Conf->Vars, "sy", 1))
    return result;
  /*
     Combination one: uword is possibly a normalized form.
     Find all uword synonyms and then process them through
     ispell to generate all word forms for the synonyms.
  */
  if ((uwordsyn= UdmSynonymListFind(&Indexer->Conf->Synonyms, uword)))
  {
    UDM_WIDEWORD *ww;
    UDM_CONV uni_lcs;
    UdmConvInit(&uni_lcs, &udm_charset_sys_int, Indexer->Conf->lcs, UDM_RECODE_HTML); 

    for (ww= uwordsyn->Word; ww < &uwordsyn->Word[uwordsyn->nwords]; ww++)
    {
      char tmp[256];
      if (!UdmUniStrCmp(uword->uword, ww->uword))
        continue;
      /*
        Leave one byte of tmp for the terminator written below: with the
        full sizeof(tmp) as output limit a completely filled buffer would
        make ww->word[ww->len] address tmp[sizeof(tmp)].
      */
      ww->len= UdmConv(&uni_lcs, tmp, sizeof(tmp) - 1,
                       (char*) ww->uword, ww->ulen*sizeof(int));
      ww->word= tmp;
      ww->word[ww->len]= '\0';
      UdmWideWordListAdd(result, ww);
      UdmAllForms1(Indexer, result, ww);
      /* tmp is a stack buffer: detach it before the list is freed */
      ww->len= 0;
      ww->word= NULL;
    }
    UdmWideWordListFree(uwordsyn);
  }
  return result;
}

/*
  Parse the query string and fill Res with the search stack and word list.

  Steps performed:
    - the "q" variable is converted from the browser charset to Unicode,
      and "q" and "qprev" are replaced with copies recoded into the local
      charset;
    - the Unicode query is lower-cased, segmented (UdmUniSegment) and split
      into tokens;
    - separator tokens are scanned for the operators & + | ~ ( ) ", which
      are pushed to Res->items; inside an open phrase only '"' is pushed;
    - word tokens are pushed as UDM_STACK_WORD (or UDM_STACK_STOP when full
      word matching is on and the word is a stopword or its length is
      outside min_word_len..max_word_len), added to Res->WWList, and, for
      non-stop words, followed by all forms returned by UdmAllForms();
    - an unbalanced '"' is closed at the end.

  Always returns 0, including when a buffer cannot be allocated.
*/
int UdmPrepare(UDM_AGENT * query,UDM_RESULT *Res)
{
  UDM_CHARSET * browser_cs, * local_cs, *sys_int;
  int  ctype;
  int * ustr, * lt, * lex;
  size_t ulen;
  int word_match   = UdmMatchMode(UdmVarListFindStr(&query->Conf->Vars, "wm", "wrd"));
/*int search_mode = UdmSearchMode(UdmVarListFindStr(&query->Conf->Vars, "m", "all")); */
  const char * txt = UdmVarListFindStr(&query->Conf->Vars,"q","");
  const char * qprev = UdmVarListFindStr(&query->Conf->Vars,"qprev","");
  const char * qlang = UdmVarListFindStr(&query->Conf->Vars, "g", NULL);
  char *ltxt;
  size_t i, wlen, clen, llen, nphrasecmd = 0;
  char *wrd, *clex;
  int *uwrd;
  UDM_CONV uni_lc, bc_uni, bc_lc;
  const char *lang;
  
  if ((wrd = (char*)UdmMalloc(query->Conf->WordParam.max_word_len * 12 + 1)) == NULL) return 0;
  if ((uwrd = (int*)UdmMalloc(sizeof(int) * (query->Conf->WordParam.max_word_len + 1))) == NULL) { UDM_FREE(wrd); return 0; }


  if (!(browser_cs = query->Conf->bcs))
    browser_cs=UdmGetCharSet("iso-8859-1");
  
  if(!(local_cs = query->Conf->lcs))
    local_cs=UdmGetCharSet("iso-8859-1");
  
  sys_int= &udm_charset_sys_int;
  
  UdmConvInit(&bc_uni,browser_cs,sys_int,UDM_RECODE_HTML);
  UdmConvInit(&uni_lc,sys_int,local_cs,UDM_RECODE_HTML);
  UdmConvInit(&bc_lc,browser_cs,local_cs,UDM_RECODE_HTML);
  
  ulen=strlen(txt);
  ustr=(int*)(UdmMalloc((sizeof(int))*(ulen+1)));
  if (ustr == NULL)
  {
    UDM_FREE(uwrd);
    UDM_FREE(wrd);
    return 0;
  }
  UdmConv(&bc_uni,(char*)ustr,sizeof(ustr[0])*(ulen+1),txt,ulen+1);
  
  /* Create copy of query, converted into LocalCharset (for UdmTrack) */
  llen = ulen * 14 + 1;
  ltxt=(char*)UdmMalloc(llen);
  if (ltxt == NULL)
  {
    UDM_FREE(ustr);
    UDM_FREE(uwrd);
    UDM_FREE(wrd);
    return 0;
  }
  UdmConv(&uni_lc,ltxt,llen,(char*)ustr,bc_uni.obytes);
  ltxt[uni_lc.obytes]='\0';
  UdmVarListReplaceStr(&query->Conf->Vars,"q",ltxt);  /* "q-lc" was here */
  UDM_FREE(ltxt);
  
  llen = strlen(qprev);
  ltxt=(char*)UdmMalloc(llen*14+1);
  if (ltxt == NULL)
  {
    UDM_FREE(ustr);
    UDM_FREE(uwrd);
    UDM_FREE(wrd);
    return 0;
  }
  UdmConv(&bc_lc,ltxt,llen*14+1,qprev,llen);
  ltxt[bc_lc.obytes]='\0';
  UdmVarListReplaceStr(&query->Conf->Vars,"qprev",ltxt);
  UDM_FREE(ltxt);
  
  /* Parse query and build boolean search stack*/
  UdmUniStrToLower(ustr);
  switch(browser_cs->family)
  {
    case UDM_CHARSET_CHINESE_SIMPLIFIED:
    case UDM_CHARSET_CHINESE_TRADITIONAL: lang = "zh"; break;
    case UDM_CHARSET_JAPANESE: lang = "ja"; break;
    case UDM_CHARSET_THAI: lang = "th"; break;
    default: lang = "";
  }
  ustr = UdmUniSegment(query, ustr, lang);

  lex = UdmUniGetSepToken(ustr, &lt , &ctype);
  for ( ;lex; lex= UdmUniGetSepToken(NULL, &lt, &ctype))
  {
    wlen=lt-lex;
    /*
      uwrd holds at most max_word_len characters plus the terminator, so
      both the copy and the conversion below must use the clamped length;
      converting wlen+1 characters would read past the end of uwrd for
      tokens longer than max_word_len.
    */
    clen= udm_min(wlen, query->Conf->WordParam.max_word_len);
    memcpy(uwrd, lex, clen * sizeof(int));
    uwrd[clen] = 0;
    UdmConv(&uni_lc, wrd, query->Conf->WordParam.max_word_len * 12,(char*)uwrd, sizeof(uwrd[0])*(clen+1));
    clex = UdmTrim(wrd, " \t\r\n");
      
    if ((ctype != UDM_UNI_LETTER) && (ctype != UDM_UNI_CJK))
    {
      /* Separator token: look for boolean and phrase operators */
      for (i = 0; i < wlen; i++)
      {
         switch(lex[i])
         {
           case '&':
           case '+':
             Res->items[Res->nitems].cmd = UDM_STACK_AND;
             break;
           case '|':
             Res->items[Res->nitems].cmd = UDM_STACK_OR;
             break;
           case '~':
             Res->items[Res->nitems].cmd = UDM_STACK_NOT;
             break;
           case '(':
             Res->items[Res->nitems].cmd = UDM_STACK_LEFT;
             break;
           case ')':
             Res->items[Res->nitems].cmd = UDM_STACK_RIGHT;
             break;
           case '"':
             Res->items[Res->nitems].cmd = UDM_STACK_PHRASE;
             nphrasecmd++;
             break;
           default: continue;
         }
         /* Inside an open phrase only the closing quote is pushed */
         if (! (nphrasecmd % 2) || Res->items[Res->nitems].cmd == UDM_STACK_PHRASE)
         {
           Res->items[Res->nitems].arg=0;
           Res->nitems++;
           Res->ncmds++;
           if (Res->nitems >= Res->mitems)
           {
             Res->mitems += UDM_MAXSTACK;
             Res->items = (UDM_STACK_ITEM*)UdmRealloc(Res->items, Res->mitems * sizeof(UDM_STACK_ITEM));
           }
         }
      }
    } 
    else
    {
      UDM_WIDEWORD OWord;
      UDM_WIDEWORDLIST Forms;
      int origin;

      if(Res->WWList.nuniq >= UDM_MAXWORDPERQUERY-1)
        continue;

      Res->items[Res->nitems].cmd=UDM_STACK_WORD;
      Res->items[Res->nitems].arg = Res->WWList.nuniq;  /* 1L << (Res->WWList.nuniq); */
      Res->nitems++;
      if (Res->nitems >= Res->mitems)
      {
        Res->mitems += UDM_MAXSTACK;
        Res->items = (UDM_STACK_ITEM*)UdmRealloc(Res->items, Res->mitems * sizeof(UDM_STACK_ITEM));
      }

      /*
        Check stopword only when full word.
        Substring searches should not exclude them.
      */
      if(word_match == UDM_MATCH_FULL &&
         (UdmStopListFind(&query->Conf->StopWords, wrd, qlang) ||
          query->Conf->WordParam.min_word_len > wlen ||
          query->Conf->WordParam.max_word_len < wlen))
      {
        origin= UDM_WORD_ORIGIN_STOP;
        Res->items[Res->nitems - 1].cmd= UDM_STACK_STOP;
      }
      else
      {
        origin= UDM_WORD_ORIGIN_QUERY;
      }

      OWord.len= strlen(wrd);
      OWord.order= Res->WWList.nuniq;
      OWord.count= 0;
      OWord.word= wrd;
      OWord.uword= uwrd;
      OWord.origin = origin;
      UdmWideWordListAdd(&Res->WWList, &OWord);
      
      /* NOTE: nuniq is not advanced for stop words */
      if (origin == UDM_WORD_ORIGIN_STOP)
        continue;
      
      UdmWideWordListInit(&Forms);
      if(UdmAllForms(query,&Forms,&OWord))
      {
        UDM_WIDEWORD FWord;
        size_t frm;
        for (frm= 0; frm < Forms.nwords ; frm++)
        {
          /* wrd is reused as scratch space for the recoded form */
          UdmConv(&uni_lc,wrd,12*query->Conf->WordParam.max_word_len,
                  (char*)(Forms.Word[frm].uword),
                   sizeof(Forms.Word[frm].uword[0])*(UdmUniLen(Forms.Word[frm].uword)+1));
          FWord.len= strlen(wrd);
          FWord.order= Res->WWList.nuniq;
          FWord.count= 0;
          FWord.word= wrd;
          FWord.uword= Forms.Word[frm].uword;
          FWord.origin = Forms.Word[frm].origin;

          UdmWideWordListAdd(&Res->WWList,&FWord);
/*        UdmLog(query, UDM_LOG_DEBUG, "Word form: [%d] %s", FWord.origin, wrd);*/
        }
      }
      UdmWideWordListFree(&Forms);
      Res->WWList.nuniq++;
    }
  }
  
  /* Close a phrase the user left open */
  if (nphrasecmd & 1)
  {
    Res->items[Res->nitems].cmd = UDM_STACK_PHRASE;
    Res->items[Res->nitems].arg=0;
    Res->nitems++;
    Res->ncmds++;
    if (Res->nitems >= Res->mitems)
    {
      Res->mitems += UDM_MAXSTACK;
      Res->items = (UDM_STACK_ITEM*)UdmRealloc(Res->items, Res->mitems * sizeof(UDM_STACK_ITEM));
    }
  }
  UDM_FREE(ustr); UDM_FREE(uwrd); UDM_FREE(wrd);
  Res->WWList.wm = word_match;
    
  return(0);
}

/*
  R[i] and D[i] are in the range 0..64.
  ns is between 1..256
*/
static
inline uint4 UdmCalcCosineWeight(unsigned int *R, unsigned int *D, size_t num,
                                 float Rsum_factor, float nwords_factor)
{
  size_t i;
  float res;

  size_t Dsum=  D[0] * D[0];
  size_t RDsum= R[0] * D[0];

  for (i=1; i < num; i++)
  {
    if (D[i])
    {
      Dsum+=  D[i] * D[i];
      RDsum+= R[i] * D[i];
    }
  }
  
  res= Rsum_factor * nwords_factor * (float) RDsum / sqrt(Dsum) + 0.5;
  
#if 0
  fprintf(stderr,
          "nw=%d ns=%d R: %d  D: %d  RD: %d cos: %d\n",
          nw, ns, Rsum, Dsum, RDsum, (uint4) res);
#endif
  
  return (uint4) res;
}

/*
  Weight added for a word depending on where it came from:
  3 for words typed in the query, 1 for spelling forms, 0 for anything else.
*/
static int UdmOriginWeight(int origin)
{
  if (origin == UDM_WORD_ORIGIN_QUERY)
    return 3;
  if (origin == UDM_WORD_ORIGIN_SPELL)
    return 1;
  return 0;
}


/*
  Average distance between neighbouring occurrences of different words.

  phr - array of num position/order pairs; sorted in place by position
        when num > 2
  num - number of entries in phr

  Returns 0 for fewer than two entries. For exactly two entries the result
  is their absolute position difference, or 0 when both have the same
  order. Otherwise every entry whose order differs from both of its
  neighbours contributes the smaller of the two neighbour distances, and
  the integer average of those contributions is returned (0 when there are
  none).
*/
static unsigned int CalcAverageWordDistance(UDM_PHR_DAT *phr, size_t num)
{
  size_t mid, total= 0, hits= 0;

  if (num < 2)
    return 0;

  if (num == 2)
  {
    if (phr[0].order == phr[1].order)
      return 0;
    return phr[1].position > phr[0].position ?
           phr[1].position - phr[0].position :
           phr[0].position - phr[1].position;
  }

  UdmSort((void*)phr, num, sizeof(UDM_PHR_DAT), (qsort_cmp)cmpphr);

  for (mid= 1; mid + 1 < num; mid++)
  {
    size_t before, after;

    /* Only an entry that differs from both neighbours is counted */
    if (phr[mid-1].order == phr[mid].order ||
        phr[mid].order == phr[mid+1].order)
      continue;

    before= phr[mid].position - phr[mid-1].position;
    after=  phr[mid+1].position - phr[mid].position;
    total+= before < after ? before : after;
    hits++;
  }

  if (hits == 0)
    return 0;
  return total / hits;
}


/*
  For every phrase in the search stack decide whether the document contains
  its words at consecutive positions, and store the verdict (1 or 0) in
  count[] for each item of the phrase.

  temp_items - search stack, nitems entries
  phr        - word occurrences of the document (position/order pairs),
               phr_n entries; the matching below walks them forward only
  count      - per-word result array indexed by UDM_STACK_ITEM.arg

  A stop word inside the phrase widens the allowed position gap to the next
  matched word by one.
*/
static
inline void CheckPhrase(UDM_STACK_ITEM *temp_items, size_t nitems,
                        UDM_PHR_DAT *phr,  size_t phr_n,  size_t *count)
{
  size_t z, pharg, ph_start, ph_end;
  /* "z + 1 < nitems" rather than "z < nitems - 1": safe for nitems == 0 */
  for(z = 0; z + 1 < nitems; z++)
  {
    if (temp_items[z].cmd == UDM_STACK_PHRASE)
    {
      size_t y;
      /*
         we assume correct stack at this moment,
         i.e. no empty phrases nor illegal operations order
       */
      pharg = 0;
      ph_start = z + 1;
      for (ph_end = ph_start; ph_end < nitems && (temp_items[ph_end].cmd == UDM_STACK_WORD || temp_items[ph_end].cmd == UDM_STACK_STOP); ph_end++);
      if (phr_n >= (ph_end - ph_start))
      {
        /* Try every occurrence of the first phrase word as a start point */
        for (y = 0; y <= phr_n - (ph_end - ph_start); y++)
        {
          if (temp_items[ph_start].arg == phr[y].order)
          {
            UDM_PHR_DAT *prev = &phr[y];
            size_t x;
            size_t d = y + 1;
            size_t delta = 1;
            pharg = 1;

            for (x = ph_start + 1; x < ph_end; x++)
            {
              /*
                x walks the query items, d walks the occurrences: the
                stop-word test belongs to the query item.
              */
              if (temp_items[x].cmd == UDM_STACK_STOP)
              {
                delta++;
                continue;
              }

              /* Skip occurrences that cannot be the next phrase word */
              while (d < phr_n &&
                     (prev->position == phr[d].position ||
                      (prev->position + delta == phr[d].position &&
                       phr[d].order != temp_items[x].arg))) d++;

              if (d == phr_n ||
                  prev->position != phr[d].position - delta ||
                  temp_items[x].arg != phr[d].order)
              {
                pharg = 0;
                break;
              }
              delta = 1;
              prev = &phr[d];
              d++;
            }
            if (pharg) break;
          }
        }
      }
      for (y = ph_start; y < ph_end; y++) count[temp_items[y].arg] = pharg;
      /* Advance to the closing phrase command; the outer z++ steps over it */
      for (z++; z < nitems && (temp_items[z].cmd != UDM_STACK_PHRASE); z++);
    }
  }
}

/*
  For every phrase in the search stack decide whether the document contains
  its words at consecutive positions and store the verdict (1 or 0) in
  count[] for each UDM_STACK_WORD item of the phrase.

  query   - search stack, nitems entries
  coords  - word occurrences of the document (position/order pairs),
            ncoords entries; the matching below walks them forward only
  count   - per-word result array indexed by UDM_STACK_ITEM.arg

  Leading and trailing stop words of a phrase are ignored; a phrase made of
  stop words only is treated as found. A stop word between two words widens
  the allowed position gap by one.
*/
static inline void CheckPhrase1 (
       UDM_STACK_ITEM *query, size_t nitems,
       UDM_PHR_DAT *coords, size_t ncoords,
       size_t *count)
{
  size_t q;
  size_t start, end, arg, i;
  size_t rstart, rend;
  size_t j, d, delta;
  UDM_PHR_DAT *prev;

  /* find opening phrase command */
  for (q= 0; q < nitems; q++)
  {
    if (query[q].cmd != UDM_STACK_PHRASE) continue;

    /* find closing phrase command */
    start= q + 1;
    for (end= start; end < nitems && query[end].cmd != UDM_STACK_PHRASE; end++);
    q= end;
    arg= 0;

    /* skip leading and trailing stopwords for now */
    /* TODO: we have to check document length (for phrases like "word1 stopword1") */
    for (rstart= start; rstart < end && query[rstart].cmd == UDM_STACK_STOP; rstart++);
    /*
      rend is one past the last item, so the last item is query[rend - 1].
      Testing query[rend] would look at the closing phrase command, or one
      past the stack when the phrase is not closed.
    */
    for (rend= end; rend > rstart && query[rend - 1].cmd == UDM_STACK_STOP; rend--);

    /* if phrase contains stopwords only, we assume this document is found */
    if (rstart == rend) arg= 1;
    else for (i= 0; i < ncoords; i++)
    {
      /* Try every occurrence of the first phrase word as a start point */
      if (query[rstart].arg != coords[i].order) continue;
      prev = &coords[i];
      d= i + 1;
      delta= 1;

      for (j= rstart + 1; j < rend; j++)
      {
        if (query[j].cmd == UDM_STACK_STOP)
        {
          delta++;
          continue;
        }

        /* find coord for this word */
        while (d < ncoords &&
               (prev->position == coords[d].position ||
                (prev->position + delta == coords[d].position &&
                 coords[d].order != query[j].arg))) d++;

        if (d == ncoords ||
            prev->position != coords[d].position - delta ||
            query[j].arg != coords[d].order) break;

        delta= 1;
        prev= &coords[d];
      }
      /* All phrase items were matched from this start point */
      if (j == rend)
      {
        arg= 1;
        break;
      }
    }

    for (i= rstart; i < rend; i++)
      if (query[i].cmd == UDM_STACK_WORD) count[query[i].arg]= arg;
  }
}

/*
#define DEBUG_REL 1
*/

/*
  Alternative relevance calculation: collapse Res->CoordList to one entry
  per document and store a computed relevance in its coord field.

  The coordinate list is scanned as runs of equal url_id. Per query word i
  (plus one extra "fictitious" word appended at index M) it computes:
    TFi - number of coordinates of the word over all documents
    Ci  - TFi / N, N being the number of url_id runs
    Vi  - sum over documents of (Di - Ci)^2, divided by N - 1
    Wi  - TFi * Vi / Ci^2 (0 when Ci is 0)
  where Di is the number of coordinates of the word inside one document.
  The relevance of a document is sumWiDi / sqrt(sumDi2 * sumWi2) * 100000.

  On return Res->CoordList.ncoords is the number of documents.

  NOTE(review): with a single document N - 1 is 0, and with no unique words
  M is 0; both lead to floating point divisions by zero. Left unchanged.
*/
static void UdmGroupByURLNewRel(UDM_RESULT *Res)
{
  urlid_t prev_id;
  size_t N= 0;
  size_t M= Res->WWList.nuniq;
  size_t ndoc= 0; 
  size_t i;
  UDM_URL_CRD *Crd = Res->CoordList.Coords;
  float  TFi[UDM_MAXWORDPERQUERY+1];
  float   Ci[UDM_MAXWORDPERQUERY+1];
  float   Vi[UDM_MAXWORDPERQUERY+1];    
  float   Wi[UDM_MAXWORDPERQUERY+1];
  float   Di[UDM_MAXWORDPERQUERY+1];
    
  bzero(TFi, sizeof(TFi));
  bzero(Ci, sizeof(Ci));
  bzero(Vi, sizeof(Vi));
  bzero(Di, sizeof(Di));
  
  
  /* N - number of documents */
  for (prev_id= Crd[0].url_id, i=0; i<= Res->CoordList.ncoords; i++)
  {
    if ( (i==Res->CoordList.ncoords) || (prev_id != Crd[i].url_id))
    {
      N++;
      if (i < Res->CoordList.ncoords)
      {
        prev_id= Crd[i].url_id;
      }
    }

#ifdef DEBUG_REL
    if (i < Res->CoordList.ncoords)
    {
      fprintf(stderr,"[%d]doc=%d sec=%d wrd=%d pos=%d\n",
              i,Crd[i].url_id,
              UDM_WRDSEC(Crd[i].coord),
              UDM_WRDNUM(Crd[i].coord),
              UDM_WRDPOS(Crd[i].coord));
    }
#endif
  }
    
    
  /* TFi */
  for (i=0; i < Res->CoordList.ncoords; i++)
  {
    TFi[UDM_WRDNUM(Crd[i].coord)]++;
  }
    
  /* Add fictitious word */
  TFi[M]= ((float) Res->CoordList.ncoords) / M;
  M++;
    
    
    
  /* Ci */
  for (i=0; i < M; i++)
  {
     Ci[i]= ((float) TFi[i]) / N;
#ifdef DEBUG_REL
     fprintf(stderr,"[%d] TDi=%d Ci=%f\n",i,TFi[i], Ci[i]);
#endif
  }
    
    
  /* Vi */
  for (prev_id= Crd[0].url_id, i=0; i<= Res->CoordList.ncoords; i++)
  {
    size_t k;
      
    /* End of a document run (or of the list): fold its Di into Vi */
    if ( (i==Res->CoordList.ncoords) || (prev_id != Crd[i].url_id))
    {
      Di[M-1]= TFi[M-1]/N; /* fictitious word */
      for (k=0; k < M; k++)
      {
        Vi[k]+= (Di[k] - Ci[k]) * (Di[k] - Ci[k]);
      }
      bzero(Di, sizeof(Di));
        
      if (i < Res->CoordList.ncoords)
      {
        prev_id= Crd[i].url_id;
      }
    }
    if (i < Res->CoordList.ncoords)
      Di[UDM_WRDNUM(Crd[i].coord)]++;
  }
    
    
  for (i=0; i < M; i++)
  {
    Vi[i]/= (N-1);
    Wi[i]= Ci[i] ? TFi[i]*Vi[i]/Ci[i]/Ci[i] : 0;
#ifdef DEBUG_REL
    fprintf(stderr,"[%d] Vi=%f Wi=%f\n", i, Vi[i],Wi[i]);
#endif
  }
    
    
  /*
    Final pass: one result per document, written in place over the head of
    the coordinate list. ndoc never catches up with i, so entries that are
    still to be read are not overwritten.
  */
  bzero(Di,sizeof(Di));
  for (prev_id= Crd[0].url_id, i=0; i<= Res->CoordList.ncoords; i++)
  {
    size_t k;
    
    if ( (i==Res->CoordList.ncoords) || (prev_id != Crd[i].url_id))
    {
      float sumWiDi= 0;
      float sumDi2= 0;
      float sumWi2= 0;
      float res;
        
      Di[M-1]= TFi[M-1]/N;  /* fictitious word */
       
      for (k=0; k < M; k++)
      {
        sumWiDi+= Wi[k] * Di[k] / (float)M;
        sumDi2+=  Di[k] * Di[k] / (float)M;
        sumWi2+=  Wi[k] * Wi[k];
      }
        
      res= sumWiDi / sqrt(sumDi2 * sumWi2);
      Crd[ndoc].url_id= prev_id;
      Crd[ndoc].coord= res * 100000;
        
#ifdef DEBUG_REL
      fprintf(stderr,"RES[%d]=%f %f %f\n",
        prev_id, res, sumWiDi, sqrt(sumDi2 * sumWi2));
#endif 
        
      ndoc++;
        
      bzero(Di, sizeof(Di));
      if (i < Res->CoordList.ncoords)
      {
        prev_id= Crd[i].url_id;
      }
    }
      
    /*
      Strictly less than ncoords: the loop runs one step past the last
      coordinate to flush the final document, and Crd[ncoords] must not be
      read there.
    */
    if (i < Res->CoordList.ncoords)
      Di[UDM_WRDNUM(Crd[i].coord)]++;
  }
    
  Res->CoordList.ncoords= ndoc;
  return;
  
}


/* Number of entries in an nwf[] factor table */
#define NWF_SIZE 256
/*
  Look up entry n of the factor table nwf, clamping n to the last entry.
  Arguments are parenthesized so that any expression can be passed; note
  that n is evaluated twice.
*/
#define NWF_FACTOR(nwf, n) ((nwf)[(n) >= NWF_SIZE ? NWF_SIZE - 1 : (n)])

/*
  Collapse the coordinate list of Res (one entry per word hit) into
  one entry per document and replace each kept entry's coord with
  a relevance weight computed by UdmCalcCosineWeight().

  Consecutive entries with the same url_id are treated as one document,
  so the list is assumed to be grouped by url_id on entry.
  Documents that do not satisfy the search mode ("m" variable: all words,
  any word, or a boolean expression) are dropped.  On return
  Res->CoordList.ncoords is the number of documents kept.

  If a work buffer cannot be allocated up front the result is left
  untouched.  If the per-document hit buffer cannot be grown, the
  result is truncated to the documents that were already ranked.
*/
void UdmGroupByURL(UDM_AGENT *query,UDM_RESULT *Res)
{
  UDM_STACK_ITEM *temp_items= NULL;
  size_t  i, D_size, nitems = Res->nitems;
  size_t  *count= NULL, count_size = Res->WWList.nuniq * sizeof(size_t);
  int search_mode = UdmSearchMode(UdmVarListFindStr(&query->Conf->Vars, "m", "all"));
  size_t nsections = UdmVarListFindInt(&query->Conf->Vars, "NumSections", 256);
  size_t numcosine= Res->WWList.nwords * nsections + 1;
  int wf[256], add_cmd = UDM_STACK_AND, inphrase;
  unsigned int *R= NULL, *D= NULL;
  UDM_PHR_DAT *phr= NULL;
  size_t phr_n = 0, phr_m = 256;
  size_t z, Rsum;
  UDM_URL_CRD *Crd= Res->CoordList.Coords;
  UDM_URL_CRD *CrdTo= Res->CoordList.Coords;
  UDM_URL_CRD *CrdFrom= Res->CoordList.Coords + 1;
  UDM_URL_CRD *CrdLast= Res->CoordList.Coords + Res->CoordList.ncoords;
  float Rsum_factor, nword_factor;
  float nwf[NWF_SIZE];
  
  if(!Res->CoordList.ncoords) return;

  /* Alternative relevance algorithm, currently disabled */
  if (0)
  {
    UdmGroupByURLNewRel(Res);
    return;
  }
  
  
  UdmWeightFactorsInit(UdmVarListFindStr(&query->Conf->Vars, "wf", ""), wf);
  /* Factor grows linearly from 0.9 with the number of hits in a document */
  for (i= 0; i < NWF_SIZE; i++)
  {
    nwf[i]= 0.9 + ((float) i / Res->CoordList.ncoords)*0.1;
  }

  /*
    Allocate all work buffers first, then check them together,
    so that a failure releases everything that was obtained.
  */
  D_size = numcosine * sizeof(unsigned int);
  phr = (UDM_PHR_DAT*)UdmMalloc(phr_m * sizeof(UDM_PHR_DAT));
  count = (size_t*)UdmMalloc(count_size);
  R = (unsigned int*)UdmMalloc(D_size);
  D = (unsigned int*)UdmMalloc(D_size);
  temp_items = (UDM_STACK_ITEM*)UdmMalloc((Res->nitems + 1) * sizeof(UDM_STACK_ITEM) * 2);
  if (phr == NULL || count == NULL || R == NULL || D == NULL || temp_items == NULL)
    goto cleanup;


  /*
    Reference vector R: one weight per (word, section) pair;
    slot 0 is reserved for the word distance component.
  */
  for(R[0]= 0, Rsum=0, i= 0; i < Res->WWList.nwords; i++)
  {
    size_t secno;
    for (secno= 0; secno < nsections; secno++)
    {
      size_t offs= i * nsections + secno + 1;
      R[offs] = ((wf[secno]) << 2) + UdmOriginWeight(Res->WWList.Word[i].origin);
      Rsum+= R[offs] * R[offs];
    }
  }
  Rsum_factor= 100000 / sqrt(Rsum);
  
  if (Res->ncmds > 0 || search_mode == UDM_MODE_BOOL)
  {
    size_t j;
    switch(search_mode)
    {
      case UDM_MODE_ANY:
        add_cmd = UDM_STACK_OR;
        break;
      case UDM_MODE_BOOL:
      case UDM_MODE_ALL:
        add_cmd = UDM_STACK_AND;
        break;
    }
    temp_items[0] = Res->items[0];
    inphrase = (Res->items[0].cmd == UDM_STACK_PHRASE) ? 1 : 0;

    for (i= 1, j= 1; i < Res->nitems; i++)
    {
      /*
       * If previous item is WORD or PHRASE or RIGHT or STOPWORD
       * and next item is WORD or PHRASE or LEFT or STOPWORD
       * and we are not in phrase
       * we have to insert search mode dependent operator.
       */
      if ((Res->items[i - 1].cmd == UDM_STACK_WORD ||
           Res->items[i - 1].cmd == UDM_STACK_STOP ||
           Res->items[i - 1].cmd == UDM_STACK_PHRASE ||
           Res->items[i - 1].cmd == UDM_STACK_RIGHT) &&
          (Res->items[i].cmd == UDM_STACK_WORD ||
           Res->items[i].cmd == UDM_STACK_STOP ||
           Res->items[i].cmd == UDM_STACK_PHRASE ||
           Res->items[i].cmd == UDM_STACK_LEFT) &&
          ! inphrase)
      {
        temp_items[j].cmd = add_cmd;
        temp_items[j].arg = 0;
        j++;
      }
      if (Res->items[i].cmd == UDM_STACK_PHRASE) inphrase = ! inphrase;
      temp_items[j++] = Res->items[i];
    }
    search_mode = UDM_MODE_BOOL;
    nitems = j;
  }

  /* Start the first document with the first hit */
  bzero((void*)D, D_size);
  bzero((void*)count, count_size);
  phr[phr_n].position = UDM_WRDPOS(Crd[0].coord);
  phr[phr_n].order = Res->WWList.Word[UDM_WRDNUM(Crd[0].coord)].order;
  count[phr[phr_n].order]++;
  phr_n++;
  /*
    NOTE(review): unlike every later hit, the very first hit is stored
    without "<< 2" and without the origin weight.  This looks like an
    oversight, but it affects ranking, so it is kept as is: confirm.
  */
  D[1 + nsections * UDM_WRDNUM(Crd[0].coord) + UDM_WRDSEC(Crd[0].coord)]=
   wf[UDM_WRDSEC(Crd[0].coord)];


  for( ; CrdFrom < CrdLast; CrdFrom++)
  {
    /* Group by url_id */
    if (CrdTo->url_id == CrdFrom->url_id)
    {
      /* Same document */
      phr[phr_n].position = UDM_WRDPOS(CrdFrom->coord);
      phr[phr_n].order = Res->WWList.Word[UDM_WRDNUM(CrdFrom->coord)].order;
      count[phr[phr_n].order]++;
      if (++phr_n >= phr_m)
      {
        /* Assumes UdmRealloc() keeps the old block valid on failure */
        UDM_PHR_DAT *grown;
        phr_m += 256;
        grown = (UDM_PHR_DAT*)UdmRealloc(phr, phr_m * sizeof(UDM_PHR_DAT));
        if (grown == NULL)
        {
          /* Keep only the documents ranked so far */
          Res->CoordList.ncoords= CrdTo - Crd;
          goto cleanup;
        }
        phr = grown;
      }

      D[1 + nsections * UDM_WRDNUM(CrdFrom->coord) + UDM_WRDSEC(CrdFrom->coord)]= 
        (wf[UDM_WRDSEC(CrdFrom->coord)] << 2) +
         UdmOriginWeight(Res->WWList.Word[UDM_WRDNUM(CrdFrom->coord)].origin);
    }
    else
    {
    
      /* Next document: rank the one just finished */

      nword_factor= NWF_FACTOR(nwf, phr_n);
      
      switch(search_mode)
      {
        case UDM_MODE_BOOL:
          
          CheckPhrase1(temp_items, nitems, phr, phr_n, count);
          if(UdmCalcBoolItems(temp_items, nitems, count))
          {
            D[0]= CalcAverageWordDistance(phr, phr_n);
            CrdTo->coord = UdmCalcCosineWeight(R, D, numcosine, 
                                               Rsum_factor, nword_factor);
            CrdTo++;
          }
          break;
        
        case UDM_MODE_ALL:
          /* Drop the document if any query word is missing */
          for (z = 0; z < Res->WWList.nuniq; z++) if (count[z] == 0) break;
          if (z < Res->WWList.nuniq && count[z] == 0) break;
          /* fall through */
        default:
          D[0] = CalcAverageWordDistance(phr, phr_n);
          CrdTo->coord = UdmCalcCosineWeight(R, D, numcosine,
                                             Rsum_factor, nword_factor);
          CrdTo++;
      }

      /* The current hit becomes the first hit of the new document */
      *CrdTo= *CrdFrom;

      bzero((void*)D, D_size);
      bzero((void*)count, count_size);
      phr_n = 0;
      phr[phr_n].position = UDM_WRDPOS(CrdFrom->coord);
      phr[phr_n].order = Res->WWList.Word[UDM_WRDNUM(CrdFrom->coord)].order;
      count[phr[phr_n].order]++;
      phr_n++;

      D[1 + nsections * UDM_WRDNUM(CrdFrom->coord) + UDM_WRDSEC(CrdFrom->coord)]= 
        (wf[UDM_WRDSEC(CrdFrom->coord)] << 2) +
        UdmOriginWeight(Res->WWList.Word[UDM_WRDNUM(CrdFrom->coord)].origin);
    }
  }

  /* Rank the last document */

  D[0] = CalcAverageWordDistance(phr, phr_n);
  nword_factor= NWF_FACTOR(nwf, phr_n);
  CrdTo->coord = UdmCalcCosineWeight(R, D, numcosine,
                                     Rsum_factor, nword_factor);

  switch(search_mode)
  {
    case UDM_MODE_BOOL:
      CheckPhrase1(temp_items, nitems, phr, phr_n, count);
      if (UdmCalcBoolItems(temp_items, nitems, count))
        CrdTo++;
      break;
    case UDM_MODE_ALL:
      for (z = 0; z < Res->WWList.nuniq; z++)
        if (count[z] == 0)
          break;
      CrdTo= (z < Res->WWList.nuniq && count[z] == 0) ? CrdTo : CrdTo + 1;
      break;
    default:
      /* Here the last document is kept only if its weight is non-zero */
      if (CrdTo->coord)
        CrdTo++;
  }

  Res->CoordList.ncoords= CrdTo - Crd;

cleanup:
  UDM_FREE(temp_items);
  UDM_FREE(D);
  UDM_FREE(R);
  UDM_FREE(count);
  UDM_FREE(phr);
  return;
}


/*
  Collapse runs of consecutive results that share a site_id into
  a single result (the first one of each run).  Res->PerSite is
  allocated here and receives, per kept result, the length of its run.
  Res->CoordList.ncoords is updated to the number of runs.
  Nothing is changed if the list is empty or PerSite cannot be allocated.
*/
void UdmGroupBySite(UDM_AGENT *query, UDM_RESULT *Res)
{
  UDM_URL_CRD *Crd= Res->CoordList.Coords;
  UDM_URLDATA *Dat= Res->CoordList.Data;
  size_t total= Res->CoordList.ncoords;
  size_t src, dst= 0, run= 1;
  urlid_t cur_site;

  if (total == 0)
    return;

  Res->PerSite= (size_t*) UdmMalloc(sizeof(*Res->PerSite) * total);
  if (Res->PerSite == NULL)
    return;

  cur_site= Dat[0].site_id;

  for (src= 1; src < total; src++)
  {
    if (Dat[src].site_id == cur_site)
    {
      /* One more document from the site being counted */
      run++;
      continue;
    }

    /* A new site starts: close the previous run and keep this entry */
    Res->PerSite[dst]= run;
    dst++;
    run= 1;
    cur_site= Dat[src].site_id;
    Crd[dst]= Crd[src];
    Dat[dst]= Dat[src];
  }

  /* Close the final run */
  Res->PerSite[dst]= run;
  Res->CoordList.ncoords= dst + 1;
}

/*
  A (count, weight) pair of 32-bit values.
  NOTE(review): not referenced by the nearby code; presumably
  per-phrase statistics, judging by the name only - confirm.
*/
typedef struct
{
  uint4	count;
  uint4	weight;
} UDM_PHR_PAR;


/******** Convert category string into pairs of 32 bit numbers ********/

/*
  Only the first 12 characters of hex_str are used; despite the
  function name they are parsed as base 36 digits.

  hi/lo   receive the first and the second 6-digit halves of the
          string right-padded with '0' (range start).
  fhi/flo receive the halves of the string right-padded with 'Z'
          (range end); they are filled only when both are non-NULL.
*/
void UdmDecodeHex8Str(const char *hex_str, uint4 *hi,
                      uint4 *lo, uint4 *fhi, uint4 *flo)
{
  char padded[33];
  char hi_digits[7], lo_digits[7];
  char *p;

  /* Range start: pad with the smallest digit */
  strncpy(padded, hex_str, 12);
  padded[12]= '\0';
  strcat(padded, "000000000000");

  /* Blank out leading zeros; strtoul() skips leading white space */
  for (p= padded; *p == '0'; p++)
    *p= ' ';

  memcpy(hi_digits, padded, 6);
  hi_digits[6]= '\0';
  memcpy(lo_digits, padded + 6, 6);
  lo_digits[6]= '\0';

  *hi= (uint4) strtoul(hi_digits, (char **) NULL, 36);
  *lo= (uint4) strtoul(lo_digits, (char **) NULL, 36);

  if (fhi == NULL || flo == NULL)
    return;

  /* Range end: pad with the largest digit */
  strncpy(padded, hex_str, 12);
  padded[12]= '\0';
  strcat(padded, "ZZZZZZZZZZZZ");

  memcpy(hi_digits, padded, 6);
  hi_digits[6]= '\0';
  memcpy(lo_digits, padded + 6, 6);
  lo_digits[6]= '\0';

  *fhi= strtoul(hi_digits, (char **) NULL, 36);
  *flo= strtoul(lo_digits, (char **) NULL, 36);
}

/*
  Append a search limit to Agent->limits.

  type       selects how val is decoded (see the switch below)
  file_name  is copied into the limit record
  val        limit value as text

  Returns 0 on success, 1 when the limit table is full.
  For an unknown type all four bounds are stored as 0.
*/
int __UDMCALL UdmAddSearchLimit(UDM_AGENT *Agent, int type,
                                const char *file_name, const char *val)
{
  /* Zero defaults: the bounds are always defined, even for unknown types */
  uint4 hi= 0, lo= 0, f_hi= 0, f_lo= 0;
  
  if(Agent->nlimits >= MAX_SEARCH_LIMIT - 1) return(1);
  
  Agent->limits[Agent->nlimits].type = type;
  strcpy(Agent->limits[Agent->nlimits].file_name, file_name);
  switch(type)
  {
    case 0:  /* base 36 string decoded into a [hi:lo .. f_hi:f_lo] range */
      UdmDecodeHex8Str(val, &hi, &lo, &f_hi, &f_lo);
      break;
    case 1:  /* no value is decoded, all bounds stay 0 */
      break;
    case 2:  /* decimal number */
      hi= atoi(val);
      f_hi= hi;
      break;
    case 3:  /* 32 bit hash of the string */
      hi= UdmStrHash32(val);
      f_hi= hi;
      break;
    default:
      break;
  }  
  Agent->limits[Agent->nlimits].hi = hi;
  Agent->limits[Agent->nlimits].lo = lo;
  Agent->limits[Agent->nlimits].f_hi = f_hi;
  Agent->limits[Agent->nlimits].f_lo = f_lo;
  
  Agent->nlimits++;
  
  UdmLog(Agent, UDM_LOG_DEBUG, "val: %s  %x %x   %x %x", val, hi, lo, f_hi, f_lo);
  
  return(0);
}

/*
  Parse a CGI query string ("name=value&name=value...").

  Every pair is added to vars twice: as "name" and as "query.name",
  with the value CGI-unescaped.  If vars holds a "Limit-name" variable,
  its text up to the first ':' selects a limit type and, when the value
  is not empty, a search limit is added to Agent.

  Returns 0 on success, 1 if memory cannot be allocated.
*/
int UdmParseQueryString(UDM_AGENT * Agent,
                        UDM_VARLIST * vars,char * query_string)
{
  char * tok, *lt;
  size_t len= strlen(query_string);
  /* Room for any unescaped value and for "Limit-" + name + '\0' */
  size_t strsize= len + 7;
  char *str = (char *)UdmMalloc(strsize);
  char *qs = (char*)UdmStrdup(query_string);
  char qname[256];

  if ((str == NULL) || qs == NULL)
  {
    UDM_FREE(str);
    UDM_FREE(qs);
    return 1;
  }

  UdmSGMLUnescape(qs);
  
  tok = udm_strtok_r(qs, "&", &lt);
  while(tok)
  {
    char empty[]="";
    char * val;
    const char * lim;
    
    if((val=strchr(tok,'=')))
    {
      *val='\0';
      val++;
    }
    else
    {
      val=empty;
    }
    UdmUnescapeCGIQuery(str,val);
    UdmVarListAddQueryStr(vars,tok,str);
    udm_snprintf(qname, 256, "query.%s", tok);
    UdmVarListAddQueryStr(vars, qname, str);
    
    udm_snprintf(str, strsize, "Limit-%s", tok);
    if((lim=UdmVarListFindStr(vars,str,NULL)))
    {
      /*
        Tokenize a private copy of the limit description.  Its length
        is unrelated to the query string length, so it must not be
        squeezed into str: that truncated it and left it unterminated.
      */
      char *limdesc= (char*)UdmStrdup(lim);

      if (limdesc != NULL)
      {
        int ltype=0;
        const char * type, * fname = NULL;
        char * llt;

        if((type = udm_strtok_r(limdesc, ":", &llt)))
        {
          if(!strcasecmp(type, "category"))
          {
            ltype = UDM_LIMTYPE_NESTED; fname = UDM_LIMFNAME_CAT;
          }
          else if(!strcasecmp(type, "tag"))
          {
            ltype = UDM_LIMTYPE_LINEAR_CRC; fname = UDM_LIMFNAME_TAG;
          }
          else if(!strcasecmp(type, "time"))
          {
            ltype = UDM_LIMTYPE_TIME; fname = UDM_LIMFNAME_TIME;
          }
          else if(!strcasecmp(type, "hostname"))
          {
            ltype = UDM_LIMTYPE_LINEAR_CRC; fname = UDM_LIMFNAME_HOST;
          }
          else if(!strcasecmp(type, "language"))
          {
            ltype = UDM_LIMTYPE_LINEAR_CRC; fname = UDM_LIMFNAME_LANG;
          }
          else if(!strcasecmp(type, "content"))
          {
            ltype = UDM_LIMTYPE_LINEAR_CRC; fname = UDM_LIMFNAME_CTYPE;
          }
          else if(!strcasecmp(type, "siteid"))
          {
            ltype = UDM_LIMTYPE_LINEAR_INT; fname = UDM_LIMFNAME_SITE;
          }
          if((fname != NULL) && val[0] != '\0')
          {
            UdmAddSearchLimit(Agent,ltype,fname,val);
          }
        }
        UDM_FREE(limdesc);
      }
    }
    tok = udm_strtok_r(NULL, "&", &lt);
  }
  UDM_FREE(str);
  UDM_FREE(qs);
  return 0;
}



/*
  Check whether the token tok (flen code points) matches a word of List.

  Words of stopword origin are ignored unless hlstop is non-zero.
  A token of the same length as a word must be equal to it, ignoring
  case.  A longer token matches only according to List->wm: the word
  must be its prefix, its suffix, or appear anywhere inside it.

  Returns 1 on match, 0 otherwise.
*/
static int
UdmWordInWWList(UDM_WIDEWORDLIST *List, int *tok, size_t flen, int hlstop)
{
  size_t idx;

  for (idx= 0; idx < List->nwords; idx++)
  {
    UDM_WIDEWORD *Word= &List->Word[idx];
    size_t wlen;

    if (Word->origin == UDM_WORD_ORIGIN_STOP && !hlstop)
      continue;

    wlen= Word->ulen;
    if (wlen > flen)
      continue;

    if (wlen == flen)
    {
      /* Same length: only an exact match is possible */
      if (UdmUniStrNCaseCmp(tok, Word->uword, wlen) == 0)
        return 1;
      continue;
    }

    /* The token is longer than the word */
    switch (List->wm)
    {
      case UDM_MATCH_BEGIN:
        if (UdmUniStrNCaseCmp(tok, Word->uword, wlen) == 0)
          return 1;
        break;

      case UDM_MATCH_END:
        if (UdmUniStrNCaseCmp(tok + (flen - wlen), Word->uword, wlen) == 0)
          return 1;
        break;

      case UDM_MATCH_SUBSTR:
        {
          size_t start;

          /* Try every offset where the word still fits into the token */
          for (start= 0; start + wlen <= flen; start++)
          {
            size_t k= 0;

            while (k < wlen &&
                   UdmUniToLower(tok[start + k]) == UdmUniToLower(Word->uword[k]))
              k++;

            if (k == wlen)
              return 1;
          }
        }
        break;
    }
  }
  return 0;
}


/*
  Convert src with the given converters and mark the words of List.

  src is converted to Unicode with lc_uni and split into tokens; each
  token is converted with uni_bc into a new buffer.  Non-separator
  tokens found in List (see UdmWordInWWList) are enclosed between the
  code points 2 and 3, which serve as highlight begin/end markers.

  Returns a newly allocated 0-terminated string which the caller must
  free, or NULL if src is NULL or empty, or memory cannot be allocated.
*/

static char *
UdmHlConvertExtWithConv(UDM_WIDEWORDLIST *List, const char *src,
                        UDM_CONV *lc_uni, UDM_CONV *uni_bc,
                        int hilight_stopwords)
{
  int           *tok, *lt, ctype, *uni;
  int           i0= 0, i2= 2, i3= 3;
  char          *dst;
  size_t        srclen, dstlen= 0;
  size_t        dstmaxlen, unimaxlen;
  UDM_CHARSET   *sys_int= &udm_charset_sys_int;

  if(!src)return NULL;
  if ((srclen = strlen(src)) == 0) return NULL;
  
  dstmaxlen= srclen * 14 + 10;
  if ((dst= (char*)UdmMalloc(dstmaxlen)) == NULL)
    return NULL;

  /* Convert to unicode */
  unimaxlen= (srclen + 10) * sizeof(int);
  if ((uni= (int *)UdmMalloc(unimaxlen)) == NULL)
  {
    UdmFree(dst);
    return NULL;
  }
  UdmConv(lc_uni,(char*)uni, unimaxlen, src, srclen + 1);

  /*
    Parse unicode string.
    Every UdmConv() call below is given the space that is still free
    in dst rather than the full buffer size, so it cannot be told to
    write past the end of the buffer.
  */
  for (tok= UdmUniGetSepToken(uni, &lt, &ctype) ; tok ;
       tok= UdmUniGetSepToken(NULL, &lt, &ctype))
  {
    int found= 0;
    size_t flen= lt - tok;

    /* Check that it is word to be marked */
    if (ctype != UDM_UNI_SEPAR && List != NULL)
      found= UdmWordInWWList(List, tok, flen, hilight_stopwords);
    
    if (found)
      dstlen+= UdmConv(uni_bc, dst + dstlen, dstmaxlen - dstlen,
                       (char*) &i2, sizeof(i2));

    if (uni_bc->to == sys_int)
    {
      /* Target is the internal charset: copy code points as they are */
      memcpy(dst + dstlen, tok, sizeof(*tok) * flen);
      dstlen+= sizeof(*tok) * flen;
    }
    else
      dstlen+= UdmConv(uni_bc, dst + dstlen, dstmaxlen - dstlen,
                       (char*) tok, sizeof(*tok) * flen);
    
    if (found)
      dstlen+= UdmConv(uni_bc, dst + dstlen, dstmaxlen - dstlen,
                       (char*) &i3, sizeof(i3));
  }

  /* Terminate the result by converting a zero code point */
  dstlen+= UdmConv(uni_bc, dst + dstlen, dstmaxlen - dstlen,
                   (char*) &i0, sizeof(i0));

  UdmFree(uni);
  return dst;

}


/*
  Convert src from charset lcs to charset bcs, marking the words of
  List; hlstop tells whether stopwords are marked as well.
  Sets up the two converters (through the internal Unicode charset)
  and delegates to UdmHlConvertExtWithConv(), whose result is returned.
*/
char * UdmHlConvertExt(UDM_WIDEWORDLIST *List,const char * src,
                       UDM_CHARSET * lcs, UDM_CHARSET * bcs, int hlstop)
{
  UDM_CONV to_unicode;
  UDM_CONV from_unicode;
  char *marked;

  UdmConvInit(&to_unicode, lcs, &udm_charset_sys_int, UDM_RECODE_HTML);
  UdmConvInit(&from_unicode, &udm_charset_sys_int, bcs, UDM_RECODE_HTML);

  marked= UdmHlConvertExtWithConv(List, src, &to_unicode, &from_unicode, hlstop);
  return marked;
}


/*
  For PHP module compatibility.
  Same as UdmHlConvertExt() with stopword highlighting always enabled
  (hlstop is passed as 1).
*/
char * UdmHlConvert(UDM_WIDEWORDLIST *List,const char * src,
                    UDM_CHARSET * lcs, UDM_CHARSET * bcs)
{
  return UdmHlConvertExt(List, src, lcs, bcs, 1);
}


/*
  Recode a search result from charset lcs to charset bcs:
  - the words of Res->WWList,
  - all document sections except URL, CachedCopy and Content-Type;
    sections are also given highlight markers (stopwords are marked
    according to the "ExcerptStopword" variable, default on),
  - all string variables of Conf->Vars except HlBeg and HlEnd.

  Each converted string replaces the original one, which is freed.
  If a buffer for a word or a variable cannot be allocated, that value
  is left unconverted.  Always returns UDM_OK.
*/
int UdmConvert(UDM_ENV *Conf, UDM_RESULT *Res,
               UDM_CHARSET *lcs, UDM_CHARSET *bcs)
{
  size_t i;
  UDM_CONV lc_bc, lc_uni, uni_bc;
  int hlstop= UdmVarListFindBool(&Conf->Vars, "ExcerptStopword", 1);

  /* Init converters */
  UdmConvInit(&lc_bc,lcs,bcs,UDM_RECODE_HTML);
  UdmConvInit(&lc_uni, lcs, &udm_charset_sys_int, UDM_RECODE_HTML);
  UdmConvInit(&uni_bc, &udm_charset_sys_int, bcs, UDM_RECODE_HTML);
  
  /* Convert word list */
  for(i=0;i<Res->WWList.nwords;i++)
  {
    UDM_WIDEWORD *W=&Res->WWList.Word[i];
    size_t len;
    char *newval;

    if (W->word == NULL)
      continue;
    len= strlen(W->word);
    /* 12 output bytes per input byte is the worst case assumed here */
    if ((newval= (char*)UdmMalloc(len * 12 + 1)) == NULL)
      continue;
    
    UdmConv(&lc_bc,newval,len*12+1,W->word,len+1);
    UDM_FREE(W->word);
    W->word=newval;
  }
  
  /* Convert document sections */
  for(i=0;i<Res->num_rows;i++)
  {
    UDM_DOCUMENT  *D=&Res->Doc[i];
    size_t    sec;
    
    for(sec=0; sec < D->Sections.nvars; sec++)
    {
      UDM_VAR *Var= &D->Sections.Var[sec];
      
      /*
         A temporary fix to skip URL and CachedCopy:
         We will skip these sections for now.
         But this need a further fix in search.htm
         to distinguish two HTML formats:
         - HTML with &<>" escaped to &amp;&lt;&gt;&quot;
         - HTML with &<>" printed as is, no word hilight
         - HTML with &<>" printed as is, search word hilighted.
      */
      
      if (strcasecmp(Var->name,"URL") &&
          strcasecmp(Var->name,"CachedCopy") &&
          strcasecmp(Var->name,"Content-Type"))
      {
        /* The converter returns NULL for a NULL or empty value */
        char *newval= UdmHlConvertExtWithConv(&Res->WWList, Var->val,
                                              &lc_uni, &uni_bc, hlstop);
        UDM_FREE(Var->val);
        Var->val= newval;
      }
    }
  }
  
  /* Convert Env->Vars */
  for (i = 0; i < Conf->Vars.nvars; i++)
  {
    UDM_VAR *Var= &Conf->Vars.Var[i];
    if (UdmVarType(Var) == UDM_VAR_STR &&
        Var->val != NULL &&
        strcasecmp(Var->name, "HlBeg") &&
        strcasecmp(Var->name, "HlEnd"))
    {
      size_t len= strlen(Var->val);
      char *newval= (char*)UdmMalloc(len * 12 + 1);

      if (newval == NULL)
        continue;
    
      UdmConv(&lc_bc, newval, len * 12 + 1, Var->val, len + 1);
      UDM_FREE(Var->val);
      Var->val= newval;
    }
  }
  
  return UDM_OK;
}


/*
  Return a newly allocated copy of s with highlight markers removed
  and decimal character references decoded:
  - the marker bytes '\2' and '\3' are dropped,
  - "&#NNN;" becomes the character with code NNN when 0 < NNN < 128,
    and '?' for any other code,
  - everything else, including "&#" not followed by digits and ';',
    is copied unchanged.

  The caller must free the result.
  Returns NULL if s is NULL or memory cannot be allocated.
*/
char* UdmRemoveHiLightDup(const char *s)
{
  char   *d, *res;

  if (s == NULL)
    return NULL;

  /* The result is never longer than the source */
  if ((res= (char*)UdmMalloc(strlen(s) + 1)) == NULL)
    return NULL;
  
  for(d=res; s[0]; s++)
  {
    switch(s[0])
    {
      case '\2':
      case '\3':
        break;
      case '&':
        if (s[1] == '#')
        {
          const char *e;
          int code= 0;
          
          /*
            Stop accumulating once the code is out of the ASCII range:
            this keeps long digit strings from overflowing the int.
          */
          for (e= s+2; (*e >= '0') && (*e <= '9'); e++)
          {
            if (code < 128)
              code= code*10 + (e[0]-'0');
          }
          /* At least one digit is required: "&#;" is not a reference */
          if (*e == ';' && e > s + 2)
          {
            /* Code 0 would cut the string short, so it becomes '?' too */
            *d++= (code > 0 && code < 128) ? (char) code : '?';
            s= e;
            break;
          }
        }
        /* fall through */
        
      default:
        *d++=*s;
    }
  }
  *d='\0';
  return res;
}



/*
  Write the categories of C into textbuf as a sequence of
  "<CAT id=... path=... link=... name=...>" lines (fields separated
  by tabs, lines ended with CR LF).

  textbuf  destination buffer
  len      its size in bytes; nothing is written if it is 0

  Output that does not fit is cut off.  Always returns UDM_OK.
*/
int UdmCatToTextBuf(UDM_CATEGORY *C, char *textbuf, size_t len)
{
  char   *end= textbuf;
  size_t i;

  if (len == 0)
    return UDM_OK;

  textbuf[0]='\0';
  
  for(i = 0; i < C->ncategories; i++)
  {
    /* end always points to the terminating zero written so far */
    size_t used= (size_t) (end - textbuf);

    if (used + 1 >= len)
      break;  /* only room for the terminator is left */

    udm_snprintf(end, len - used,
                 "<CAT\tid=\"%d\"\tpath=\"%s\"\tlink=\"%s\"\tname=\"%s\">\r\n",
                 C->Category[i].rec_id, C->Category[i].path,
                 C->Category[i].link, C->Category[i].name);
    end = end + strlen(end);
  }
  return UDM_OK;
}

/*
  Parse one tag from textbuf (the format written by UdmCatToTextBuf)
  and append it to C as a new category.  The attributes "id", "path",
  "link" and "name" are recognized, others are ignored.

  Nothing is appended if textbuf is NULL, if it does not start with
  an HTML tag, or if the category array cannot be grown.
  Always returns UDM_OK.
  NOTE(review): allocation failure is not reported, because this
  function has had no error status so far - consider adding one.
*/
int UdmCatFromTextBuf(UDM_CATEGORY *C, char *textbuf)
{
  const char  *htok, *last;
  UDM_HTMLTOK  tag;
  UDM_CATITEM *grown;
  size_t    i, c;
  
  if (textbuf == NULL) return UDM_OK;
  UdmHTMLTOKInit(&tag);
  
  htok=UdmHTMLToken(textbuf,&last,&tag);
  
  if(!htok || tag.type != UDM_HTML_TAG)
    return UDM_OK;

  /*
    Grow through a temporary pointer so that the existing array
    is not lost if the reallocation fails.
  */
  c = C->ncategories;
  grown = (UDM_CATITEM*)UdmRealloc(C->Category, sizeof(UDM_CATITEM) * (c + 1));
  if (grown == NULL)
    return UDM_OK;
  C->Category = grown;
  bzero((void*)&C->Category[c], sizeof(UDM_CATITEM));
  
  /* Token 0 is the tag name itself, attributes start at 1 */
  for(i = 1; i < tag.ntoks; i++)
  {
    size_t  nlen = tag.toks[i].nlen;
    size_t  vlen = tag.toks[i].vlen;
    char  *name = UdmStrndup(tag.toks[i].name, nlen);
    char  *data = UdmStrndup(tag.toks[i].val, vlen);

    if (name != NULL && data != NULL)
    {
      /*
        The item was zeroed above, so copying at most size - 1 bytes
        leaves every text field 0-terminated.
      */
      if (!strcmp(name, "id"))
      {
        C->Category[c].rec_id = atoi(data);
      }
      else if (!strcmp(name, "path"))
      {
        strncpy(C->Category[c].path, data, sizeof(C->Category[c].path) - 1);
      }
      else if (!strcmp(name, "link"))
      {
        strncpy(C->Category[c].link, data, sizeof(C->Category[c].link) - 1);
      }
      else if (!strcmp(name, "name"))
      {
        strncpy(C->Category[c].name, data, sizeof(C->Category[c].name) - 1);
      }
    }

    UDM_FREE(name);
    UDM_FREE(data);
  }

  C->ncategories++;
  return UDM_OK;
}

/*
  Run a word segmenter over the 0-terminated Unicode string ustr.

  The segmenter is chosen from the "Segmenter" variable of the
  configuration and from lang; when either of them is NULL it does
  not restrict the choice:
  - Chasen (if compiled in), for lang starting with "ja"
  - Mecab  (if compiled in), for lang starting with "ja"
  - Freq, for lang starting with "zh" (only if gb2312 is compiled in)
  - Freq, for lang starting with "th"
  The first branch whose conditions hold is used.

  Returns the segmented string.  ustr may have been reallocated or
  freed and replaced, so the caller must continue with the returned
  pointer only.  If no segmenter applies, ustr is returned unchanged.
*/
int *UdmUniSegment(UDM_AGENT *Indexer, int *ustr, const char *lang)
{
  UDM_CHARSET  *sys_int= &udm_charset_sys_int;
  size_t       reslen, dstlen = UdmUniLen(ustr);
  const char   *seg=  UdmVarListFindStr(&Indexer->Conf->Vars, "Segmenter", NULL);

#ifdef CHASEN
  if ((!seg  || !strcasecmp(seg, "Chasen")) &&
      (!lang || !strncasecmp(lang, "ja", 2)))
  {
    char        *eucstr, *eucstr_seg;
    UDM_CHARSET *eucjp_cs;
    UDM_CONV    uni_eucjp, eucjp_uni;
    
    /* Chasen is fed EUC-JP text: convert there and back */
    eucjp_cs = UdmGetCharSet("euc-jp");
    if (!eucjp_cs) eucjp_cs = &udm_charset_sys_int;
    UdmConvInit(&uni_eucjp, sys_int, eucjp_cs, UDM_RECODE_HTML);
    UdmConvInit(&eucjp_uni, eucjp_cs, sys_int, UDM_RECODE_HTML);
    eucstr = (char*)UdmMalloc(12 * dstlen + 1);
    UdmConv(&uni_eucjp, eucstr, 12 * dstlen + 1, (char*)ustr, sizeof(*ustr)*(dstlen + 1));
    
    UDM_GETLOCK(Indexer, UDM_LOCK_SEGMENTER);
    /* NOTE(review): the result is not freed here; presumably owned by chasen - confirm */
    eucstr_seg = chasen_sparse_tostr(eucstr);
    UDM_RELEASELOCK(Indexer, UDM_LOCK_SEGMENTER);
    
    /* One int per byte of the segmented text, including the terminator */
    reslen = strlen(eucstr_seg) + 1;
    /* NOTE(review): neither UdmMalloc nor UdmRealloc results are checked */
    ustr = (int*)UdmRealloc(ustr, reslen * sizeof(int));
    UdmConv(&eucjp_uni, (char*)ustr, reslen * sizeof(int), eucstr_seg, reslen);
    UDM_FREE(eucstr);
    /* dstlen is not used after this point */
    dstlen = UdmUniLen(ustr);
    return ustr;
  }
#endif


#ifdef MECAB
  if ((!seg  || !strcasecmp(seg, "Mecab")) &&
      (!lang || !strncasecmp(lang, "ja", 2)))
  {
    UDM_CHARSET *sjis_cs;
    UDM_CONV    uni_sjis, sjis_uni;
    char        *sjisstr, *sjisstr_seg;

    /* NOTE(review): variables are named sjis, but the charset requested is "euc-jp" */
    sjis_cs = UdmGetCharSet("euc-jp");
    if (!sjis_cs) sjis_cs = &udm_charset_sys_int;
    UdmConvInit(&uni_sjis, sys_int, sjis_cs, UDM_RECODE_HTML);
    UdmConvInit(&sjis_uni, sjis_cs, sys_int, UDM_RECODE_HTML);

    sjisstr = (char*)UdmMalloc(12 * dstlen + 1);
    UdmConv(&uni_sjis, sjisstr, 12 * dstlen + 1, (char*)ustr, sizeof(*ustr) * (dstlen + 1));

    /* The mecab handle is stored in the shared configuration */
    UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
#ifdef HAVE_PTHREADS
    mecab_lock(Indexer->Conf->mecab);
#endif
    /* NOTE(review): the result is not freed here; presumably owned by mecab - confirm */
    sjisstr_seg = mecab_sparse_tostr(Indexer->Conf->mecab, sjisstr);
#ifdef HAVE_PTHREADS
    mecab_unlock(Indexer->Conf->mecab);
#endif
    UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);

    /* One int per byte of the segmented text, including the terminator */
    reslen = strlen(sjisstr_seg) + 1;
    /* NOTE(review): neither UdmMalloc nor UdmRealloc results are checked */
    ustr = (int*)UdmRealloc(ustr, reslen * sizeof(int));
    UdmConv(&sjis_uni, (char*)ustr, reslen * sizeof(int), sjisstr_seg, reslen);
    UDM_FREE(sjisstr);
    /* dstlen is not used after this point */
    dstlen = UdmUniLen(ustr);
    return ustr;
  }
#endif


#ifdef HAVE_CHARSET_gb2312
  if ((!seg  || !strcasecmp(seg, "Freq")) &&
      (!lang || !strncasecmp(lang, "zh", 2)))
  {
    int *seg_ustr;
    /* Segment using the Conf->Chi data */
    UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
    seg_ustr = UdmSegmentByFreq(&Indexer->Conf->Chi, ustr);
    UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);
    /* On success the segmented copy replaces the original string */
    if (seg_ustr != NULL)
    {
      UDM_FREE(ustr);
      ustr = seg_ustr;
    }
    /* dstlen is not used after this point */
    dstlen = UdmUniLen(ustr);
    return ustr;
  }
#endif

  if ((!seg  || !strcasecmp(seg, "Freq")) &&
      (!lang || !strncasecmp(lang, "th", 2)))
  {
    int *seg_ustr;
    /* Segment using the Conf->Thai data */
    UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
    seg_ustr = UdmSegmentByFreq(&Indexer->Conf->Thai, ustr);
    UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);
    /* On success the segmented copy replaces the original string */
    if (seg_ustr != NULL)
    {
      UDM_FREE(ustr);
      ustr = seg_ustr;
    }
    /* dstlen is not used after this point */
    dstlen = UdmUniLen(ustr);
    return ustr;
  }

  /* No segmenter applies */
  return ustr;
}


/*
  Build the word statistics strings of a result and store them in
  Env->Vars:
  "WE"  every query, spelling and synonym word with its count, and
        every stopword:  " word : N,  word : stopword"
  "W"   query words with their own count and the total count of all
        words of the same order, and stopwords: "word : N / M, ..."
  "WS"  the query with each word that was not found replaced by its
        most frequent suggestion; set only if a suggestion was used.

  Always returns UDM_OK.
  NOTE(review): if the work buffer cannot be allocated no variable is
  set and UDM_OK is still returned, because this function has had
  no error status so far - consider adding one.
*/
int UdmResWordInfo(UDM_ENV *Env, UDM_RESULT *Res)
{
  size_t  len, i, j;
  char  *wordinfo, *end;
  size_t  corder, ccount;
  int have_suggestions= 0;
  
  /*
    Size the buffer from the actual string as well as from the stored
    length: UdmConvert() replaces Word.word without updating Word.len.
    64 extra bytes per word cover separators and two decimal numbers.
  */
  for(len = i = 0; i < Res->WWList.nwords; i++) 
  {
    UDM_WIDEWORD *W= &Res->WWList.Word[i];
    size_t stored= (size_t) W->len;
    size_t actual= W->word ? strlen(W->word) : 0;
    len += (actual > stored ? actual : stored) + 64;
  }
  
  wordinfo= (char*) UdmMalloc(len + 1);
  if (wordinfo == NULL)
    return UDM_OK;

  /* "WE": all words; end tracks the end of the text written so far */
  *wordinfo= '\0';
  end= wordinfo;
  for(i = 0; i < Res->WWList.nwords; i++)
  {
    UDM_WIDEWORD *W= &Res->WWList.Word[i];

    if (W->origin == UDM_WORD_ORIGIN_QUERY ||
        W->origin == UDM_WORD_ORIGIN_SPELL ||
        W->origin == UDM_WORD_ORIGIN_SYNONYM)
    {
      sprintf(end, "%s %s : %d", wordinfo[0] ? ", " : "", W->word, (int) W->count);
      end+= strlen(end);
    }
    else if (W->origin == UDM_WORD_ORIGIN_STOP)
    {
      sprintf(end, "%s %s : stopword", wordinfo[0] ? ", " : "", W->word);
      end+= strlen(end);
    }
  }
  
  UdmVarListReplaceStr(&Env->Vars, "WE", wordinfo);
  
  /* "W": query words with the total for their order */
  *wordinfo = '\0';
  end= wordinfo;
  for(i = 0; i < Res->WWList.nwords; i++)
  {
    UDM_WIDEWORD *W= &Res->WWList.Word[i];

    corder= W->order;
    ccount= 0;
    for(j= 0; j < Res->WWList.nwords; j++)
      if (Res->WWList.Word[j].order == corder)
        ccount += Res->WWList.Word[j].count;

    if (W->origin == UDM_WORD_ORIGIN_STOP)
    {
      sprintf(end, "%s%s : stopword", (*wordinfo) ? ", " : "", W->word);
      end+= strlen(end);
    }
    else if (W->origin == UDM_WORD_ORIGIN_QUERY)
    {
      /* Counts are cast to match the %d conversions */
      sprintf(end, "%s%s : %d / %d", 
        (*wordinfo) ? ", " : "", W->word, (int) W->count, (int) ccount);
      end+= strlen(end);
    }
  }
  UdmVarListReplaceStr(&Env->Vars, "W", wordinfo);
  
  /* "WS": the query with suggestions substituted */
  *wordinfo= '\0';
  end= wordinfo;
  for (i= 0; i < Res->WWList.nwords; i++)
  {
    UDM_WIDEWORD *Wi= &Res->WWList.Word[i];
    UDM_WIDEWORD *Wb= NULL;
   
    if (Wi->origin == UDM_WORD_ORIGIN_QUERY)
    {
      if (Wi->count > 0)
      {
        Wb= Wi;
      }
      else
      {
        /* Word not found: pick the most frequent suggestion of the same order */
        ccount= 0;
        for (j= 0; j < Res->WWList.nwords; j++)
        {
          UDM_WIDEWORD *Wj= &Res->WWList.Word[j];
          if (Wj->origin == UDM_WORD_ORIGIN_SUGGEST &&
              Wj->order == Wi->order && Wj->count > ccount)
          {
            ccount= Wj->count;
            Wb= Wj;
            have_suggestions= 1;
          }
        }
      }
    }
    else if (Wi->origin == UDM_WORD_ORIGIN_STOP)
    {
      Wb= Wi;
    }
    
    if (Wb)
    {
      sprintf(end, "%s%s", wordinfo[0] ? " " : "", Wb->word);
      end= end + strlen(end);
    }
  }
  
  if (have_suggestions)
    UdmVarListReplaceStr(&Env->Vars, "WS", wordinfo);
  UDM_FREE(wordinfo);
  return UDM_OK;
}
