/* Copyright (C) 2000-2002 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

#include <udm_config.h>

#include <stdlib.h>
#include <string.h>
#include <errno.h>

#include <sys/types.h>
#ifdef   HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_ZLIB
#include <zlib.h>
#endif

#include "udm_store.h"
#include "udm_services.h"
#include "udm_xmalloc.h"
#include "udm_hash.h"
#include "udm_utils.h"
#include "udm_log.h"
#include "udm_vars.h"
#include "udm_parsehtml.h"
#include "udm_unicode.h"
#include "udm_unidata.h"
#include "udm_searchtool.h"
#include "udm_sgml.h"
#include "udm_sqldbms.h"
#include "udm_mutex.h"
#include "udm_doc.h"
#include "udm_db.h"

/*
 * "Not a space" predicate for a single code point.
 *
 * Returns 0 when c is one of the white-space values recognised here
 * (TAB, LF, CR, SPACE, U+1680, U+2000..U+200B, U+202F, U+3000) and
 * 1 for every other value.
 */
static int UdmUniNSpace(int c) {
	switch (c) {
		case 0x0009:
		case 0x000A:
		case 0x000D:
		case 0x0020:
		case 0x1680:
		case 0x202F:
		case 0x3000:
			return 0;
		default:
			break;
	}
	/* The only remaining white-space values form one contiguous range. */
	return (c < 0x2000 || c > 0x200B) ? 1 : 0;
}

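/*
 * NOTE: everything from here to the closing comment marker is a disabled,
 * older implementation (UdmUniStrWWL and a previous UdmExcerptDoc).
 * It is not compiled; the live UdmExcerptDoc is defined further below
 * in this file.
 *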
static int * UdmUniStrWWL(int *s, UDM_WIDEWORDLIST *wwl, int *c, size_t *len) {
  int sc;
  register size_t i;

  while((sc = UdmUniToLower(*s++)) != 0) {
    for(i = 0; i < wwl->nwords; i++) {
      if ((sc == c[i]) && (wwl->Word[i].origin != UDM_WORD_ORIGIN_STOP)) {
     if ((UdmUniStrNCaseCmp(s, &(wwl->Word[i].uword[1]), len[i]) == 0)
         && ( (UdmUniNSpace(s[len[i]]) == 0) || (s[len[i]] == 0) || (s[len[i]] < 0x30 )  )  ) {
       s--;
       return s;
     }
      }
    }
  }
  return NULL;
}

__C_LINK char * __UDMCALL UdmExcerptDoc(UDM_AGENT *query, UDM_RESULT *Res, UDM_DOCUMENT *Doc, size_t size, size_t padding) {
  char *HDoc,*HEnd;
  const char *htok, *last = NULL;
  const char *lcharset;
  UDM_CHARSET *lcs = NULL, *dcs = NULL, *sys_int;
  UDM_HTMLTOK tag;
  int *start, *end, *prevend, *uni, ures, *p, *oi, *np, *c;
  int dot[] = {0x2e, 0x2e, 0x2e, 0};
  size_t *wlen, i, len, maxwlen = 0, ulen, prevlen, osl;
  UDM_CONV dc_uni, uni_lc;
  size_t DocSize;
  char *os, *Source = NULL;
  const char *CachedCopy;
  
  if (query->Conf->lcs == NULL)
  {
    lcharset = UdmVarListFindStr(&query->Conf->Vars, "CS", "");
    if (lcharset == NULL || (!strcmp(lcharset, "")))
    {
      lcharset = UdmVarListFindStr(&query->Conf->Vars, "LocalCharset", "iso-8859-1");
    }
    lcs = UdmGetCharSet(lcharset);
  } 
  else
  {
    lcs = query->Conf->lcs;
  }
  dcs = UdmGetCharSet(UdmVarListFindStr(&Doc->Sections,"Charset","iso-8859-1"));
  if (!lcs || !dcs) return NULL;

  if (!(sys_int=UdmGetCharSet("sys-int")))
    return NULL;
  
  UdmConvInit(&dc_uni, dcs, sys_int, UDM_RECODE_HTML);
  UdmConvInit(&uni_lc,sys_int,lcs,UDM_RECODE_HTML);

  c = (int *) UdmMalloc(Res->WWList.nwords * sizeof(int));
  if (c == NULL) {  return NULL; }
  wlen = (size_t *) UdmMalloc(Res->WWList.nwords * sizeof(size_t));
  if (wlen == NULL)
  {
    UDM_FREE(c);
    return NULL;
  }
  for (i = 0; i < Res->WWList.nwords; i++)
  {
    wlen[i] = Res->WWList.Word[i].len - 1;
    c[i] = Res->WWList.Word[i].uword[0];
    if (wlen[i] > maxwlen) maxwlen = wlen[i];
  }
  if ((oi = (int *)UdmMalloc(2 * (udm_max(size,maxwlen+2*padding+8) + 1) * sizeof(int))) == NULL)
  {
    UDM_FREE(c);
    UDM_FREE(wlen);
    return NULL;
  }
  oi[0]=0;
  
  
  DocSize = 1+ UdmVarListFindInt(&Doc->Sections, "Content-Length", UDM_MAXDOCSIZE);
  if ((DocSize == 0) || ((HEnd=HDoc = (char *)UdmMalloc(DocSize)) == NULL))
  {
    UDM_FREE(oi);
    UDM_FREE(c);
    UDM_FREE(wlen);
    return NULL;
  }
  HDoc[0]='\0';
  
  
  if ( (uni = (int *)UdmMalloc((DocSize + 10) * sizeof(int)) ) == NULL)
  {
    UDM_FREE(oi);
    UDM_FREE(c);
    UDM_FREE(wlen);
    UDM_FREE(HDoc);
    return NULL;
  }
  
#ifdef HAVE_ZLIB
  CachedCopy = UdmVarListFindStr(&Doc->Sections, "CachedCopy", NULL);
  if (CachedCopy)
  {
    size_t l;
    char *in_buf;
    z_stream zstream;
    
    l = strlen(CachedCopy);
    Source = UdmMalloc(UDM_MAXDOCSIZE);
    
    in_buf = UdmMalloc(l);
    zstream.next_in = (Byte *)in_buf;
    zstream.avail_in = udm_base64_decode((char *)zstream.next_in, CachedCopy, l);
    zstream.next_out = (Byte *)Source;
    zstream.avail_out = UDM_MAXDOCSIZE-1;
    zstream.zalloc = Z_NULL;
    zstream.zfree = Z_NULL;
    zstream.opaque = Z_NULL;
    
    if (inflateInit2(&zstream, 15) != Z_OK)
    {
      UdmFree(Source);
      UdmFree(in_buf);
      return(NULL);
    }
    
    inflate(&zstream, Z_FINISH);
    inflateEnd(&zstream);
    Source[zstream.total_out] = 0;
    UdmFree(in_buf);
  } 
  else
  {
#endif
    Source = (char*)UdmStrdup(UdmVarListFindStr(&Doc->Sections, "body", ""));
#ifdef HAVE_ZLIB
  }
#endif
  
  
  UdmHTMLTOKInit(&tag); 
  htok = UdmHTMLToken(Source, &last, &tag);
  for (len = 0; (len == 0) && htok; )
  {
    switch(tag.type)
    {
      case UDM_HTML_TXT:
        if (tag.script == 0 && tag.comment == 0 && tag.style == 0) 
        {
          memcpy(HEnd, htok, (size_t)(last-htok));
          HEnd += last - htok;
          HEnd[0] = '\0';
          len = HEnd - HDoc;
        }
        break;
      case UDM_HTML_COM:
      case UDM_HTML_TAG:
      default:
        break;
    }
    htok = UdmHTMLToken(NULL, &last, &tag);
  }
  
  
  if (HEnd == HDoc)
  {
    UDM_FREE(oi); 
    UDM_FREE(c); 
    UDM_FREE(wlen); 
    UDM_FREE(HDoc); 
    UDM_FREE(uni);
    UDM_FREE(Source);
    return NULL;
  }
  
  prevlen = 0;
  ulen = 0;
  
  UdmConv(&dc_uni, (char*)uni, sizeof(*uni)*(DocSize+10), HDoc, len + 1) / sizeof(*uni);
  prevlen = len;
  ulen += UdmUniLen(uni);
  
  
  for (p = prevend = uni; UdmUniLen(oi) < size; )
  {
    while((np  = UdmUniStrWWL(p, &(Res->WWList), c, wlen)) == NULL)
    {
      while(htok && (len == prevlen))
      {
        switch(tag.type)
        {
          case UDM_HTML_TXT:
            if (tag.script == 0 && tag.comment == 0 && tag.style == 0)
            {
              memcpy(HEnd, htok, (size_t)(last-htok));
              HEnd += last - htok;
              HEnd[0] = '\0';
              len = HEnd - HDoc;
            }
            break;
          case UDM_HTML_COM:
          case UDM_HTML_TAG:
          default:
            break;
        }
        htok = UdmHTMLToken(NULL, &last, &tag);
      }

      if (len == prevlen) break;

      UdmConv(&dc_uni, (char*)(uni + ulen), sizeof(*uni)*(DocSize + 10 - ulen), HDoc + prevlen, len - prevlen + 1) / sizeof(*uni);
      prevlen = len;
      p = (ulen < maxwlen) ? uni : (uni + (ulen - maxwlen));
      ulen += UdmUniLen(uni+ulen);

    }
    
    
    if (np == NULL) break;
    p = np;
    if ( ( (p > uni) && ( (!UdmUniNSpace(*(p-1))) || 
                        ( (*(p-1)) < 0x30) ) ) || 
                        (p == uni)  )
    {
      start = udm_max(udm_max(p - padding, uni), prevend);
      end = udm_min(p + maxwlen + padding, uni + ulen);
      while(UdmUniNSpace(*start) && (start < p) && (*start > 0x2F)) start++;
      while(UdmUniNSpace(*end) && (p < end) && (*end > 0x2F)) end--;
      if (start != uni) UdmUniStrCat(oi, dot);
      if (*end == 3) end++;
      ures = *end; *end = 0; UdmUniStrCat(oi, start); *end = ures;
      if ((end != uni + ulen) && (start != prevend)) UdmUniStrCat(oi, dot);
      p = prevend = end;
    }
    if (*p) p++;
  }
  
  
  osl = (UdmUniLen(oi) + 1) * sizeof(char);
  if ((os = (char *)UdmMalloc(osl * 12)) == NULL)
  {
    UDM_FREE(oi); 
    UDM_FREE(c); 
    UDM_FREE(wlen); 
    UDM_FREE(HDoc); 
    UDM_FREE(uni);
    UDM_FREE(Source);
    return NULL;
  }
  
  
  UdmConv(&uni_lc, os, osl * 12, (char*)oi, sizeof(*oi) * osl);
  os[osl - 1]='\0';
  
  {
    char *cc;
    for (cc= os; cc[0]; cc++)
      if (cc[0] == '\n' || cc[0] == '\t' || cc[0] =='\r')
        cc[0]=' ';
  }
  
  UDM_FREE(c);
  UDM_FREE(wlen);
  UDM_FREE(oi);
  UDM_FREE(HDoc);
  UDM_FREE(uni);
  UDM_FREE(Source);
  
  return os;
}
*/


/*
 * Build the excerpt source text from the document's "CachedCopy" section.
 *
 * Steps, in order:
 *   1. base64-decode the section value and inflate it with zlib into a
 *      buffer of UDM_MAXDOCSIZE bytes;
 *   2. walk the result with the HTML tokenizer, keeping text tokens that
 *      are outside script/comment/style and replacing every other token
 *      with a single space;
 *   3. pass the text through UdmHlConvert() together with the query word
 *      list (presumably this inserts the 0x02/0x03 match markers that
 *      UdmExcerptDoc scans for -- confirm against UdmHlConvert);
 *   4. convert from the document charset (section "Charset", default
 *      "iso-8859-1") to "sys-int".
 *
 * Agent is not used by this function.
 *
 * Returns a zero-terminated array of int allocated with UdmMalloc() that
 * the caller must release with UdmFree(), or NULL when the section is
 * missing, a charset is unknown, an allocation or the conversion fails,
 * or the file was built without HAVE_ZLIB.
 */
static int *UdmGetExcerptSourceCachedCopy (UDM_AGENT *Agent, UDM_RESULT *Res, UDM_DOCUMENT *Doc) {
#ifdef HAVE_ZLIB
	int *_;
	size_t l;
	size_t ul;
	size_t room;
	char *Source;
	char *in_buf;
	z_stream zstream;
	UDM_DSTR buf;
	UDM_CHARSET *sys_int = UdmGetCharSet("sys-int");
	UDM_CHARSET *dcs = UdmGetCharSet(UdmVarListFindStrTxt(&Doc->Sections, "Charset", "iso-8859-1"));
	const char *CachedCopy = UdmVarListFindStrTxt(&Doc->Sections, "CachedCopy", NULL);
	UDM_HTMLTOK tag;
	const char *htok, *last;
	UDM_CONV conv;

	if (! CachedCopy) return(NULL);
	if (! sys_int || ! dcs) return(NULL);
	l = strlen(CachedCopy);
	Source = UdmMalloc(UDM_MAXDOCSIZE);
	if (! Source) return(NULL);
	in_buf = UdmMalloc(l);
	if (! in_buf) {
		UdmFree(Source);
		return(NULL);
	}
	/* The stream fields must be set up before inflateInit2() is called. */
	zstream.next_in = (Byte *)in_buf;
	zstream.avail_in = udm_base64_decode((char *)zstream.next_in, CachedCopy, l);
	zstream.next_out = (Byte *)Source;
	/* Leave one byte free for the terminator written after inflating. */
	zstream.avail_out = UDM_MAXDOCSIZE - 1;
	zstream.zalloc = Z_NULL;
	zstream.zfree = Z_NULL;
	zstream.opaque = Z_NULL;

	if (inflateInit2(&zstream, 15) != Z_OK) {
		UdmFree(Source);
		UdmFree(in_buf);
		return(NULL);
	}

	/* The inflate() result is deliberately not inspected: whatever was produced is used. */
	inflate(&zstream, Z_FINISH);
	inflateEnd(&zstream);
	Source[zstream.total_out] = 0;
	UdmFree(in_buf);

	/* Collect the visible text; markup collapses to a single space per token. */
	UdmDSTRInit(&buf, 1024);
	UdmHTMLTOKInit(&tag);
	htok = UdmHTMLToken(Source, &last, &tag);
	do {
		if (tag.type == UDM_HTML_TXT && ! tag.script && ! tag.comment && ! tag.style) {
			UdmDSTRAppend(&buf, htok, last - htok);
		} else {
			UdmDSTRAppend(&buf, " ", 1);
		}
	} while ((htok = UdmHTMLToken(NULL, &last, &tag)));
	UdmFree(Source);

	Source = UdmHlConvert(&Res->WWList, buf.data, dcs, dcs);
	UdmDSTRFree(&buf);
	if (! Source) return(NULL);
	l = strlen(Source);

	/*
	 * The converter may use up to "room" bytes; one extra element is
	 * allocated so the terminator always has a slot of its own.
	 */
	room = sizeof(int) * (l + 1);
	_ = UdmMalloc(room + sizeof(int));
	if (! _) {
		UdmFree(Source);
		return(NULL);
	}

	UdmConvInit(&conv, dcs, sys_int, UDM_RECODE_HTML);
	ul = UdmConv(&conv, (char *)_, room, Source, l);
	UdmFree(Source);
	/*
	 * ul is unsigned, so testing it for "< 0" can never succeed.  If
	 * UdmConv reports failure with a negative value (its prototype is not
	 * visible here), that value wraps to something far larger than the
	 * buffer, which is what this test catches.
	 */
	if (ul > room) {
		UdmFree(_);
		return(NULL);
	}
	_[ul / sizeof(int)] = 0;
	return(_);
#else
	return(NULL);
#endif
}

/*
 * Build the excerpt source text from the document's "body" section.
 *
 * The section value is converted to "sys-int" from the charset named by
 * the "BrowserCharset" variable of Agent->Conf->Vars (default
 * "iso-8859-1").  Res is not used by this function.
 *
 * Returns a zero-terminated array of int allocated with UdmMalloc() that
 * the caller must release with UdmFree(), or NULL when the section is
 * missing, a charset is unknown, or the allocation or conversion fails.
 */
static int *UdmGetExcerptSourceBody (UDM_AGENT *Agent, UDM_RESULT *Res, UDM_DOCUMENT *Doc) {
	int *_;
	size_t l;
	size_t ul;
	size_t room;
	const char *Source = UdmVarListFindStr(&Doc->Sections, "body", NULL);
	UDM_CHARSET *sys_int = UdmGetCharSet("sys-int");
	UDM_CHARSET *cs = UdmGetCharSet(UdmVarListFindStr(&Agent->Conf->Vars, "BrowserCharset", "iso-8859-1"));
	UDM_CONV conv;

	if (! Source) return(NULL);
	if (! sys_int || ! cs) return(NULL);

	l = strlen(Source);
	/*
	 * The converter may use up to "room" bytes; one extra element is
	 * allocated so the terminator always has a slot of its own.
	 */
	room = sizeof(int) * (l + 1);
	_ = UdmMalloc(room + sizeof(int));
	if (! _) return(NULL);

	UdmConvInit(&conv, cs, sys_int, UDM_RECODE_HTML);
	ul = UdmConv(&conv, (char *)_, room, Source, l);
	/*
	 * ul is unsigned, so testing it for "< 0" can never succeed.  If
	 * UdmConv reports failure with a negative value (its prototype is not
	 * visible here), that value wraps to something far larger than the
	 * buffer, which is what this test catches.
	 */
	if (ul > room) {
		UdmFree(_);
		return(NULL);
	}
	_[ul / sizeof(int)] = 0;
	return(_);
}

/*
 * Produce a text excerpt of Doc centred on the marked query matches.
 *
 * The source text is taken from UdmGetExcerptSourceCachedCopy() and, if
 * that yields nothing, from UdmGetExcerptSourceBody().  White space is
 * collapsed to single 0x20 characters, then the text is scanned for the
 * value 2; the next value 3 closes that span (these look like the
 * start/end markers of a highlighted match -- confirm against the code
 * that inserts them).  For each span a fragment reaching up to
 * ExcerptPadding elements to either side is cut out, trimmed at the
 * left to the next space and at the right to the previous space, and
 * appended; "..." is inserted when a fragment does not start where the
 * previous one ended.  Scanning stops as soon as the next fragment would
 * push the excerpt past ExcerptSize elements.  When no fragment was
 * collected at all, the first ExcerptSize elements of the text are used.
 *
 * The result is converted from "sys-int" to the charset named by the
 * "BrowserCharset" variable of Agent->Conf->Vars (default "iso-8859-1").
 *
 * Returns a NUL-terminated string allocated with UdmMalloc() that the
 * caller must release with UdmFree(), or NULL when a charset is unknown,
 * no source text is available, or the allocation or conversion fails.
 */
__C_LINK char * __UDMCALL UdmExcerptDoc (UDM_AGENT *Agent, UDM_RESULT *Res, UDM_DOCUMENT *Doc, size_t ExcerptSize, size_t ExcerptPadding) {
	char *_;
	size_t ul;
	size_t l;
	size_t i, j;
	size_t left, right;
	size_t prev_right = 0;
	int *Source;
	UDM_CONV conv;
	UDM_CHARSET *sys_int = UdmGetCharSet("sys-int");
	UDM_CHARSET *cs = UdmGetCharSet(UdmVarListFindStr(&Agent->Conf->Vars, "BrowserCharset", "iso-8859-1"));
	UDM_DSTR buf;
	int dots[] = { 0x2E, 0x2E, 0x2E };

	if (! sys_int || ! cs) return(NULL);
	Source = UdmGetExcerptSourceCachedCopy(Agent, Res, Doc);
	if (! Source) Source = UdmGetExcerptSourceBody(Agent, Res, Doc);
	if (! Source) return(NULL);

	ul = UdmUniLen(Source);

	/*
	 * Strip whitespaces: compact in place, dropping leading white space
	 * and turning every run of white space into one 0x20.
	 */
	for (i = 0, j = 0; i < ul; i++) {
		if (UdmUniNSpace(Source[i])) Source[j++] = Source[i];
		else if (j && UdmUniNSpace(Source[j - 1])) Source[j++] = 0x20;
	}
	Source[j] = 0;
	ul = j;

	/* Get excerpt */
	UdmDSTRInit(&buf, 1024);
	for (i = 0; i < ul; i++) {
		if (Source[i] == 2) {
			/* j ends on the closing marker, or on ul when there is none. */
			for (j = i + 1; j < ul; j++) if (Source[j] == 3) break;

			/* Left edge: pad backwards, never before the previous fragment, then move forward to a space. */
			left = ExcerptPadding < i ? i - ExcerptPadding : 0;
			if (left < prev_right) left = prev_right;
			if (left) while (left < i && Source[left] != 0x20) left++;

			/* Right edge: pad forwards, clamp to the last element, otherwise move back to a space. */
			right = ExcerptPadding + j;
			if (right >= ul) right = ul - 1;
			else while (right > j && Source[right] != 0x20) right--;

			if (ExcerptSize < buf.size_data / sizeof(int) + right - left + 1) break;
			if (left != prev_right) UdmDSTRAppend(&buf, (char *)dots, sizeof(dots));
			/*
			 * NOTE(review): the range is inclusive on both ends and
			 * prev_right is set to right, so when the next fragment's left
			 * edge is clamped to prev_right that one element is appended a
			 * second time -- confirm whether this is intended.
			 */
			UdmDSTRAppend(&buf, (char *)&Source[left], (right - left + 1) * sizeof(int));
			i = right;
			prev_right = right;
		}
	}
	if (! buf.size_data) {
		/* Nothing was collected: fall back to the beginning of the text. */
		ul = ExcerptSize > ul ? ul : ExcerptSize;
		UdmDSTRAppend(&buf, (char *)Source, ul * sizeof(int));
	}
	UdmFree(Source);

	/* Output budget: 20 bytes per excerpt element. */
	ul = buf.size_data / sizeof(int) * 20;
	/*
	 * One byte beyond the conversion budget is reserved for the
	 * terminator, so it fits even when the converter fills all ul bytes
	 * or when the excerpt is empty (ul == 0).
	 */
	_ = UdmMalloc(ul + 1);
	if (! _) {
		UdmDSTRFree(&buf);
		return(NULL);
	}
	UdmConvInit(&conv, sys_int, cs, UDM_RECODE_HTML);
	l = UdmConv(&conv, _, ul, buf.data, buf.size_data);
	UdmDSTRFree(&buf);
	/*
	 * A length beyond the budget cannot be valid.  This also catches a
	 * negative result from UdmConv (its prototype is not visible here),
	 * which would wrap to a huge unsigned value.
	 */
	if (l > ul) {
		UdmFree(_);
		return(NULL);
	}
	_[l] = 0;
	return(_);
}
