/*************************************************************************
 *
 *  $RCSfile: Indexer.java,v $
 *
 *  $Revision: 1.2 $
 *
 *  last change: $Author: rt $ $Date: 2005/01/27 10:08:54 $
 *
 *  The Contents of this file are made available subject to the terms of
 *  either of the following licenses
 *
 *         - GNU Lesser General Public License Version 2.1
 *         - Sun Industry Standards Source License Version 1.1
 *
 *  Sun Microsystems Inc., October, 2000
 *
 *  GNU Lesser General Public License Version 2.1
 *  =============================================
 *  Copyright 2000 by Sun Microsystems, Inc.
 *  901 San Antonio Road, Palo Alto, CA 94303, USA
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License version 2.1, as published by the Free Software Foundation.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 *  MA  02111-1307  USA
 *
 *
 *  Sun Industry Standards Source License Version 1.1
 *  =================================================
 *  The contents of this file are subject to the Sun Industry Standards
 *  Source License Version 1.1 (the "License"); You may not use this file
 *  except in compliance with the License. You may obtain a copy of the
 *  License at http://www.openoffice.org/license.html.
 *
 *  Software provided under this License is provided on an "AS IS" basis,
 *  WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING,
 *  WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
 *  MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
 *  See the License for the specific provisions governing your rights and
 *  obligations concerning the Software.
 *
 *  The Initial Developer of the Original Code is: Sun Microsystems, Inc.
 *
 *  Copyright: 2000 by Sun Microsystems, Inc.
 *
 *  All Rights Reserved.
 *
 *  Contributor(s): _______________________________________
 *
 *
 ************************************************************************/
package com.sun.xmlsearch.indexer;

import sunw.html.*;
import java.net.URL;
import java.io.*;
import java.text.*;
import java.util.*;
import com.sun.xmlsearch.util.*;

/**
 * Command-line driver that parses documents with a {@code sunw.html.Parser}
 * and hands every indexable word, together with its position, to an
 * {@code IndexBuilder}.
 *
 * <p>Documents come either from a "file list" (one local path per line) or
 * from a "crawl list" (alternating lines: the URL, then the local file holding
 * its content). Words found in the stoplist are never stored.
 */
public class Indexer {
  private sunw.html.Parser _parser;
  private DTD dtd = null;
  private IndexBuilder _indexBuilder = null;
  /** Stop words, stored as key and value; only key presence is checked. */
  private Hashtable _stoplist = new Hashtable();
  private String dbName = null;
  private PrintStream verbose=null;
  private String filelist = null;
  private String crawllist = null;
  private String _directory = null;

  /**
   * Loads {@code resources/properties} into the system properties, creates
   * the parser whose text callback feeds {@link #indexText}, and reads the
   * stoplist from {@code resources/NRStop}.
   *
   * <p>A missing or unreadable properties file is reported on
   * {@code System.err} but no longer aborts construction: the parser and the
   * stoplist are still set up, so the instance stays usable instead of
   * failing later on a null parser.
   */
  public Indexer() {
    Properties props = new Properties(System.getProperties());
    FileInputStream propsIn = null;
    try {
      propsIn = new FileInputStream("resources/properties");
      props.load(propsIn);
      System.setProperties(props);
    } catch (IOException e) {
      System.err.println("No system properties file");
      e.printStackTrace(System.err);
    } finally {
      // Properties.load leaves the stream open; release it here.
      closeStream(propsIn);
    }
    // !!! set doc: in properties

    _parser = new sunw.html.Parser() {
      protected void handleText(char text[]) {
        // NOTE(review): 'begin' is not declared in this file; presumably a
        // position field inherited from sunw.html.Parser -- confirm.
        indexText(new String(text), begin);
      }
    };

    readStoplist("resources/NRStop");
  }

  /**
   * Parses the command line and runs the requested indexing job.
   *
   * <p>Recognised options: {@code -db}, {@code -charset}, {@code -filelist},
   * {@code -crawllist}, {@code -dir}, {@code -dtd} and {@code -c}. Parsing
   * stops at the first argument that does not start with {@code -}; any other
   * argument starting with {@code -} (including a value-taking option given
   * as the last argument, or a {@code -dtd} seen after the DTD is already
   * set) is reported as invalid and terminates the JVM with status 1.
   *
   * <p>The values of {@code -charset} and {@code -c} are consumed but not
   * used by any code in this class.
   *
   * <p>{@code -dir} prefixes the file list name only when {@code -filelist}
   * appeared earlier on the command line; the order dependence is kept from
   * the original behaviour.
   *
   * @param args the command-line arguments
   * @throws Exception propagated from {@code DTD.getDTD}
   */
  private void process(String[] args) throws Exception {
    for (int i = 0; i < args.length; i++) {
      String arg = args[i];
      boolean hasValue = (i + 1) < args.length;

      if (arg.equals("-db")) {
        if (hasValue) {
          dbName = args[++i];
          // Make sure the last character is a file separator
          if (dbName.lastIndexOf(File.separatorChar)
              != dbName.length() - 1) {
            dbName = dbName.concat(File.separator);
          }
        } else {
          System.out.println(arg + " requires argument");
        }
      }
      else if (arg.equals("-charset") && hasValue) {
        ++i;  // value accepted for compatibility; not used in this class
      }
      else if (arg.equals("-filelist") && hasValue) {
        filelist = args[++i];
      }
      else if (arg.equals("-crawllist") && hasValue) {
        crawllist = args[++i];
      }
      else if (arg.equals("-dir") && hasValue) {
        _directory = args[++i];
        // Without a file list there is nothing to prefix; concatenating
        // would otherwise produce the bogus path "<dir>/null".
        if (filelist != null) {
          filelist = _directory + File.separator + filelist;
        }
      }
      else if (arg.equals("-dtd") && hasValue && (dtd == null)) {
        dtd = DTD.getDTD(args[++i]);
      }
      else if (arg.equals("-c")) {
        if (hasValue) {
          ++i;  // configuration file name accepted; not used in this class
        } else {
          System.out.println(arg + " requires argument");
        }
      }
      else if (arg.startsWith("-")) {
        System.out.println("invalid argument: " + arg);
        System.exit(1);
      }
      else {
        break;
      }
    }
    if (dtd == null) {
      dtd = DTD.getDTD("html32");
    }

    // A file list takes precedence over a crawl list.
    if (filelist != null) {
      indexFileList(filelist);
      return;
    }

    if (crawllist != null) {
      indexCrawlList(crawllist);
      return;
    }
  }

  /**
   * Indexes every existing file named in a file list, one path per line.
   * Paths are resolved against the {@code -dir} directory when one was given.
   * Each line is echoed to {@code System.out} with a running counter; missing
   * files are reported on {@code System.err}. Any exception ends the run and
   * is printed.
   *
   * @param fileListName path of the list file
   */
  private void indexFileList(String fileListName) {
    FileInputStream in = null;
    try {
      int counter = 0;
      in = new FileInputStream(fileListName);
      LineInput lines = new LineInput(in);
      String line;
      _indexBuilder = new IndexBuilder(dbName);
      while ((line = lines.readLine()) != null) {
        ++counter;
        System.out.println(counter + "\t" + line);
        String path = line.trim();
        File file = _directory != null ?
          new File(_directory, path) : new File(path);

        if (file.exists())
          indexDocument(file.toURL().toString(), "file");
        else
          System.err.println("file doesn't exist: " + file);
      }
      _indexBuilder.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
    finally {
      closeStream(in);
    }
  }

  /**
   * Indexes documents described by a crawl list, which alternates between a
   * URL line and the line naming the local file that holds its content.
   * Entries whose file no longer exists are skipped silently. A trailing URL
   * without a file line is reported on {@code System.err} and ends the list.
   *
   * @param fileListName path of the crawl list file
   */
  private void indexCrawlList(String fileListName) {
    FileInputStream in = null;
    try {
      int counter = 0;
      in = new FileInputStream(fileListName);
      LineInput lines = new LineInput(in);
      String urlString, fileName;
      _indexBuilder = new IndexBuilder(dbName);
      while ((urlString = lines.readLine()) != null) {
        fileName = lines.readLine();
        if (fileName == null) {
          // Odd number of lines: new File(null) would throw.
          System.err.println("crawl list entry without file name: "
                             + urlString);
          break;
        }
        File file = new File(fileName);
        if (file.exists()) {	// could have been deleted
          ++counter;
          System.out.println(counter + "\t" + urlString + ' ' + fileName);
          indexDocument(file, urlString, "url");
        }
      }
      _indexBuilder.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
    finally {
      closeStream(in);
    }
  }

  /**
   * Opens the given URL, parses its content and records it in the index
   * builder under that URL.
   *
   * @param urlString URL to read and to register the document under
   * @param title value passed to {@code IndexBuilder.closeDocument}
   * @throws Exception if the URL is malformed or cannot be opened, or if
   *         parsing or the index builder fails
   */
  private void indexDocument(String urlString, String title) throws Exception {
    URL url = new URL(urlString);
    Reader in = new InputStreamReader(url.openStream());
    try {
      _indexBuilder.openDocument(urlString);
      _parser.parse(in, dtd);
      _indexBuilder.closeDocument(title);
    } finally {
      // Close even when parsing fails so the stream is not leaked.
      in.close();
    }
  }

  /**
   * Parses a local file and records it in the index builder under the given
   * URL.
   *
   * @param file local file holding the document content
   * @param urlString name to register the document under
   * @param title value passed to {@code IndexBuilder.closeDocument}
   * @throws Exception if the file cannot be opened, or if parsing or the
   *         index builder fails
   */
  private void indexDocument(File file, String urlString, String title)
    throws Exception {
    Reader in = new FileReader(file);
    try {
      _indexBuilder.openDocument(urlString);
      _parser.parse(in, dtd);
      _indexBuilder.closeDocument(title);
    } finally {
      // Close even when parsing fails so the file handle is not leaked.
      in.close();
    }
  }

  /**
   * Reads stop words, one per line (trimmed), into the stoplist. I/O errors
   * are printed and leave the stoplist with whatever was read so far.
   *
   * @param fromFile path of the stoplist file
   */
  private void readStoplist(String fromFile) {
    LineNumberReader in = null;
    try {
      String line;
      in = new LineNumberReader(new FileReader(fromFile));
      while ((line = in.readLine()) != null)
        stoplistAdd(line.trim());
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      closeReader(in);
    }
  }

  /**
   * Recursively collects the paths of all files ending in {@code .html} or
   * {@code .htm} found at or below {@code file}.
   *
   * @param file path of a file or directory
   * @param files vector the matching paths are appended to
   * @return the same {@code files} vector
   */
  public Vector loadFiles(String file, Vector files) {
    File tstfile = new File (file);
    if (tstfile.isDirectory()) {
      String list[] = tstfile.list();
      // File.list() yields null when the directory cannot be read.
      if (list != null) {
        for (int i=0; i < list.length; i++) {
          files = loadFiles (tstfile.getPath() +
                             File.separator +
                             list[i], files);
        }
      }
    } else {
      if (file.endsWith(".html") || file.endsWith(".htm")) {
        files.addElement(file);
      }
    }
    return files;
  }

  /**
   * Splits {@code source} at word boundaries and stores each resulting token,
   * trimmed and lower-cased, at position {@code begin} plus its offset within
   * {@code source}. Tokens longer than one character are always passed on;
   * single characters only when they are a letter or number (see
   * {@link #isIndexableChar}); empty tokens are dropped. Exceptions are
   * printed and end processing of this text.
   *
   * @param source text to tokenize
   * @param begin offset added to every token's start index
   */
  public void indexText(String source, int begin) {
    try {
      BreakIterator boundary = BreakIterator.getWordInstance();
      boundary.setText(source);
      int start = boundary.first();
      for (int end = boundary.next();
           end != BreakIterator.DONE;
           start = end, end = boundary.next()) {
        String word = source.substring(start, end).trim().toLowerCase();
        if (word.length() > 1
            || (word.length() == 1 && isIndexableChar(word.charAt(0)))) {
          storeToken(word, begin + start);
        }
      }
    }
    catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Tells whether a one-character token is worth indexing: its Unicode
   * general category must be a letter or number category, or UNASSIGNED.
   * This filters out lone punctuation and whitespace produced by the word
   * break iterator.
   */
  private static boolean isIndexableChar(char c) {
    switch (Character.getType(c)) {
      case Character.DECIMAL_DIGIT_NUMBER:
      case Character.LETTER_NUMBER:
      case Character.LOWERCASE_LETTER:
      case Character.OTHER_LETTER:
      case Character.OTHER_NUMBER:
      case Character.TITLECASE_LETTER:
      case Character.UNASSIGNED:
      case Character.UPPERCASE_LETTER:
        return true;
      default:
        return false;
    }
  }

  /** Adds one entry to the stoplist. */
  private void stoplistAdd(String string) {
    _stoplist.put(string, string);
  }

  /**
   * Stores a token location in the index builder unless the token is in the
   * stoplist, its UTF-8 encoding is longer than 250 bytes (reported on
   * {@code System.out}), or it contains any non-ASCII character (dropped
   * silently).
   *
   * @param token the word to store
   * @param begin position recorded for the word
   * @throws Exception propagated from the index builder or the encoding
   *         lookup
   */
  public void storeToken(String token, int begin) throws Exception {
    // !!! hack:  cleaning tokens before Unicode solution
    if (_stoplist.get(token) == null) {
      byte[] bytes = token.getBytes("UTF8");
      if (bytes.length > 250) {
        System.out.println("token dismissed: " + token);
        return;
      }
      for (int i = 0; i < bytes.length; i++) {
        if ((bytes[i] & 0x80) != 0) {  // high bit set: not plain ASCII
          return;
        }
      }
      _indexBuilder.storeLocation(token, begin);
    }
  }

  /** Closes a stream if it was opened, reporting a failure on System.err. */
  private static void closeStream(InputStream stream) {
    if (stream != null) {
      try {
        stream.close();
      } catch (IOException e) { System.err.println(e); }
    }
  }

  /** Closes a reader if it was opened, reporting a failure on System.err. */
  private static void closeReader(Reader reader) {
    if (reader != null) {
      try {
        reader.close();
      } catch (IOException e) { System.err.println(e); }
    }
  }

  /**
   * Entry point: builds an indexer and processes the command line. Any
   * exception escaping {@code process} is printed.
   */
  public static void main(String[] args) {
    Indexer indexer = new Indexer();
    try {
      indexer.process(args);
    }
    catch (Exception e) {
      e.printStackTrace();
    }
  }
}
