/*
  MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
 
  $Id: tagger.cpp,v 1.33 2004/06/21 09:34:35 taku-ku Exp $;

  Copyright (C) 2001-2004 Taku Kudo <taku-ku@is.aist-nara.ac.jp>
  This is free software with ABSOLUTELY NO WARRANTY.
  
  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.
  
  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.
  
  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/  
#include "viterbi.h"
#include "japanese_tokenizer.h"
#include "common.h"
#include "mutex.h"
#include "param.h"
#include "mecab.h"
#include "stringbuffer.h"
#include "writer.h"
#include "nbest_generator.h"
#include <stdexcept>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

namespace MeCab
{
  // Command-line option table handed to Param::open(), Param::help() and
  // Param::version() in this file.
  // NOTE(review): each entry looks like { long name, short flag, default value,
  // argument placeholder, help text } -- confirm against the Option declaration.
  // Only "nbest" supplies a non-null third field ("1").
  // The all-zero entry at the end terminates the table.
  static const Option long_options[] = 
    {
      { "rcfile",             'r', 0, "FILE",  "use FILE as resource file" },
      { "dicdir",             'd', 0, "DIR",   "set DIR as dicdir"                        },
      { "build-all-lattice",  'a', 0, 0,       "build all lattice in result (default no)" },
      { "output-format-type", 'O', 0, "TYPE",  "set output format type (wakati,none,...)" },
      { "node-format",        'F', 0, "STR",   "use STR as the user-defined node format"  },
      { "unk-format",         'U', 0, "STR",   "use STR as the user-defined unk format"  },       
      { "bos-format",         'B', 0, "STR",   "use STR as the user-defined bos format"   },
      { "eos-format",         'E', 0, "STR",   "use STR as the user-defined eos format"   },
      { "input-buffer-size",  'b', 0, "INT",   "set input buffer size (default 8192)"     },
      { "nbest",              'N', "1", "INT", "output N best results  (default 1)"     },
      { "output",             'o', 0, "FILE",  "set the output file name"                 },
      { "version",            'v', 0, 0,       "show the version and exit."               },
      { "help",               'h', 0, 0,       "show this help and exit."                 },
      { 0, 0, 0, 0 }
    };


  /*
   * Decide which resource file to load.  Candidates are tried in this order
   * and the first hit is returned:
   *   1. the "rcfile" profile value of param, when it is not empty;
   *   2. ".mecabrc" under $HOME, when that file can be opened
   *      (HAVE_GETENV builds only);
   *   3. the value of the MECABRC environment variable
   *      (HAVE_GETENV builds only);
   *   4. the REG_SZ value "mecabrc" under HKEY_CURRENT_USER\software\mecab
   *      (native Windows builds only);
   *   5. MECAB_DEFAULT_RC.
   */
  static std::string getDefaultRc (Param &param)
  {
    std::string rcfile = param.getProfileString ("rcfile");
    if (! rcfile.empty()) return rcfile;

#ifdef HAVE_GETENV
    const char *homedir = getenv ("HOME");
    if (homedir) {
      std::string s = MeCab::createFileName (std::string(homedir), ".mecabrc");
      std::ifstream ifs(s.c_str());
      if (ifs) return s;
    }

    const char *rcenv = getenv ("MECABRC");
    if (rcenv) return std::string(rcenv);
#endif

#if defined  (_WIN32) && ! defined (__CYGWIN__)
    HKEY hKey;
    // Only read access is needed.  The key handle is used only when opening
    // succeeded; previously a failed open left hKey and vt uninitialized.
    if (RegOpenKeyEx (HKEY_CURRENT_USER, "software\\mecab", 0, KEY_READ, &hKey) == ERROR_SUCCESS) {
      char  v[1024];
      DWORD vt   = REG_NONE;
      DWORD size = sizeof (v) - 1;   // keep one byte for the terminator below
      const LONG status = RegQueryValueEx (hKey, "mecabrc", 0, &vt, (BYTE *)v, &size);
      RegCloseKey (hKey);
      if (status == ERROR_SUCCESS && vt == REG_SZ) {
        // Registry strings are not guaranteed to be stored NUL-terminated.
        v[size] = '\0';
        return std::string (v);
      }
    }
#endif

    return std::string (MECAB_DEFAULT_RC);
  }

  /*
   * Work out the path of the dictionary resource file ("dicrc").
   *
   * The "dicdir" profile value is read from param ("." is used when it is
   * empty), every "$(rcpath)" in it is replaced by rcpath after
   * removeFileName() has been applied to rcpath, and the result is stored
   * back into param as "dicdir".  The return value is that directory
   * combined with "dicrc" through createFileName().
   */
  static std::string getDicRc (Param &param, std::string rcpath)
  {
    std::string dictionaryDir = param.getProfileString ("dicdir");
    if (dictionaryDir.empty()) {
      dictionaryDir = ".";   // fall back to the current directory
    }

    removeFileName (rcpath);
    replaceString (dictionaryDir, "$(rcpath)", rcpath);

    param.setProfile ("dicdir", dictionaryDir.c_str(), true);

    dictionaryDir = createFileName (dictionaryDir, "dicrc");
    return dictionaryDir;
  }

// Member-initializer list shared by every Tagger::Impl constructor.
// `begin` is initialized too so that it never holds an indeterminate value
// before parseNBestInit() has been called.  The order follows the member
// declaration order in the class.
#define TAGGER_INITILIZE  tokenizer(0), begin(0), build_all_lattice(0)

  /*
   * Implementation object behind Tagger; every public Tagger method in this
   * file forwards to the method of the same name here.
   *
   * Failures are reported through return values (false / 0), with the
   * message stored in _what and exposed by what().  The two constructors
   * taking arguments throw std::runtime_error when open() fails.
   */
  class Tagger::Impl 
  {
  private:
    Tokenizer      *tokenizer;          // allocated in open (Param &), deleted in close ()
    Viterbi        viterbi;
    Mutex          mutex;               // used by lock () / unlock ()
    StringBuffer   ostrs;               // internal output buffer for the parse ()/next () overloads without `out`
    Writer         writer;
    NBestGenerator nbest; 
    const char*    begin;               // sentence passed to the last parseNBestInit (); not owned
    bool           build_all_lattice;   // value of the "build-all-lattice" option
    std::string    _what;               // last error message

    bool       open         (Param &);
    void       getline_parse (std::istream &, std::ostream &,
                              int, char *, unsigned int);

    // Copying is disabled (declared, never defined): the destructor deletes
    // `tokenizer`, so a copied object would delete the same pointer twice.
    Impl (const Impl &);
    Impl &operator= (const Impl &);

  public:

    bool        open           (int, char**);
    bool        open           (const char*);
    bool        close          ();
    int         parse          (int, char**);
    const char* parse          (const char*, unsigned int = 0);
    const char* parse          (const char*, unsigned int, char*, unsigned int);
    Node*       parseToNode    (const char*, unsigned int = 0);
    const char* parseNBest     (unsigned int, const char*, unsigned int = 0);
    const char* parseNBest     (unsigned int, const char*, unsigned int, char *, unsigned int);

    bool        parseNBestInit (const char*, unsigned int = 0);
    Node*       nextNode  ();
    const char* next();
    const char* next(char*, unsigned int);

    const char* what        ();
    bool        lock        ();
    bool        unlock      (); 

    // Creates an unopened object; open () must be called before parsing.
    Impl (): TAGGER_INITILIZE {}

    // Opens from command-line style arguments; throws std::runtime_error on failure.
    Impl (int argc, char **argv): TAGGER_INITILIZE
    {
      if (! open (argc, argv)) throw std::runtime_error (_what);
    }

    // Opens from a single argument string; throws std::runtime_error on failure.
    Impl (const char *arg): TAGGER_INITILIZE
    {
      if (! open (arg)) throw std::runtime_error (_what);
    }

    ~Impl () { this->close (); }
  };

// Expands to a std::string made of param.what (), COPYRIGHT and a "--help" hint.
// The expansion refers to a variable named `param`, so the macro can only be
// used where such a variable is in scope.
#define TAGGER_ERROR  (std::string (param.what ()) + "\n\n" + COPYRIGHT + "\ntry '--help' for more information.\n")

  // Returns the error message currently stored in _what as a C string.
  // The pointer refers to _what's own storage.
  const char *Tagger::Impl::what () 
  {
    const std::string &message = _what;
    return message.c_str ();
  }

  /*
   * Open the tagger from command-line style arguments.
   * The arguments are parsed against long_options into a local Param; when
   * that fails, _what receives TAGGER_ERROR and false is returned.
   * Otherwise the result of open (Param &) is returned.
   */
  bool Tagger::Impl::open (int argc, char **argv)
  {
    Param param;   // must be called `param`: TAGGER_ERROR refers to it

    const bool parsed = param.open (argc, argv, long_options);
    if (parsed) {
      return open (param);
    }

    _what = TAGGER_ERROR;
    return false;
  }

  /*
   * Open the tagger from a single argument string.
   * The string is parsed against long_options into a local Param; when that
   * fails, _what receives TAGGER_ERROR and false is returned.  Otherwise the
   * result of open (Param &) is returned.
   */
  bool Tagger::Impl::open (const char *arg)
  {
    Param param;   // must be called `param`: TAGGER_ERROR refers to it

    const bool parsed = param.open (arg, long_options);
    if (parsed) {
      return open (param);
    }

    _what = TAGGER_ERROR;
    return false;
  }
   
  /*
   * Core initialization used by the other open () overloads and by
   * parse (int, char **).
   *
   * Steps: close any previous state, load the resource file chosen by
   * getDefaultRc () and then the dictionary resource file chosen by
   * getDicRc () into param, read the "build-all-lattice" option, create the
   * tokenizer, and open the Viterbi analyzer and the writer.
   *
   * When the "help" or "version" option is set, the corresponding text is
   * raised as an error, so the call returns false with that text in what ().
   *
   * Returns true on success.  On any failure the object is closed again,
   * _what is set to "Tagger::Impl::open(): " followed by the reason, and
   * false is returned; no exception derived from std::exception escapes.
   */
  bool Tagger::Impl::open (Param &param)
  {
    try {

      close ();

      if (param.getProfileInt ("help")) 
        throw std::runtime_error (param.help (long_options));

      if (param.getProfileInt ("version"))
        throw std::runtime_error (param.version (long_options));

      std::string rcfile = getDefaultRc (param);
      if (! param.load (rcfile.c_str())) throw std::runtime_error (TAGGER_ERROR);

      std::string dicrcfile = getDicRc (param, rcfile);
      if (! param.load (dicrcfile.c_str())) throw std::runtime_error (TAGGER_ERROR);

      build_all_lattice = (param.getProfileInt ("build-all-lattice") != 0);

      tokenizer = new JapaneseTokenizer (param);

      if (! viterbi.open (param, tokenizer)) throw std::runtime_error (viterbi.what());
      if (! writer.open (param))             throw std::runtime_error (writer.what());
      
      return true;
    }

    // Fully qualified and caught by const reference, matching the handler in
    // parse (int, char **); everything thrown above is a std::runtime_error.
    catch (const std::exception &e) {
      close ();
      _what = std::string ("Tagger::Impl::open(): ") + e.what ();
      return false;
    }
  }
   
  /*
   * Read `is` line by line, analyze each line and write the result to `os`.
   *
   *   is       input stream
   *   os       output stream; flushed after every result
   *   n        when >= 2, parseNBest (n, ...) is used, otherwise parse ()
   *   ibuf     caller-supplied line buffer
   *   ibufsize size of ibuf; a longer line is processed in several pieces
   *            and a warning is printed to std::cerr
   *
   * A final line that is not terminated by a newline is processed as well;
   * testing only eof () after getline () used to drop it silently.
   *
   * Throws std::runtime_error when analysis fails (message taken from _what)
   * or when the input stream reports an unrecoverable error.
   */
  void Tagger::Impl::getline_parse (std::istream &is,
                                    std::ostream &os,
                                    int          n,
                                    char         *ibuf, 
                                    unsigned int ibufsize)

  {
    for (;;) {
      is.getline (ibuf, ibufsize);

      // badbit cannot be cleared meaningfully; retrying would loop forever.
      if (is.bad ())
        throw std::runtime_error ("Tagger::Impl::getline_parse (): input stream read error");

      // End of input with nothing read: we are done.  End of input *with*
      // characters read is an unterminated last line and is still processed.
      const bool at_eof = is.eof ();
      if (at_eof && is.gcount () == 0) break;

      if (is.fail ()) {
        // The buffer filled up before a newline was found.
        std::cerr << "Warning: input-buffer overflow. The line is split. use -b #SIZE option.\n";
        is.clear ();
      }

      const char *r = (n >= 2) ? parseNBest (n, ibuf) : parse (ibuf);
      if (! r) throw std::runtime_error (_what);
      os << r << std::flush;

      if (at_eof) break;
    }
  }

  /*
   * Command-line driver: parse the options, open the tagger, then analyze
   * every file named in the remaining arguments (or std::cin when there are
   * none) and write the results to the "output" file, or to std::cout when
   * no output file is given.
   *
   * "help" and "version" print their text to std::cout and return
   * EXIT_SUCCESS without analyzing anything.
   *
   * Returns EXIT_SUCCESS on success.  Any std::exception is reported on
   * std::cerr with a "FATAL: " prefix and turned into EXIT_FAILURE.
   *
   * The line buffer and the output file are held by automatic objects so
   * they are released on every exit path, including the exceptional ones.
   */
  int Tagger::Impl::parse (int argc, char **argv)
  {
    try {

      Param param;

      if (! param.open (argc, argv, long_options))
        throw std::runtime_error (TAGGER_ERROR);

      if (param.getProfileInt ("help")) {
        std::cout << param.help (long_options);
        return EXIT_SUCCESS;
      }

      if (param.getProfileInt ("version")) {
        std::cout << param.version (long_options);
        return EXIT_SUCCESS;
      }

      // Named so that it does not hide the NBestGenerator member `nbest`.
      const int nbest_size = param.getProfileInt ("nbest");
      if (nbest_size <= 0 || nbest_size > NBEST_MAX)
        throw std::runtime_error ("invalid N value");

      // parseNBestInit () refuses to run unless the whole lattice is built.
      if (nbest_size >= 2) param.setProfile ("build-all-lattice", "1", true);

      if (! open (param)) throw std::runtime_error (_what);

      std::ofstream ofile;              // closed automatically when leaving the try block
      std::ostream *ofs = &std::cout;
      const std::string outputFileName = param.getProfileString ("output");

      if (! outputFileName.empty()) {
        ofile.open (outputFileName.c_str());
        if (! ofile) throw std::runtime_error (outputFileName + ": no such file or directory");
        ofs = &ofile;
      }
     
      const std::vector <std::string>& rest = param.getRestArg (); 

      unsigned int ibufsize = _min (MAX_INPUT_BUFFER_SIZE,
                                    _max (param.getProfileInt ("input-buffer-size"), MIN_INPUT_BUFFER_SIZE));

      // NOTE(review): assumes MIN_INPUT_BUFFER_SIZE > 0 so the buffer is never empty.
      std::vector <char> ibuf (ibufsize);

      if (rest.size()) {
        for (unsigned int i = 0; i < rest.size(); i++) {
          std::ifstream ifs (rest[i].c_str ());
          if (!ifs) throw std::runtime_error (rest[i] + ": no such file or directory");
          getline_parse (ifs, *ofs, nbest_size, &ibuf[0], ibufsize);
        }
      } else {
        getline_parse (std::cin, *ofs, nbest_size, &ibuf[0], ibufsize);
      }

      return EXIT_SUCCESS;
    }

    catch (std::exception &e) {
      std::cerr << "FATAL: " << e.what () << std::endl;
      return EXIT_FAILURE;
    }
  }

  // Releases the tokenizer (if any) and resets the lattice flag.
  // Always returns true; calling it on an unopened object is harmless
  // because deleting a null pointer does nothing.
  bool Tagger::Impl::close ()
  {
    delete tokenizer;
    tokenizer = 0;

    build_all_lattice = false;

    return true;
  }

  // Forwards to mutex.lock () and returns its result.
  bool Tagger::Impl::lock ()
  {
    const bool result = mutex.lock ();
    return result;
  }
   
  // Forwards to mutex.unlock () and returns its result.
  bool Tagger::Impl::unlock ()
  {
    const bool result = mutex.unlock ();
    return result;
  }

  const char *Tagger::Impl::parse (const char *str, unsigned int len) 
  {
    Node *n = parseToNode (str, len);
    if (!n) return 0;
    ostrs.clear ();
    writer.write (ostrs, str, n);
    ostrs << '\0';
    return ostrs.str ();
  }

  /*
   * Analyze str and write the formatted result into the caller's buffer.
   *
   *   str, len   input sentence and length, handed to parseToNode ()
   *   out, len2  caller-supplied output buffer and its size
   *
   * Returns the text written through the StringBuffer wrapped around `out`,
   * or 0 when analysis fails or the result does not fit (what () then
   * reports "output buffer overflow").
   *
   * The StringBuffer is sized with len2 (the output capacity), not len (the
   * input length), and the value returned comes from that buffer rather than
   * from the internal ostrs; this matches the five-argument parseNBest ().
   */
  const char *Tagger::Impl::parse (const char *str, unsigned int len, char *out, unsigned int len2)
  {
    Node *n = parseToNode (str, len);
    if (!n) return 0;     
    StringBuffer os (out, len2);
    writer.write (os, str, n);
    os << '\0';

    // NOTE(review): os.str () is treated as null after an overflow, as the
    // other fixed-buffer methods in this file do -- confirm in StringBuffer.
    if (! os.str ()) {
      _what = "Tagger::Impl::parse (): output buffer overflow" ;
      return 0;
    }

    return os.str ();
  }

  /*
   * Run the Viterbi analysis on str and return the node it yields.
   * When len is 0 the length is taken from strlen (str).
   * Returns 0 with _what set when str is null or when the analysis fails.
   */
  Node *Tagger::Impl::parseToNode (const char *str, unsigned int len) 
  {
    if (str == 0) {
      _what = "Tagger::Impl::parseToNode (): NULL pointer is given";
      return 0;
    }

    Node *head = viterbi.analyze (str, len ? len : strlen (str));
    if (head) {
      return head;
    }

    _what = std::string("Tagger::Impl::parseToNode (): ") + viterbi.what ();
    return 0;
  }

  bool Tagger::Impl::parseNBestInit (const char *str, unsigned int len)   
  {
    if (! build_all_lattice) {
      _what = "Tagger::Impl::parseNBestInit (): use -a option to obtain N-Best results";
      return 0;
    }
    Node *n = parseToNode (str, len);
    begin = str;
    if (! n) return false;
    nbest.set (n);
    return true;
  }

  // Returns the next result from the N-best generator, or 0 (with _what set
  // to "no more results") when the generator yields nothing.
  Node* Tagger::Impl::nextNode ()
  {
    Node *candidate = nbest.next ();
    if (candidate) {
      return candidate;
    }

    _what = "Tagger::Impl::nextNode (): no more results";
    return 0;
  }

  const char* Tagger::Impl::next ()
  {
    Node *n = nextNode ();
    if (! n) return 0;
    ostrs.clear ();
    writer.write (ostrs, (const char *)begin, n);
    ostrs << '\0';
    return ostrs.str ();
  }

  /*
   * Format the next N-best result into the caller's buffer.
   *
   *   out, len2  caller-supplied output buffer and its size
   *
   * Returns the text written through the StringBuffer wrapped around `out`,
   * or 0 when there are no more results or the result does not fit
   * (what () then reports "output buffer overflow").
   *
   * The result is written to and returned from the caller's buffer.  It
   * used to go into the internal ostrs, which left `out` without the result
   * and made the overflow check meaningless.
   */
  const char* Tagger::Impl::next (char *out, unsigned int len2)
  {
    Node *n = nextNode ();
    if (! n) return 0;
    StringBuffer os (out, len2);
    writer.write (os, (const char *)begin, n);
    os << '\0';

    // NOTE(review): os.str () is treated as null after an overflow, as the
    // other fixed-buffer methods in this file do -- confirm in StringBuffer.
    if (! os.str ()) {
      _what = "Tagger::Impl::next (): output buffer overflow" ;
      return 0;
    }
    return os.str ();
  }


  const char* Tagger::Impl::parseNBest (unsigned int N, const char* str, unsigned int len)
  {
    if (N == 1) return parse (str, len);

    if (! parseNBestInit (str, len)) return 0;
    ostrs.clear ();

    for (unsigned int i = 0; i < N; ++i) {
      Node *n = nextNode ();
      if (! n) break;
      writer.write (ostrs, str, n);
    }

    ostrs << '\0';
    return ostrs.str ();
  }
   
  const char* Tagger::Impl::parseNBest (unsigned int N, const char* str, unsigned int len, 
				  char *out, unsigned int len2)
  {
    if (N == 1) return parse (str, len, out, len2);

    if (! parseNBestInit (str, len)) return 0;
    StringBuffer os (out, len2);

    for (unsigned int i = 0; i < N; ++i) {
      Node *n = nextNode ();
      if (! n) break;
      writer.write (os, str, n);
    }
    os << '\0';
     
    if (! os.str ()) {
      _what = "Tagger::Impl::parseNBest (): output buffer overflow" ;
      return 0;
    }
     
    return os.str ();
  }   

  // Construction and destruction of the public facade.  Each constructor
  // allocates the implementation object; the destructor deletes it.

  Tagger::Tagger ()
    : _impl (new Impl ())
  {
  }

  Tagger::Tagger (int argc, char** argv)
    : _impl (new Impl (argc, argv))
  {
  }

  Tagger::Tagger (const char* argv)
    : _impl (new Impl (argv))
  {
  }

  Tagger::~Tagger ()
  {
    delete _impl;
    _impl = 0;
  }

  // Public Tagger API: every method forwards to the same-named method of the
  // implementation object and returns its result unchanged.

  bool Tagger::open (int argc, char** argv)
  {
    return _impl->open (argc, argv);
  }

  bool Tagger::open (const char* arg)
  {
    return _impl->open (arg);
  }

  bool Tagger::close ()
  {
    return _impl->close ();
  }

  int Tagger::parse (int argc, char** argv)
  {
    return _impl->parse (argc, argv);
  }

  const char* Tagger::parse (const char* str, unsigned int len)
  {
    return _impl->parse (str, len);
  }

  const char* Tagger::parse (const char* str, unsigned int len, char* out, unsigned int len2)
  {
    return _impl->parse (str, len, out, len2);
  }

  Node* Tagger::parseToNode (const char* str, unsigned int len)
  {
    return _impl->parseToNode (str, len);
  }

  const char* Tagger::parseNBest (unsigned int n, const char* str, unsigned int len)
  {
    return _impl->parseNBest (n, str, len);
  }

  const char* Tagger::parseNBest (unsigned int n, const char* str, unsigned int len, char *out, unsigned int len2)
  {
    return _impl->parseNBest (n, str, len, out, len2);
  }

  bool Tagger::parseNBestInit (const char* str, unsigned int len)
  {
    return _impl->parseNBestInit (str, len);
  }

  Node* Tagger::nextNode ()
  {
    return _impl->nextNode ();
  }

  const char* Tagger::next ()
  {
    return _impl->next ();
  }

  const char* Tagger::next (char* str, unsigned int len)
  {
    return _impl->next (str, len);
  }

  const char* Tagger::what ()
  {
    return _impl->what ();
  }

  bool Tagger::lock ()
  {
    return _impl->lock ();
  }

  bool Tagger::unlock ()
  {
    return _impl->unlock ();
  }
}
