///###////////////////////////////////////////////////////////////////////////
//
// Burton Computer Corporation
// http://www.burton-computer.com
// $Id: MessageFactory.cc,v 1.44 2004/01/14 17:38:40 bburton Exp $
//
// Copyright (C) 2000 Burton Computer Corporation
// ALL RIGHTS RESERVED
//
// This program is open source software; you can redistribute it
// and/or modify it under the terms of the Q Public License (QPL)
// version 1.0. Use of this software in whole or in part, including
// linking it (modified or unmodified) into other programs is
// subject to the terms of the QPL.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// Q Public License for more details.
//
// You should have received a copy of the Q Public License
// along with this program; see the file LICENSE.txt.  If not, visit
// the Burton Computer Corporation or CoolDevTools web site
// QPL pages at:
//
//    http://www.burton-computer.com/qpl.html
//

#include <set>
#include <strstream>
#include "FrequencyDB.h"
#include "Tokenizer.h"
#include "MessageFactory.h"
#include "RegularExpression.h"
#include "PhraseBuilder.h"
#include "ProximityPhraseBuilder.h"

// Lower-case header name prefixes that is_skipped_header() tests for with
// starts_with().  The table ends with a null pointer so it can be walked
// without a separate length.
static const char *SKIPPED_HEADERS[] = {
  "x-spam",
  "x-razor",
  "status",
  "x-status",
  "x-imap",
  0
};

// Suffix appended to a header name by addHeaderToMessage() when its
// tag_extra_headers argument is set.
const string MessageFactory::EXTRA_HEADER_SUFFIX(":x");
// Leading and trailing pieces of the prefix that getHeaderPrefix() builds
// for headers without an entry in m_prefixedHeaders.
const string MessageFactory::PREFIX_0("H");
const string MessageFactory::PREFIX_1("_");
// Prefix passed along with URL text in processUrls().
static const string URL_PREFIX("U_");
// Prefix passed along with tag text in processTag().
static const string TAG_PREFIX("T_");
// Prefix passed along with the charset name in addHeadersToMessage().
static const string CHARSET_PREFIX("CS_");
// Substituted for each non-alphanumeric header name character in
// getHeaderPrefix().
static const char PREFIX_NON_ALNUM = '-';
// getHeaderPrefix() stops copying header name characters once the prefix
// has reached this many characters.
static const int MAX_PREFIX_LEN = 128;
// Replacement text for each URL attribute matched in processUrls().
static const string SINGLE_SPACE(" ");
// Token added by addIPAddressTerm() when a term matches its pattern.
static const string IP_ADDRESS_TOKEN("IP_ADDRESS");

// Constructs a factory with its default settings.  The phrase builder starts
// out as a plain PhraseBuilder constructed with the argument 2.
MessageFactory::MessageFactory()
  : m_minWordLength(1),                 // shortest word addWordToMessage() accepts
    m_maxWordLength(90),                // longest word addWordToMessage() accepts
    m_replaceNonAsciiChars(true),       // addStringToMessage() swaps high-bit bytes...
    m_nonAsciiChar('z'),                // ...for this character
    m_removeHTML(true),                 // addTextToMessage() strips markup instead of only expanding entities
    m_keepSuspiciousTags(false),        // processTag() does not emit tag text
    m_ignoreBody(false),                // addTokenToMessage() keeps tokens that have an empty prefix
    m_headersToInclude(NORMAL_HEADERS), // addHeadersToMessage() uses its fixed header list
    m_phraser(new PhraseBuilder(2))
{
}

// Destructor; the body is empty, so any cleanup is left to the members' own
// destructors.
MessageFactory::~MessageFactory()
{
}

// Forwards length to setMaxChars() on the current phrase builder.
void MessageFactory::setMaxPhraseChars(int length)
{
  m_phraser->setMaxChars(length);
}

// Forwards length to setMaxWords() on the current phrase builder.
void MessageFactory::setMaxPhraseLength(int length)
{
  m_phraser->setMaxWords(length);
}

// Forwards length to setMinWords() on the current phrase builder.
void MessageFactory::setMinPhraseLength(int length)
{
  m_phraser->setMinWords(length);
}

// Returns true when line begins with any entry of the null-terminated
// SKIPPED_HEADERS table, false otherwise.
static bool is_skipped_header(const string &line)
{
  const char **entry = SKIPPED_HEADERS;
  while (*entry) {
    if (starts_with(line, *entry)) {
      return true;
    }
    ++entry;
  }
  return false;
}

// Replaces the current phrase builder with a ProximityPhraseBuilder that is
// constructed from the current builder's getMaxWords() value.
// NOTE(review): only getMaxWords() is carried over here; whether limits set
// earlier through setMinPhraseLength()/setMaxPhraseChars() need to be
// re-applied to the new builder is not visible from this file -- confirm.
void MessageFactory::useProximityPhraser()
{
  m_phraser.set(new ProximityPhraseBuilder(m_phraser->getMaxWords()));
}

// Adds header tokens to msg according to m_headersToInclude:
//   NORMAL_HEADERS - a fixed list of headers, then "received" (with extra
//                    tagging requested and decoding disabled), then the
//                    charset reported by the reader, if any;
//   NO_HEADERS     - nothing;
//   ALL_HEADERS /
//   NO_X_HEADERS   - every distinct, non-empty field name the reader
//                    reports, except names matched by is_skipped_header()
//                    and, for NO_X_HEADERS, names starting with "x-".
// In every mode the headers listed in m_additionalHeaders are added last.
void MessageFactory::addHeadersToMessage(Message &msg,
                                         MimeMessageReader &reader)
{
  // Headers added with default options in NORMAL_HEADERS mode, in order.
  static const char *const normal_headers[] = {
    "reply-to",
    "sender",
    "originator",
    "subject",
    "from",
    "to",
    "cc",
    "message-id",
    0
  };

  if (m_headersToInclude == NORMAL_HEADERS) {
    for (const char *const *header = normal_headers; *header; ++header) {
      addHeaderToMessage(*header, msg, reader);
    }
    addHeaderToMessage("received", msg, reader, true, false);

    string charset;
    if (reader.getCharSet(charset)) {
      addStringToMessage(charset, CHARSET_PREFIX, msg);
    }
  } else if (m_headersToInclude != NO_HEADERS) {
    assert(m_headersToInclude == ALL_HEADERS || m_headersToInclude == NO_X_HEADERS);
    set<string> seen;
    string field_name;
    const int field_count = reader.getFieldCount();
    for (int index = 0; index < field_count; ++index) {
      reader.getFieldName(index, field_name);
      if (field_name.size() == 0 || seen.find(field_name) != seen.end()) {
        continue;
      }
      if (is_skipped_header(field_name)) {
        continue;
      }
      if (m_headersToInclude == NO_X_HEADERS && starts_with(field_name, "x-")) {
        continue;
      }
      // Only names that were actually added are remembered as seen.
      addHeaderToMessage(field_name, msg, reader);
      seen.insert(field_name);
    }
  }

  vector<string>::const_iterator extra = m_additionalHeaders.begin();
  while (extra != m_additionalHeaders.end()) {
    addHeaderToMessage(*extra, msg, reader);
    ++extra;
  }
}

// Sets the digest of msg.  When m_spamprobeFieldName is non-empty, the reader
// has that field, the field matches digest_expr and the first captured group
// is exactly 32 characters long, that captured value is used.  Otherwise the
// reader's MD5 digest is used.
void MessageFactory::assignDigestToMessage(Message &msg,
                                           MimeMessageReader &reader)
{
  static RegularExpression digest_expr("^[a-z]+ +[0-9]\\.[0-9]+ +([0-9a-z]+)", 1, true, true);

  // Always obtained first, exactly as before, even if the header wins.
  string computed_digest = reader.getMD5Digest();

  string field_value, header_digest;
  bool use_header_digest = false;
  if (m_spamprobeFieldName.length() > 0) {
    if (reader.getField(m_spamprobeFieldName, field_value)) {
      if (digest_expr.match(field_value)) {
        use_header_digest = (digest_expr.getMatch(1, header_digest).length() == 32);
      }
    }
  }

  if (!use_header_digest) {
    msg.setDigest(computed_digest);
    return;
  }

  if (is_debug) {
    cerr << "using digest from header" << endl;
  }
  msg.setDigest(header_digest);
}

// Rebuilds msg from reader: clears it, copies the message-id (when present)
// into the message ID, adds the header tokens, then for every text part the
// reader yields adds the content type string followed by the text itself,
// and finally assigns the digest.
void MessageFactory::initMessage(Message &msg,
                                 MimeMessageReader &reader)
{
  msg.clear();

  string message_id;
  const bool has_message_id = reader.getField("message-id", message_id);
  if (has_message_id) {
    msg.setID(message_id);
  }

  addHeadersToMessage(msg, reader);

  string part_text, part_type;
  for (;;) {
    if (!reader.readText(part_text, part_type)) {
      break;
    }
    addStringToMessage(part_type, EMPTY_STRING, msg);
    addTextToMessage(part_text, msg);
  }

  assignDigestToMessage(msg, reader);

  if (is_debug) {
    cerr << "loaded message with digest " << msg.getDigest() << endl;
  }
}

// Returns the token prefix for header_name.  An entry in m_prefixedHeaders
// wins.  Otherwise the prefix is PREFIX_0, followed by the header name with
// alphanumerics lower-cased and every other character replaced by
// PREFIX_NON_ALNUM, followed by PREFIX_1.  Copying of name characters stops
// once the prefix has reached MAX_PREFIX_LEN characters.
string MessageFactory::getHeaderPrefix(const string &header_name)
{
  map<string,string>::const_iterator found = m_prefixedHeaders.find(header_name);
  if (found != m_prefixedHeaders.end()) {
    return found->second;
  }

  string prefix;
  prefix += PREFIX_0;
  const char *cursor = header_name.c_str();
  while (*cursor && prefix.size() < MAX_PREFIX_LEN) {
    if (!is_alnum(*cursor)) {
      prefix += PREFIX_NON_ALNUM;
    } else {
      prefix += to_lower(*cursor);
    }
    ++cursor;
  }
  prefix += PREFIX_1;
  return prefix;
}

// Adds term with prefix to msg.  Tokens with an empty prefix are dropped
// while m_ignoreBody is set; tokens with a non-empty prefix are always added.
void MessageFactory::addTokenToMessage(const string &term,
                                       const string &prefix,
                                       Message &msg)
{
  const bool has_prefix = prefix.length() > 0;
  if (!has_prefix && m_ignoreBody) {
    return;
  }
  msg.addToken(term, prefix);
}

void MessageFactory::addHeaderToMessage(const string &name,
                                        Message &msg,
                                        MimeMessageReader &reader,
                                        bool tag_extra_headers,
                                        bool try_decode)
{
  vector<string> text;
  if (reader.getField(name, text)) {
    string header_name(name);
    string prefix(getHeaderPrefix(header_name));
    for (vector<string>::const_iterator i = text.begin(); i != text.end(); ++i) {
      if (try_decode) {
        string text = *i;
        MimeMessageReader::decodeHeader(text);
        addStringToMessage(text, prefix, msg);
      } else {
        addStringToMessage(*i, prefix, msg);
      }

      if (tag_extra_headers) {
        header_name += EXTRA_HEADER_SUFFIX;
        tag_extra_headers = false; // only add suffix once
      }
    }
  }
}

// Splits word into runs of alphanumeric or high-bit characters and adds
// extra tokens for them.  For every run that is not made up solely of digits:
//   - if the run does not start at the beginning of word, the remainder of
//     word from the start of the run onwards is added;
//   - if the run is longer than one character and is followed by more
//     characters, the run on its own is added.
void MessageFactory::addWordPartsToMessage(const string &word,
                                           const string &prefix,
                                           Message &msg)
{
  const char *const word_begin = word.c_str();
  const char *cursor = word_begin;
  while (*cursor) {
    // Step over separator characters in front of the next run.
    for (; *cursor && !is_alnum(*cursor) && !(*cursor & 0x80); ++cursor) {
    }

    const char *const run_begin = cursor;
    bool digits_only = true;
    for (; *cursor && (is_alnum(*cursor) || (*cursor & 0x80)); ++cursor) {
      if (digits_only && !is_digit(*cursor)) {
        digits_only = false;
      }
    }
    const char *const run_end = cursor;

    // An empty run (end of word reached) also counts as digits only.
    if (digits_only) {
      continue;
    }

    if (run_begin != word_begin) {
      addTokenToMessage(run_begin, prefix, msg);
    }

    const bool has_trailing_text = (*run_end != 0);
    if (((run_end - run_begin) > 1) && has_trailing_text) {
      addTokenToMessage(string(run_begin, run_end), prefix, msg);
    }
  }
}

// Adds IP_ADDRESS_TOKEN (with prefix) to msg when term matches ip_expr.
// As written, the pattern is anchored only at the start and asks for three
// dot-terminated digit groups followed by at least one more digit.
void MessageFactory::addIPAddressTerm(const string &term,
                                      const string &prefix,
                                      Message &msg)
{
  static RegularExpression ip_expr("^[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]", 1, true, true);
  if (!ip_expr.match(term)) {
    return;
  }
  addTokenToMessage(IP_ADDRESS_TOKEN, prefix, msg);
}

// Feeds one lower-case word into the phrase builder and adds the resulting
// tokens to msg.
//
// A word is accepted when its length lies within
// [m_minWordLength, m_maxWordLength] and is_digits() rejects it.  For an
// accepted word every phrase from getMinWords() up to the builder's current
// word count is added, followed by the word-part tokens and the IP address
// token.  A rejected word clears the phrase builder instead.
void MessageFactory::addWordToMessage(const string &word,
                                      const string &prefix,
                                      Message &msg)
{
  assert(is_lower(word));

  if (is_debug) {
    cerr << "ADDING WORD '" << word << "' PREFIX '" << prefix << "'" << endl;
  }

  const bool accepted = (word.length() >= m_minWordLength)
    && (word.length() <= m_maxWordLength)
    && !is_digits(word);
  if (!accepted) {
    m_phraser->clear();
    return;
  }

  m_phraser->addWord(word);
  const int available_words = m_phraser->getWordCount();
  int phrase_len = m_phraser->getMinWords();
  while (phrase_len <= available_words) {
    if (is_debug) {
      cerr << "  PHRASE #" << phrase_len << ": " << m_phraser->getPhrase(phrase_len) << endl;
    }
    addTokenToMessage(m_phraser->getPhrase(phrase_len), prefix, msg);
    ++phrase_len;
  }
  addWordPartsToMessage(word, prefix, msg);
  addIPAddressTerm(word, prefix, msg);
}

// Tokenizes value and adds each word to msg with prefix.  The phrase builder
// is cleared first so phrases never span two calls.  When
// m_replaceNonAsciiChars is set, every byte with the high bit set is
// replaced by m_nonAsciiChar before it reaches the tokenizer.
void MessageFactory::addStringToMessage(const string &value,
                                        const string &prefix,
                                        Message &msg)
{
  m_phraser->clear();

  Tokenizer tokenizer;
  const char *cursor = value.c_str();
  while (*cursor) {
    char ch = *cursor;
    if (m_replaceNonAsciiChars && (ch & 0x80)) {
      ch = m_nonAsciiChar;
    }
    if (tokenizer.addChar(ch)) {
      addWordToMessage(tokenizer.getWord(), prefix, msg);
    }
    ++cursor;
  }

  // Flush the word still pending in the tokenizer, if any.
  if (tokenizer.stop()) {
    addWordToMessage(tokenizer.getWord(), prefix, msg);
  }
}

// Returns whether text matches html_expr, a pattern listing a handful of
// common HTML tags.  In debug mode the verdict and the matches are written
// to cerr.
bool MessageFactory::isHTML(const string &text)
{
  static RegularExpression html_expr("</?html>|<p>|<br>|</?tr>|</?td>|</?font |</?b>|<a ",
                                     1, true, true);
  const bool looks_like_html = html_expr.match(text);
  if (!is_debug) {
    return looks_like_html;
  }

  cerr << "** HTML? " << looks_like_html << endl;
  html_expr.dumpMatches(cerr);
  return looks_like_html;
}

void MessageFactory::addTextToMessage(const string &value,
                                      Message &msg)
{
  string text = value;
  if (isHTML(text)) {
    if (m_removeHTML) {
      removeHTMLFromText(text, msg);
    } else {
      expandEntitiesInHTML(text);
    }
  }
  addStringToMessage(text, EMPTY_STRING, msg);
}

// Appends the expansion of one HTML entity to text.
//
//   text   - string being built; the expansion is appended to it
//   entity - entity body without the leading '&' and trailing ';'; it must
//            already be lower case (asserted)
//
// The named entities amp, apos, quot, lt, gt and nbsp map to their
// characters.  Numeric references are parsed as decimal ("#NNN") or, when
// the '#' is followed by 'x', as hexadecimal ("#xHHH"); the parsed value is
// passed through safe_char().  A numeric reference that cannot be parsed
// yields safe_char(0).  Any other entity body is appended unchanged.
static void add_entity(string &text,
                       const string &entity)
{
  assert(is_lower(entity));

  if (entity == "amp") {
    text += '&';
  } else if (entity == "apos") {
    text += '\'';
  } else if (entity == "quot") {
    text += '"';
  } else if (entity == "lt") {
    text += '<';
  } else if (entity == "gt") {
    text += '>';
  } else if (entity == "nbsp") {
    text += ' ';
  } else if (entity[0] == '#') {
    const char *digits = entity.c_str() + 1;
    // Callers lower-case the entity, so only 'x' needs to be checked.
    const bool is_hex = (*digits == 'x');
    if (is_hex) {
      ++digits;
    }

    int code = 0;
    istrstream in(digits);
    if (is_hex) {
      in >> std::hex;
    }
    in >> code;
    text += safe_char(code);
  } else {
    text += entity;
  }
}

// Expands character entities in text in place and turns every whitespace
// character other than ' ' into a single space.
//
// An '&' opens an entity; the characters up to the next ';' are lower-cased
// and handed to add_entity().  If the input ends before a ';' is seen (a
// bare '&', e.g. between URL query parameters), the '&' and the characters
// after it are kept as literal text instead of being discarded.  text is
// only reassigned when something actually changed.
void MessageFactory::expandEntitiesInHTML(string &text)
{
  bool changed = false;
  bool in_amp = false;
  string amp;      // lower-cased entity body handed to add_entity()
  string raw_amp;  // same characters as written, used if no ';' ever arrives
  string new_text;
  for (const char *s = text.c_str(); *s; ++s) {
    if (in_amp) {
      if (*s == ';') {
        add_entity(new_text, amp);
        in_amp = false;
      } else {
        amp += to_lower(*s);
        // Keep the whitespace normalization applied outside of entities.
        if (is_space(*s) && (*s != ' ')) {
          raw_amp += ' ';
        } else {
          raw_amp += *s;
        }
      }
    } else if (*s == '&') {
      changed = true;
      in_amp = true;
      amp.erase();
      raw_amp.erase();
    } else if (is_space(*s) && (*s != ' ')) {
      changed = true;
      new_text += ' ';
    } else {
      new_text += *s;
    }
  }

  if (in_amp) {
    // Unterminated entity: emit it literally rather than losing the tail.
    new_text += '&';
    new_text += raw_amp;
  }

  assert(changed || (text == new_text));
  if (changed) {
    text = new_text;
  }
}

// Decodes %XX escapes in url in place.  A '%' that is not followed by two
// hexadecimal digits is copied through unchanged.
void MessageFactory::expandCharsInURL(string &url)
{
  string decoded;
  const char *cursor = url.c_str();
  while (*cursor) {
    const bool is_escape = (*cursor == '%') && is_xdigit(cursor[1]) && is_xdigit(cursor[2]);
    if (is_escape) {
      decoded += (char)(hex_to_int(cursor[1]) << 4 | hex_to_int(cursor[2]));
      cursor += 3; // the '%' and both digits
    } else {
      decoded += *cursor;
      ++cursor;
    }
  }
  url = decoded;
}

// Returns whether tag (the text between '<' and '>') matches space_expr,
// which lists the br, p, th, td and tr tags in opening and closing form.
// In debug mode a match is reported on cerr.
bool MessageFactory::isSpaceTag(const string &tag)
{
  static RegularExpression space_expr("(^/?(br|p|th|td|tr) )|(^/?(br|p|th|td|tr)/?$)", 1, true, true);
  if (!space_expr.match(tag)) {
    return false;
  }

  if (is_debug) {
    cerr << "FOUND SPACE TAG: " << tag << endl;
    space_expr.dumpMatches(cerr);
  }
  return true;
}

bool MessageFactory::isSuspiciousTag(const string &tag)
{
  static RegularExpression suspicious_expr("^font", 1, true, true);
  bool answer = suspicious_expr.match(tag);
  if (answer) {
    if (is_debug) {
      cerr << "FOUND SUSPICIOUS TAG: " << tag << endl;
    }
  }
  return answer;
}

// Extracts every href/src attribute value that url_expr finds in tag.  Each
// value has its %XX escapes decoded and is added to msg twice: once with
// URL_PREFIX and once with an empty prefix.  The matched attribute is then
// replaced by a single space in tag, which is what lets the loop advance.
// Returns true when at least one URL was found.
bool MessageFactory::processUrls(string &tag,
                                 Message &msg)
{
  static RegularExpression url_expr("[^a-z0-9_](href|src)[ \t\r\n]*=[ \t\r\n]*('[^>' \t\r\n]+|\"[^>\" \t\r\n]+|[^> \t\r\n]+)",
                                    2, true, false);
  if (is_debug) {
    cerr << "LOOK FOR URL: " << tag << endl;
  }

  int url_count = 0;
  while (url_expr.match(tag)) {
    ++url_count;

    string url;
    url_expr.getMatch(2, url);
    if (is_debug) {
      url_expr.dumpMatches(cerr);
    }

    expandCharsInURL(url);
    addStringToMessage(url, URL_PREFIX, msg);
    addStringToMessage(url, EMPTY_STRING, msg);

    url_expr.replaceMatch(0, tag, SINGLE_SPACE);
  }
  return url_count > 0;
}

// Handles the text of one HTML tag: expands its entities, harvests its URLs
// and, when m_keepSuspiciousTags is set, adds the remaining tag text with
// TAG_PREFIX if the tag is suspicious or contained a URL.  tag is taken by
// value because it is modified while being processed.
void MessageFactory::processTag(string tag,
                                Message &msg)
{
  expandEntitiesInHTML(tag);

  // The suspicious check must see the tag before processUrls() edits it.
  const bool suspicious_name = isSuspiciousTag(tag);
  const bool had_urls = processUrls(tag, msg);

  if (m_keepSuspiciousTags && (suspicious_name || had_urls)) {
    addStringToMessage(tag, TAG_PREFIX, msg);
  }
}

// Strips HTML markup from text in place.
//
// Everything between '<' and '>' is collected (whitespace folded to ' ') and
// passed to processTag(); tags accepted by isSpaceTag() leave a space in the
// output.  Outside of tags, '&' opens an entity whose lower-cased body is
// expanded by add_entity() when the closing ';' arrives, and whitespace
// other than ' ' becomes a single space.  A '<' abandons an entity that is
// still open, and an entity or tag that is still open at the end of the
// input produces no output.  text is only reassigned when something changed.
void MessageFactory::removeHTMLFromText(string &text,
                                        Message &msg)
{
  bool changed = false;
  bool inside_tag = false;
  bool inside_entity = false;
  string stripped;
  string entity, tag;

  for (const char *cursor = text.c_str(); *cursor; ++cursor) {
    if (inside_tag) {
      if (*cursor == '>') {
        inside_tag = false;
        processTag(tag, msg);
        if (isSpaceTag(tag)) {
          stripped += ' ';
        }
      } else if (is_space(*cursor)) {
        tag += ' ';
      } else {
        tag += *cursor;
      }
      continue;
    }

    if (*cursor == '<') {
      inside_tag = true;
      inside_entity = false;
      tag.erase();
      changed = true;
      continue;
    }

    if (*cursor == '&') {
      inside_entity = true;
      entity.erase();
      changed = true;
      continue;
    }

    // Checked before the entity state, so this also applies inside entities.
    if (is_space(*cursor) && (*cursor != ' ')) {
      changed = true;
      stripped += ' ';
      continue;
    }

    if (!inside_entity) {
      stripped += *cursor;
      continue;
    }

    if (*cursor == ';') {
      add_entity(stripped, entity);
      inside_entity = false;
    } else {
      entity += to_lower(*cursor);
    }
  }

  assert(changed || (text == stripped));
  if (changed) {
    text = stripped;
  }
}
