// --------------------------------------------------------------------
// PDF parsing
// --------------------------------------------------------------------
/*

    This file is part of the extensible drawing editor Ipe.
    Copyright (C) 1993-2007  Otfried Cheong

    Ipe is free software; you can redistribute it and/or modify it
    under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    As a special exception, you have permission to link Ipe with the
    CGAL library and distribute executables, as long as you follow the
    requirements of the Gnu General Public License in regard to all of
    the software in the executable aside from CGAL.

    Ipe is distributed in the hope that it will be useful, but WITHOUT
    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
    License for more details.

    You should have received a copy of the GNU General Public License
    along with Ipe; if not, you can find it at
    "http://www.gnu.org/copyleft/gpl.html", or write to the Free
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

*/

#include "ipepdfparser.h"
#include "ipeutils.h"

//------------------------------------------------------------------------

// Character classification table, indexed by character code (0..255).
// A '1' in this array means the character is white space
// (codes 0x00, 0x09, 0x0a, 0x0c, 0x0d and 0x20).
// A '1' or '2' means the character ends a name or command.
// '2' == () {} [] <> / %
// Each row covers 16 consecutive codes; the trailing comment on a row
// gives the high hex digit of the codes in that row.
static char specialChars[256] = {
  1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,   // 0x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 1x
  1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2,   // 2x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,   // 3x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 4x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 5x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 6x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 7x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 8x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 9x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ax
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // bx
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // cx
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // dx
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ex
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    // fx
};

// --------------------------------------------------------------------

//! Convert the leading decimal integer in \a s to an int.
/*! Uses std::strtol with base 10 and no end-pointer; the long result
  is narrowed to int. */
inline int ToInt(IpeString &s)
{
  const char *text = s.CString();
  const long parsed = std::strtol(text, 0, 10);
  return int(parsed);
}

//! Convert the leading floating-point number in \a s to a double.
/*! Uses std::strtod with no end-pointer. */
inline double ToDouble(IpeString &s)
{
  const char *text = s.CString();
  const double parsed = std::strtod(text, 0);
  return parsed;
}

// --------------------------------------------------------------------

/*! \class IpePdfObj
 * \ingroup base
 * \brief Abstract base class for PDF objects.
 */

//! Pure virtual destructor.
/*! The base class has nothing to release, so the body is empty. */
IpePdfObj::~IpePdfObj()
{
}

//! Return this object as PDF null object.
/*! The base-class implementation returns 0. */
const IpePdfNull *IpePdfObj::Null() const
{
  return 0;
}

//! Return this object as PDF bool object.
/*! The base-class implementation returns 0. */
const IpePdfBool *IpePdfObj::Bool() const
{
  return 0;
}

//! Return this object as PDF number object.
/*! The base-class implementation returns 0. */
const IpePdfNumber *IpePdfObj::Number() const
{
  return 0;
}

//! Return this object as PDF string object.
/*! The base-class implementation returns 0. */
const IpePdfString *IpePdfObj::String() const
{
  return 0;
}

//! Return this object as PDF name object.
/*! The base-class implementation returns 0. */
const IpePdfName *IpePdfObj::Name() const
{
  return 0;
}

//! Return this object as PDF reference object.
/*! The base-class implementation returns 0. */
const IpePdfRef *IpePdfObj::Ref() const
{
  return 0;
}

//! Return this object as PDF array object.
/*! The base-class implementation returns 0. */
const IpePdfArray *IpePdfObj::Array() const
{
  return 0;
}

//! Return this object as PDF dictionary object.
/*! The base-class implementation returns 0. */
const IpePdfDict *IpePdfObj::Dict() const
{
  return 0;
}

//! Return PDF representation of the object.
/*! Calls Write() on an IpeStringStream constructed on a local string
  and returns that string. */
IpeString IpePdfObj::Repr() const
{
  IpeString text;
  IpeStringStream sink(text);
  this->Write(sink);
  return text;
}

/*! \class IpePdfNull
 * \ingroup base
 * \brief The PDF null object.
 */
//! Return this object (it is a PDF null object).
const IpePdfNull *IpePdfNull::Null() const
{
  return this;
}

//! Write the keyword "null" to \a stream.
void IpePdfNull::Write(IpeStream &stream) const
{
  IpeStream &out = stream;
  out << "null";
}

/*! \class IpePdfBool
 * \ingroup base
 * \brief The PDF bool object.
 */
//! Return this object (it is a PDF bool object).
const IpePdfBool *IpePdfBool::Bool() const
{
  return this;
}

//! Write "true" or "false" to \a stream, depending on the stored value.
void IpePdfBool::Write(IpeStream &stream) const
{
  if (iValue)
    stream << "true";
  else
    stream << "false";
}

/*! \class IpePdfNumber
 * \ingroup base
 * \brief The PDF number object.
 */
//! Return this object (it is a PDF number object).
const IpePdfNumber *IpePdfNumber::Number() const
{
  return this;
}

//! Write the stored value to \a stream using the stream's own formatting.
void IpePdfNumber::Write(IpeStream &stream) const
{
  IpeStream &out = stream;
  out << iValue;
}

/*! \class IpePdfString
 * \ingroup base
 * \brief The PDF string object.
 */
//! Return this object (it is a PDF string object).
const IpePdfString *IpePdfString::String() const
{
  return this;
}

//! Write the stored value to \a stream, enclosed in parentheses.
/*! The value is written as is; no characters are escaped here. */
void IpePdfString::Write(IpeStream &stream) const
{
  stream << "(";
  stream << iValue;
  stream << ")";
}

/*! \class IpePdfName
 * \ingroup base
 * \brief The PDF name object.
 */
//! Return this object (it is a PDF name object).
const IpePdfName *IpePdfName::Name() const
{
  return this;
}

//! Write the name to \a stream, preceded by a slash.
void IpePdfName::Write(IpeStream &stream) const
{
  stream << "/";
  stream << iValue;
}

/*! \class IpePdfRef
 * \ingroup base
 * \brief The PDF reference object (indirect object).
 */
//! Return this object (it is a PDF reference object).
const IpePdfRef *IpePdfRef::Ref() const
{
  return this;
}

//! Write the reference to \a stream as "<value> 0 R".
/*! The generation number written is always 0. */
void IpePdfRef::Write(IpeStream &stream) const
{
  stream << iValue;
  stream << " 0 R";
}

/*! \class IpePdfArray
 * \ingroup base
 * \brief The PDF array object.
 */
//! Return this object (it is a PDF array object).
const IpePdfArray *IpePdfArray::Array() const
{
  return this;
}

//! Write the array to \a stream as "[e0 e1 ...]".
/*! Elements are separated by a single space.  References are written
  as references (they are not looked up). */
void IpePdfArray::Write(IpeStream &stream) const
{
  stream << "[";
  for (int k = 0; k < Count(); ++k) {
    // separator goes between elements only
    if (k > 0)
      stream << " ";
    const IpePdfObj *element = Obj(k, 0);
    element->Write(stream);
  }
  stream << "]";
}

//! Destroy the array together with all the objects stored in it.
IpePdfArray::~IpePdfArray()
{
  typedef std::vector<const IpePdfObj *>::size_type Index;
  for (Index k = 0; k < iObjects.size(); ++k) {
    delete iObjects[k];
    iObjects[k] = 0;
  }
}

//! Append an object at the end of the array.
/*! Array takes ownership of the object (it is deleted by the array's
  destructor). */
void IpePdfArray::Append(const IpePdfObj *obj)
{
  iObjects.insert(iObjects.end(), obj);
}

//! Return object with \a index in array.
/*! If \a file is not zero and the element is a reference, the
  referenced object is looked up in \a file and returned (0 if it
  does not exist).  Otherwise the element itself is returned; it
  remains owned by the array.  \a index is not range-checked.
*/
const IpePdfObj *IpePdfArray::Obj(int index, const IpePdfFile *file) const
{
  const IpePdfObj *element = iObjects[index];
  // only consider the element a reference when a file was supplied
  const IpePdfRef *ref = file ? element->Ref() : 0;
  if (!ref)
    return element;
  return file->Object(ref->Value());
}

/*! \class IpePdfDict
 * \ingroup base
 * \brief The PDF dictionary and stream objects.

 A dictionary may or may not have attached stream data.
 */

//! Return this object (it is a PDF dictionary object).
const IpePdfDict *IpePdfDict::Dict() const
{
  return this;
}

//! Write the dictionary (and its stream data, if any) to \a stream.
/*! The output has the form "<</Key1 val1 /Key2 val2>>".  If stream
  data of non-zero size is attached, it follows between the keywords
  "stream" and "endstream".
*/
void IpePdfDict::Write(IpeStream &stream) const
{
  // Emit the opening delimiter unconditionally, so that a dictionary
  // without entries is written as "<<>>" and not as a lone ">>".
  stream << "<<";
  for (std::vector<Item>::const_iterator it = iItems.begin();
       it != iItems.end(); ++it) {
    if (it != iItems.begin())
      stream << " ";
    stream << "/" << it->iKey << " ";
    it->iVal->Write(stream);
  }
  stream << ">>";
  if (iStream.size() > 0) {
    stream << "\nstream\n";
    for (int i = 0; i < iStream.size(); ++i)
      stream.PutChar(iStream[i]);
    stream << "\nendstream";
  }
}

//! Destroy the dictionary together with all the values stored in it.
IpePdfDict::~IpePdfDict()
{
  typedef std::vector<Item>::size_type Index;
  for (Index k = 0; k < iItems.size(); ++k) {
    delete iItems[k].iVal;
    iItems[k].iVal = 0;
  }
}

//! Add stream data to this dictionary.
void IpePdfDict::SetStream(const IpeBuffer &stream)
{
  iStream = stream;
}

//! Append a (key, value) pair to the dictionary.
/*! Dictionary takes ownership of \a obj.  No check is made whether
  \a key is already present. */
void IpePdfDict::Add(IpeString key, const IpePdfObj *obj)
{
  iItems.push_back(Item());
  Item &entry = iItems.back();
  entry.iVal = obj;
  entry.iKey = key;
}

//! Look up key in dictionary.
/*! The first entry whose key equals \a key is used.  If \a file is
  not zero and the value is a reference, the referenced object is
  looked up in \a file and returned; otherwise the stored value is
  returned.  Returns 0 if key is not in dictionary.
*/
const IpePdfObj *IpePdfDict::Get(IpeString key, const IpePdfFile *file) const
{
  typedef std::vector<Item>::const_iterator ItemIter;

  // linear search for the first matching key
  ItemIter pos = iItems.begin();
  while (pos != iItems.end() && !(pos->iKey == key))
    ++pos;
  if (pos == iItems.end())
    return 0; // not in dictionary

  const IpePdfObj *value = pos->iVal;
  if (file && value->Ref())
    return file->Object(value->Ref()->Value());
  return value;
}

//! Is this stream compressed with flate compression?
bool IpePdfDict::Deflated() const
{
  const IpePdfObj *f = Get("Filter", 0);
  return !(!f || !f->Name() || f->Name()->Value() != "FlateDecode");
}

// NOTE: the following function is disabled (compiled out by '#if 0').
#if 0
//! Return the (uncompressed) stream data.
/*! This only handles the /Flate compression.
  The stream data is returned unchanged if it is empty, or if the
  dictionary has no direct /Filter entry with name FlateDecode. */
IpeBuffer IpePdfDict::Inflate() const
{
  // nothing to decompress
  if (iStream.size() == 0)
    return iStream;
  // same filter test as in Deflated()
  const IpePdfObj *f = Get("Filter", 0);
  if (!f || !f->Name() || f->Name()->Value() != "FlateDecode")
    return iStream;

  IpeString dest;

  // chain: stream buffer -> inflating source
  IpeBufferSource bsource(iStream);
  IpeInflateSource source(bsource);

  // collect decompressed characters one at a time until EOF
  int ch = source.GetChar();
  while (ch != EOF) {
    dest += char(ch);
    ch = source.GetChar();
  }
  return IpeBuffer(dest.data(), dest.size());
}
#endif

// --------------------------------------------------------------------

/*! \class IpePdfParser
 * \ingroup base
 * \brief PDF parser

 The parser understands the syntax of PDF files, but very little of
 its semantics.  It is meant to be able to parse PDF documents created
 by Ipe for loading, and to extract information from PDF files created
 by Pdflatex.

 The parser reads a PDF file sequentially from front to back, ignores
 the contents of 'xref' sections, stores only generation 0 objects,
 and stops after reading the first 'trailer' section (so it cannot
 deal with files with incremental updates).  It cannot handle stream
 objects whose /Length entry has been deferred (using an indirect
 object).

*/

//! Construct with a data source.
/*! Reads the first character and the first token from \a source, so
  that the current token is available right after construction. */
IpePdfParser::IpePdfParser(IpeDataSource &source)
  : iSource(source)
{
  iPos = 0;
  // prime the one-character lookahead (iCh) ...
  GetChar();
  // ... and then the one-token lookahead (iTok)
  GetToken();
}

//! Skip white space and comments.
/*! A comment starts with '%' and extends up to the next '\\n' or
  '\\r'.  On return, the stream is either at its end or the current
  character is neither white space nor the start of a comment. */
void IpePdfParser::SkipWhiteSpace()
{
  for (;;) {
    if (Eos())
      return;
    if (iCh == '%') {
      // comment: advance until end of line (or end of stream)
      do {
        GetChar();
      } while (!Eos() && iCh != '\n' && iCh != '\r');
    } else if (specialChars[iCh] != 1) {
      // neither white space nor comment
      return;
    }
    // skip the white space character (or the end of the comment line)
    GetChar();
  }
}

//! Read the next token from the input stream.
/*! The result is stored in \c iTok.  The token type is
  IpePdfToken::EErr if the end of the stream is reached before a
  complete token has been read, or if a single '>' is found.

  For literal strings "(...)", the token string is the contents
  without the enclosing parentheses; nested parentheses are kept.  A
  backslash followed by one to three octal digits is replaced by the
  character with that code; any other character following a backslash
  is stored literally (so named escapes such as "\n" are not
  translated).  Hex strings "<...>" are stored undecoded.  For all
  other tokens the token string is the token text, including the
  leading '/' of names.
*/
void IpePdfParser::GetToken()
{
  iTok.iString.erase();
  iTok.iType = IpePdfToken::EErr;
  SkipWhiteSpace();
  if (Eos())
    return; // Err

  // parse literal string
  if (iCh == '(') {
    int nest = 0;
    GetChar();
    while (iCh != ')' || nest > 0) {
      if (Eos())
        return; // Err
      if (iCh == '\\') {
        GetChar();
        // Only 0..7 are octal digits.  Accepting 8 and 9 here would
        // make std::strtol stop early and silently drop those digits.
        if ('0' <= iCh && iCh <= '7') {
          // octal char code (one to three digits)
          char buf[4];
          int i = 0;
          buf[i++] = char(iCh);
          GetChar();
          while (i < 3 && '0' <= iCh && iCh <= '7') {
            buf[i++] = char(iCh);
            GetChar();
          }
          buf[i] = '\0';
          iTok.iString.append(char(std::strtol(buf, 0, 8)));
        } else {
          // escaped character is taken literally
          iTok.iString.append(char(iCh));
          GetChar();
        }
      } else {
        if (iCh == '(')
          ++nest;
        else if (iCh == ')')
          --nest;
        iTok.iString.append(char(iCh));
        GetChar();
      }
    }
    GetChar(); // skip closing ')'
    iTok.iType = IpePdfToken::EString;
    return;
  }

  if (iCh == '<') {
    GetChar();
    // recognize dictionary separator "<<"
    if (iCh == '<') {
      GetChar();
      iTok.iType = IpePdfToken::EDictBg;
      return;
    }
    // otherwise it's a binary string <hex>
    while (iCh != '>') {
      if (Eos())
        return; // Err
      iTok.iString.append(char(iCh));
      GetChar();
    }
    // We don't bother to decode it
    GetChar(); // skip '>'
    iTok.iType = IpePdfToken::EString;
    ipeDebug("Found binary string <%s>", iTok.iString.CString());
    return;
  }

  // first character of the token decides its type below
  int ch = iCh;

  iTok.iString.append(char(iCh));
  GetChar();

  // recognize array separators
  if (ch == '[') {
    iTok.iType = IpePdfToken::EArrayBg;
    return;
  } else if (ch == ']') {
    iTok.iType = IpePdfToken::EArrayEnd;
    return;
  }

  // recognize dictionary separator ">>"
  if (ch == '>') {
    if (iCh != '>')
      return; // Err
    GetChar();
    iTok.iType = IpePdfToken::EDictEnd;
    return;
  }

  // collect all characters up to white-space or separator
  for (;;) {
    // Test for end of stream BEFORE using iCh as a table index: at the
    // end of the stream iCh is presumably a sentinel outside 0..255
    // (TODO confirm against Eos()), which must not index specialChars.
    if (Eos())
      return; // Err
    if (specialChars[iCh])
      break;
    iTok.iString.append(char(iCh));
    GetChar();
  }

  if (('0' <= ch && ch <= '9') || ch == '+' || ch == '-' || ch == '.')
    iTok.iType = IpePdfToken::ENumber;
  else if (ch == '/')
    iTok.iType = IpePdfToken::EName;
  else if (iTok.iString == "null")
    iTok.iType = IpePdfToken::ENull;
  else if (iTok.iString == "true")
    iTok.iType = IpePdfToken::ETrue;
  else if (iTok.iString == "false")
    iTok.iType = IpePdfToken::EFalse;
  else
    iTok.iType = IpePdfToken::EOp;
}

// --------------------------------------------------------------------

//! Parse elements of an array.
/*! The current token must be the first token after the opening '['.
  Elements are read until the closing ']', which is consumed.  A
  sequence "n g R" becomes a reference to object \c n (the generation
  number \c g is ignored); other numbers become number objects.

  Returns the new array (owned by the caller), or 0 if an element
  cannot be parsed.
*/
IpePdfArray *IpePdfParser::MakeArray()
{
  IpeAutoPtr<IpePdfArray> arr(new IpePdfArray);
  for (;;) {
    if (iTok.iType == IpePdfToken::EArrayEnd) {
      // finish array
      GetToken();
      return arr.Take();
    }
    if (iTok.iType == IpePdfToken::ENumber) {
      // A number may be a plain number or the start of "n g R".
      // t1 is the number whose meaning is not yet decided.
      IpePdfToken t1 = iTok;
      GetToken();
      bool pending = true;
      while (pending && iTok.iType == IpePdfToken::ENumber) {
        IpePdfToken t2 = iTok;
        GetToken();
        if (iTok.iType == IpePdfToken::EOp && iTok.iString == "R") {
          // t1 t2 R: reference object
          arr->Append(new IpePdfRef(ToInt(t1.iString)));
          GetToken();
          pending = false;
        } else {
          // Only t1 is known to be a plain number.  t2 may still start
          // a reference (as in "[1 2 0 R]"), so it becomes the new
          // undecided number instead of being appended right away.
          arr->Append(new IpePdfNumber(ToDouble(t1.iString)));
          t1 = t2;
        }
      }
      if (pending)
        arr->Append(new IpePdfNumber(ToDouble(t1.iString)));
    } else {
      IpePdfObj *obj = GetObject();
      if (!obj)
        return 0;
      arr->Append(obj);
    }
  }
}

//! Parse entries of a dictionary, and the stream following it (if any).
/*! The current token must be the first token after the opening "<<".
  Entries are read until the closing ">>".  If the keyword "stream"
  follows, the stream data is read as well; its size is taken from a
  direct numeric /Length entry of the dictionary.

  Returns the new dictionary (owned by the caller), or 0 on error: a
  key that is not a name, a value that cannot be parsed, a missing,
  non-numeric or negative /Length, stream data cut short by the end
  of the input, or a missing "endstream".
*/
IpePdfDict *IpePdfParser::MakeDict()
{
  IpeAutoPtr<IpePdfDict> dict(new IpePdfDict);
  for (;;) {
    if (iTok.iType == IpePdfToken::EDictEnd) {
      // finish
      GetToken();

      // check whether stream follows
      if (iTok.iType != IpePdfToken::EOp || iTok.iString != "stream")
        return dict.Take();

      // time to read the stream
      while (!Eos() && iCh != '\n')
        GetChar();
      GetChar(); // skip '\n'
      // now at beginning of stream
      const IpePdfObj *len = dict->Get("Length", 0);
      if (!len || !len->Number())
        return 0;
      int bytes = int(len->Number()->Value());
      // A negative length would make the copy loop below run past the
      // end of the buffer.
      if (bytes < 0)
        return 0;
      IpeBuffer buf(bytes);
      char *p = buf.data();
      while (bytes--) {
        // input ended before /Length bytes were available
        if (Eos())
          return 0;
        *p++ = char(iCh);
        GetChar();
      }
      dict->SetStream(buf);
      GetToken();
      if (iTok.iType != IpePdfToken::EOp || iTok.iString != "endstream")
        return 0;
      GetToken();
      return dict.Take();
    }

    // must read name
    if (iTok.iType != IpePdfToken::EName)
      return 0;
    IpeString name = iTok.iString.substr(1); // drop leading '/'
    GetToken();

    // check for reference object
    if (iTok.iType == IpePdfToken::ENumber) {
      IpePdfToken t1 = iTok;
      GetToken();
      if (iTok.iType == IpePdfToken::ENumber) {
        IpePdfToken t2 = iTok;
        GetToken();
        if (iTok.iType == IpePdfToken::EOp && iTok.iString == "R") {
          // generation number (t2) is ignored
          dict->Add(name, new IpePdfRef(ToInt(t1.iString)));
          GetToken();
        } else
          return 0; // should be name or '>>'
      } else
        dict->Add(name, new IpePdfNumber(ToDouble(t1.iString)));
    } else {
      IpePdfObj *obj = GetObject();
      if (!obj)
        return 0;
      dict->Add(name, obj);
    }
  }
}

//! Read one object from input stream.
/*! The current token is taken as the start of the object and is
  consumed.  Returns a newly allocated object, or 0 if the token
  cannot start an object (or if parsing an array or dictionary
  fails).  References ("n g R") are not recognized here.
*/
IpePdfObj *IpePdfParser::GetObject()
{
  // remember the starting token, then advance past it
  IpePdfToken first = iTok;
  GetToken();

  // composite objects
  if (first.iType == IpePdfToken::EArrayBg)
    return MakeArray();
  if (first.iType == IpePdfToken::EDictBg)
    return MakeDict();

  // simple objects
  if (first.iType == IpePdfToken::ENumber)
    return new IpePdfNumber(std::strtod(first.iString.CString(), 0));
  if (first.iType == IpePdfToken::EString)
    return new IpePdfString(first.iString);
  if (first.iType == IpePdfToken::EName)
    return new IpePdfName(first.iString.substr(1));
  if (first.iType == IpePdfToken::ENull)
    return new IpePdfNull;
  if (first.iType == IpePdfToken::ETrue)
    return new IpePdfBool(true);
  if (first.iType == IpePdfToken::EFalse)
    return new IpePdfBool(false);

  // anything else (including EErr) is an error
  return 0;
}

//! Parse an object definition (current token is object number).
/*! Expects "<num> 0 obj <object> endobj"; only generation 0 is
  accepted.  Returns the parsed object (owned by the caller), or 0 if
  the input does not have this form.
*/
IpePdfObj *IpePdfParser::GetObjectDef()
{
  GetToken();
  if (iTok.iType != IpePdfToken::ENumber || iTok.iString != "0")
    return 0;
  GetToken();
  if (iTok.iType != IpePdfToken::EOp || iTok.iString != "obj")
    return 0;
  GetToken();
  IpePdfObj *obj = GetObject();
  if (!obj)
    return 0;
  if (iTok.iType != IpePdfToken::EOp || iTok.iString != "endobj") {
    // the object was parsed but is not handed to the caller: free it
    delete obj;
    return 0;
  }
  GetToken();
  return obj;
}

//! Skip xref table (current token is 'xref')
void IpePdfParser::SkipXRef()
{
  GetToken(); // first object number
  GetToken(); // number of objects
  int k = ToInt(iTok.iString);
  GetToken();
  while (k--) {
    GetToken(); // obj num
    GetToken(); // gen num
    GetToken(); // n or f
  }
}

//! Parse trailer dictionary (current token is 'trailer')
/*! Returns the dictionary (owned by the caller), or 0 if 'trailer'
  is not followed by a dictionary that can be parsed. */
IpePdfDict *IpePdfParser::GetTrailer()
{
  GetToken();
  const bool startsDict = (iTok.iType == IpePdfToken::EDictBg);
  if (startsDict) {
    GetToken(); // step past "<<"
    return MakeDict();
  }
  return 0;
}

// --------------------------------------------------------------------

/*! \class IpePdfFile
 * \ingroup base
 * \brief All information obtained by parsing a PDF file.
 */

//! Create empty container (no trailer, no objects).
IpePdfFile::IpePdfFile()
  : iTrailer(0)
{
}

//! Destroy the trailer and all the objects from the file.
IpePdfFile::~IpePdfFile()
{
  delete iTrailer;
  typedef std::map<int, const IpePdfObj *>::const_iterator ObjIter;
  ObjIter pos = iObjects.begin();
  while (pos != iObjects.end()) {
    delete pos->second;
    ++pos;
  }
}

//! Parse entire PDF stream, and store objects.
/*! Reads object definitions and xref sections from \a source until
  the first 'trailer' section has been read.  If an object number is
  defined more than once, the last definition is kept.

  Returns true if a trailer dictionary was read; false if the input
  cannot be parsed.  Objects stored before a failure remain in the
  container.
*/
bool IpePdfFile::Parse(IpeDataSource &source)
{
  IpePdfParser parser(source);

  for (;;) {
    IpePdfToken t = parser.Token();

    if (t.iType == IpePdfToken::ENumber) {
      // <num> 0 obj starts an object
      int num = ToInt(t.iString);
      IpePdfObj *obj = parser.GetObjectDef();
      if (!obj)
        return false;
      // operator[] creates a null entry for a new number; for a
      // repeated number the earlier object is freed instead of leaked.
      const IpePdfObj *&slot = iObjects[num];
      delete slot;
      slot = obj;
    } else if (t.iType == IpePdfToken::EOp) {
      if (t.iString == "trailer") {
        // free a trailer left over from an earlier call to Parse
        delete iTrailer;
        iTrailer = parser.GetTrailer();
        if (!iTrailer)
          return false;
        return true;
      } else if (t.iString == "xref") {
        parser.SkipXRef();
      } else
        // don't know what's happening
        return false;
    } else
      // don't know what's happening
      return false;
  }
}

//! Return object with number \a num.
/*! Returns 0 if no object with this number has been stored.  The
  object remains owned by the container. */
const IpePdfObj *IpePdfFile::Object(int num) const
{
  typedef std::map<int, const IpePdfObj *>::const_iterator ObjIter;
  const ObjIter pos = iObjects.find(num);
  return (pos == iObjects.end()) ? 0 : pos->second;
}

//! Return root catalog of PDF file.
/*! This is the dictionary that the /Root entry of the trailer refers
  to.  A trailer must have been parsed; the existence of the entry and
  its being a dictionary are checked by assertions only. */
const IpePdfDict *IpePdfFile::Catalog() const
{
  const IpePdfObj *entry = iTrailer->Get("Root", this);
  assert(entry);
  const IpePdfDict *catalog = entry->Dict();
  assert(catalog);
  return catalog;
}

//! Return first page of the document.
/*! Follows /Pages in the catalog and returns the first element of
  its /Kids array.  Returns 0 if /Kids is not an array, if the array
  is empty, or if its first element does not resolve to a dictionary.
  The existence of /Pages and /Kids is checked by assertions only.
*/
const IpePdfDict *IpePdfFile::Page() const
{
  const IpePdfObj *pages = Catalog()->Get("Pages", this);
  assert(pages && pages->Dict());
  const IpePdfObj *kids = pages->Dict()->Get("Kids", this);
  assert(kids);
  if (!kids->Array())
    return 0;
  // Obj() does not range-check its index, so an empty /Kids array
  // has to be rejected here.
  if (kids->Array()->Count() < 1)
    return 0;
  const IpePdfObj *page = kids->Array()->Obj(0, this);
  // this should be page 1
  if (!page || !page->Dict())
    return 0;
  return page->Dict();
}

// --------------------------------------------------------------------
