(* $Id: netconversion.mli,v 2.7 2003/06/03 18:58:23 stolpmann Exp $
 * ----------------------------------------------------------------------
 *)

exception Malformed_code
  (* Raised when an illegal byte sequence is found, i.e. when the input
   * is not valid in the assumed encoding (or charset).
   *)

exception Cannot_represent of int
  (* Raised when a certain Unicode code point cannot be represented in
   * the selected output encoding. The argument is the code point that
   * cannot be represented.
   *)


(* PREFACE
 *
 * - With the exception of UTF-8, UTF-16 and EUC-JP, only single-byte
 *   character sets are supported.
 * - I took the mappings from www.unicode.org, and the standard names of
 *   the character sets from IANA. Obviously, many character sets are missing
 *   that can be supported; especially ISO646 character sets, many EBCDIC 
 *   code pages. 
 * - Because of the copyright statement from Unicode, I cannot put the
 *   source tables that describe the mappings into the distribution. They
 *   are publicly available from www.unicode.org.
 * - Because of this, it is difficult for you to extend the list of character 
 *   sets; you need the source tables I am not allowed to distribute.
 *   These tables have a very simple format: Every line describes a pair
 *   of code points; the left code (<= 0xff) is the code in the character
 *   set, the right code (<= 0xffff) is the Unicode equivalent.
 *   For an example, see
 *   http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-2.TXT
 *   You can send me such files, and I will integrate them into the 
 *   distribution (if possible).
 * - I really do not know very much about the character sets used in
 *   East Asia. If you need them, please write the necessary conversion
 *   functions and send them to me.
 *)

(* About the encodings:
 * 
 * Supported range of code points: 0 to 0xd7ff, 0xe000 to 0xfffd,
 * 0x10000 to 0x10ffff. This is effectively Unicode.
 *
 * `Enc_utf8: Malformed UTF-8 byte sequences are always rejected. This
 * is also true for the sequence 0xc0 0x80 which is used by some software
 * (Java) as paraphrase for the code point 0. 
 *
 * `Enc_java: The same as `Enc_utf8 except that the code point 0 is
 * represented as 0xc0 0x80, and not as 0x00.
 *
 * `Enc_utf16: When reading from a string encoded as `Enc_utf16, a byte
 * order mark is expected at the beginning. The detected variant 
 * (`Enc_utf16_le or `Enc_utf16_be) is usually returned by the parsing
 * function. The byte order mark is not included in the output string. -
 * It is not possible to write as `Enc_utf16. - Many functions of this
 * module cannot cope with `Enc_utf16, and will fail.
 *
 * `Enc_utf16_le, `Enc_utf16_be: When reading from such a string, the
 * code point 0xfeff is returned as it is; it is a "zero-width 
 * non-breaking space" in this case, and not a byte order mark. The code 
 * point 0xfffe is rejected.
 *
 * Surrogate pairs: These are recognized (or written) only for
 * UTF-16-style encodings; and rejected for any other encoding.
 *
 * `Enc_subset(e,def): This means the same encoding as e, but only a 
 * subset of the code points can be represented. The function def is called,
 * and only if it returns "true" the code point is ok. If an `Enc_subset
 * is used as input encoding, Malformed_code will be raised for all code
 * points for which def returns "false". If an `Enc_subset is used as
 * output encoding, the "subst" function is called for all code points
 * for which def returns "false".
 *
 * In general, rejected byte sequences cause the exception Malformed_code.
 *
 * LINKING:
 *
 * `Enc_utf8, `Enc_java, `Enc_utf16, `Enc_utf16_le, `Enc_utf16_be, and
 * `Enc_iso88591 are hardwired in the code and thus always available.
 *
 * Whether the other encodings can be used depends on how the application
 * was linked. See the file INSTALL for details.
 *
 *
 * KNOWN PROBLEMS:
 * - The following charsets do not have a bijective mapping to Unicode:
 *   adobe_standard_encoding, adobe_symbol_encoding, 
 *   adobe_zapf_dingbats_encoding, cp1002 (0xFEBE). The current implementation
 *   simply removes one of the conflicting code point pairs - this might
 *   not be what you want.
 * - Japanese encodings: 
 *   * JIS X 0208: The character 1/32 is mapped to 0xFF3C, and not
 *     to 0x005C.
 *)

(* NAMING:
 *
 * - Labels called "range_pos" and "range_len" refer to byte positions of
 *   characters, or substrings
 * - Labels called "count" refer to positions given as the number of characters
 *   relative to an origin
 *
 * - A "uchar" is a single Unicode code point represented as int
 * - A "ustring" is a string of encoded characters
 * - A "uarray" is an array of int representing a string
 *)



(* The character encodings known to this module. See the section
 * "About the encodings" above for remarks on the individual variants.
 *)
type encoding =
  [  `Enc_utf8       (* UTF-8 *)
  |  `Enc_java       (* The variant of UTF-8 used by Java *)
  |  `Enc_utf16      (* UTF-16 with unspecified endianness (restricted usage) *)
  |  `Enc_utf16_le   (* UTF-16 little endian *)
  |  `Enc_utf16_be   (* UTF-16 big endian *)
  |  `Enc_usascii    (* US-ASCII (only 7 bit) *)
  |  `Enc_iso88591   (* ISO-8859-1 *)
  |  `Enc_iso88592   (* ISO-8859-2 *)
  |  `Enc_iso88593   (* ISO-8859-3 *)
  |  `Enc_iso88594   (* ISO-8859-4 *)
  |  `Enc_iso88595   (* ISO-8859-5 *)
  |  `Enc_iso88596   (* ISO-8859-6 *)
  |  `Enc_iso88597   (* ISO-8859-7 *)
  |  `Enc_iso88598   (* ISO-8859-8 *)
  |  `Enc_iso88599   (* ISO-8859-9 *)
  |  `Enc_iso885910  (* ISO-8859-10 *)
  |  `Enc_iso885911  (* ISO-8859-11 *)
  |  `Enc_iso885913  (* ISO-8859-13 *)
  |  `Enc_iso885914  (* ISO-8859-14 *)
  |  `Enc_iso885915  (* ISO-8859-15 *)
  |  `Enc_iso885916  (* ISO-8859-16 *)
  |  `Enc_koi8r      (* KOI8-R *)
  |  `Enc_jis0201    (* JIS-X-0201 (Roman in lower half; Katakana upper half) *)
  |  `Enc_eucjp      (* EUC-JP (includes US-ASCII, JIS-X-0201, -0208, -0212) *)
    (* Japanese, TODO (not yet part of the type): *)
(*|  `Enc_iso2022jp of jis_state = [ `Enc_usascii | `Enc_jis0201 |
                                     `Enc_jis0208_1978 | `Enc_jis0208_1983 ]
      It is very likely that ISO-2022 will be handled in a different module.
      This encoding is too weird.
  |  `Enc_sjis
*)
    (* Microsoft: *)
  |  `Enc_windows1250  (* WINDOWS-1250 *)
  |  `Enc_windows1251  (* WINDOWS-1251 *)
  |  `Enc_windows1252  (* WINDOWS-1252 *)
  |  `Enc_windows1253  (* WINDOWS-1253 *)
  |  `Enc_windows1254  (* WINDOWS-1254 *)
  |  `Enc_windows1255  (* WINDOWS-1255 *)
  |  `Enc_windows1256  (* WINDOWS-1256 *)
  |  `Enc_windows1257  (* WINDOWS-1257 *)
  |  `Enc_windows1258  (* WINDOWS-1258 *)
    (* IBM, ASCII-based: *)
  |  `Enc_cp437
  |  `Enc_cp737
  |  `Enc_cp775
  |  `Enc_cp850
  |  `Enc_cp852
  |  `Enc_cp855
  |  `Enc_cp856
  |  `Enc_cp857
  |  `Enc_cp860
  |  `Enc_cp861
  |  `Enc_cp862
  |  `Enc_cp863
  |  `Enc_cp864
  |  `Enc_cp865
  |  `Enc_cp866
  |  `Enc_cp869
  |  `Enc_cp874
  |  `Enc_cp1006
   (* IBM, EBCDIC-based: *)
  |  `Enc_cp037
  |  `Enc_cp424
  |  `Enc_cp500
  |  `Enc_cp875
  |  `Enc_cp1026
   (* Adobe: *)
  |  `Enc_adobe_standard_encoding
  |  `Enc_adobe_symbol_encoding
  |  `Enc_adobe_zapf_dingbats_encoding
   (* Apple: *)
  |  `Enc_macroman
   (* Encoding subset: `Enc_subset(e,def) is the encoding e restricted to
    * the code points for which the function def returns "true".
    *)
  |  `Enc_subset of (encoding * (int -> bool))
  |  `Enc_empty     (* does not encode any character *)

  ]


(* A "character set" is simply a set of code points. It does not say how
 * the code points are encoded as bytes. Every encoding implies a certain
 * charset (or several charsets) that can be encoded, but the reverse is
 * not true: a charset does not determine an encoding.
 *)

type charset =
  [  `Set_unicode    (* The full Unicode repertoire *)
  |  `Set_usascii    (* US-ASCII (only 7 bit) *)
  |  `Set_iso88591   (* ISO-8859-1 *)
  |  `Set_iso88592   (* ISO-8859-2 *)
  |  `Set_iso88593   (* ISO-8859-3 *)
  |  `Set_iso88594   (* ISO-8859-4 *)
  |  `Set_iso88595   (* ISO-8859-5 *)
  |  `Set_iso88596   (* ISO-8859-6 *)
  |  `Set_iso88597   (* ISO-8859-7 *)
  |  `Set_iso88598   (* ISO-8859-8 *)
  |  `Set_iso88599   (* ISO-8859-9 *)
  |  `Set_iso885910  (* ISO-8859-10 *)
  |  `Set_iso885911  (* ISO-8859-11 *)
  |  `Set_iso885913  (* ISO-8859-13 *)
  |  `Set_iso885914  (* ISO-8859-14 *)
  |  `Set_iso885915  (* ISO-8859-15 *)
  |  `Set_iso885916  (* ISO-8859-16 *)
  |  `Set_koi8r      (* KOI8-R *)
  |  `Set_jis0201    (* JIS-X-0201 *)
  |  `Set_jis0208    (* JIS-X-0208 *)
  |  `Set_jis0212    (* JIS-X-0212 *)
    (* Microsoft: *)
  |  `Set_windows1250  (* WINDOWS-1250 *)
  |  `Set_windows1251  (* WINDOWS-1251 *)
  |  `Set_windows1252  (* WINDOWS-1252 *)
  |  `Set_windows1253  (* WINDOWS-1253 *)
  |  `Set_windows1254  (* WINDOWS-1254 *)
  |  `Set_windows1255  (* WINDOWS-1255 *)
  |  `Set_windows1256  (* WINDOWS-1256 *)
  |  `Set_windows1257  (* WINDOWS-1257 *)
  |  `Set_windows1258  (* WINDOWS-1258 *)
    (* IBM, ASCII-based: *)
  |  `Set_cp437
  |  `Set_cp737
  |  `Set_cp775
  |  `Set_cp850
  |  `Set_cp852
  |  `Set_cp855
  |  `Set_cp856
  |  `Set_cp857
  |  `Set_cp860
  |  `Set_cp861
  |  `Set_cp862
  |  `Set_cp863
  |  `Set_cp864
  |  `Set_cp865
  |  `Set_cp866
  |  `Set_cp869
  |  `Set_cp874
  |  `Set_cp1006
   (* IBM, EBCDIC-based: *)
  |  `Set_cp037
  |  `Set_cp424
  |  `Set_cp500
  |  `Set_cp875
  |  `Set_cp1026
   (* Adobe: *)
  |  `Set_adobe_standard_encoding
  |  `Set_adobe_symbol_encoding
  |  `Set_adobe_zapf_dingbats_encoding
   (* Apple: *)
  |  `Set_macroman
  ]


(* Pre-evaluation of the encoding argument:
 * 
 * A number of the following functions can be made to run faster if they are
 * called several times for the same encoding. In this case, it is recommended
 * to apply the function once partially with the encoding argument, and to
 * call the resulting closure instead. For example, ustring_of_uchar supports
 * this technique:
 *
 *   let my_ustring_of_uchar = ustring_of_uchar my_enc in
 *   let s1 = my_ustring_of_uchar u1 ...
 *   let s2 = my_ustring_of_uchar u2 ...
 *
 * This is faster than
 *
 *   let s1 = ustring_of_uchar my_enc u1 ...
 *   let s2 = ustring_of_uchar my_enc u2 ...
 *
 * The availability of this optimization is indicated by the statement
 * "PRE_EVAL(encoding)".
 *)


val encoding_of_string : string -> encoding;;
    (* Returns the encoding that is denoted by the passed name. Fails if
     * the name does not denote a known encoding.
     * E.g. encoding_of_string "iso-8859-1" = `Enc_iso88591
     *)


val string_of_encoding : encoding -> string;;
    (* Returns the name of the encoding as string.
     * NOTE(review): presumably the result can be passed back to
     * encoding_of_string for the named encodings - confirm, and check
     * what is returned for `Enc_subset and `Enc_empty.
     *)


val is_ascii_compatible : encoding -> bool;;
    (* Returns whether the encoding is "ASCII compatible". This means:
     * The bytes 1 to 127 represent the ASCII codes 1 to 127, and no other
     * representation of a character contains the bytes 1 to 127.
     *
     * For example, ISO-8859-1 is ASCII-compatible because the bytes 1 to
     * 127 mean the same as in ASCII, and all other characters use bytes
     * greater than 127. UTF-8 is ASCII-compatible for the same reasons,
     * it does not matter that there are multi-byte characters.
     * EBCDIC is not ASCII-compatible because the bytes 1 to 127 do not mean
     * the same as in ASCII. UTF-16 is not ASCII-compatible because the bytes
     * 1 to 127 can occur in multi-byte representations of non-ASCII
     * characters.
     *
     * The byte 0 has been excluded from this definition because the C
     * language uses it with a special meaning that has nothing to do with
     * characters, so it is questionable to interpret the byte 0 anyway.
     *)


val is_single_byte : encoding -> bool
  (* Returns whether the encoding is a single-byte encoding, i.e. whether
   * every character is represented by exactly one byte.
   *)


val same_encoding : encoding -> encoding -> bool
  (* Returns whether both encodings are the same. `Enc_subset encodings
   * are only equal when the definition functions are physically the same.
   *
   * Warning: Do not use ( = ) to compare encodings because this may
   * fail: an `Enc_subset value contains a function, and the structural
   * comparison raises an exception when it encounters functional values.
   *)


val byte_order_mark : encoding -> string
  (* Returns the byte order mark (encoded as byte string) that must occur
   * at the beginning of files to indicate whether "little endian" or
   * "big endian" is used.
   * If this does not apply to the encoding, an empty string is returned.
   *)


val makechar : encoding -> int -> string
  (* DEPRECATED since 0.96 - use ustring_of_uchar instead! *)
  (* makechar enc i:
   * Creates the string representing the Unicode code point i in encoding enc.
   * Raises Not_found if the character is legal but cannot be represented 
   * in enc. (This is the difference to ustring_of_uchar, which raises
   * Cannot_represent in this case.)
   * 
   * Possible encodings: everything but `Enc_utf16.
   *
   * Further hints:
   * - PRE_EVAL(encoding)
   *)


val ustring_of_uchar : encoding -> int -> string
  (* ustring_of_uchar enc i:
   * Creates the string representing the Unicode code point i in encoding enc.
   * Raises Cannot_represent i if the character is legal but cannot be 
   * represented in enc.
   * 
   * Possible encodings: everything but `Enc_utf16 (it is not possible to
   * write this encoding; use `Enc_utf16_le or `Enc_utf16_be).
   *
   * Further hints:
   * - PRE_EVAL(encoding)
   *)


val to_unicode : charset -> int -> int
  (* to_unicode cs p:
   * Maps the code point p of the charset cs to the corresponding 
   * Unicode code point, or raises Malformed_code.
   *
   * Note `Set_jis0208 and `Set_jis0212: The numeric code point is computed
   * by multiplying the row number (1..94) with 96, and by adding the
   * column number (1..94), i.e.
   *    code_point = row * 96 + column
   *
   * Further hints:
   * - PRE_EVAL(charset)
   *)


val from_unicode : charset -> int -> int
  (* from_unicode cs u:
   * Maps the Unicode code point u to the corresponding code point of
   * the charset cs, or raises Cannot_represent.
   *
   * Note `Set_jis0208 and `Set_jis0212: The numeric code point is computed
   * by multiplying the row number (1..94) with 96, and by adding the
   * column number (1..94), i.e.
   *    code_point = row * 96 + column
   *
   * Further hints:
   * - PRE_EVAL(charset)
   *)


val available_input_encodings : unit -> encoding list
  (* Returns the list of all available encodings that can be used for
   * input strings, i.e. as the encoding to convert from. The list
   * reflects the set of loaded/linked Netmapping modules.
   *)


val available_output_encodings : unit -> encoding list
  (* Returns the list of all available encodings that can be used for
   * output strings, i.e. as the encoding to convert to. The list
   * reflects the set of loaded/linked Netmapping modules.
   *)



(**********************************************************************)
(* Conversion between character encodings                             *)
(**********************************************************************)


val convert : ?subst:(int -> string) ->
              in_enc:encoding -> 
              out_enc:encoding ->
              ?range_pos:int -> ?range_len:int ->
	      string ->
                string 
  (* Converts the string from in_enc to out_enc, and returns it.
   *
   * ~subst: This function is invoked for code points of in_enc that cannot
   * be represented in out_enc, and the result of the function invocation
   * is substituted.
   * Restriction: The string returned by subst must not be longer than 50
   * bytes.
   * If subst is missing, Cannot_represent is raised in this case.
   *
   * ~range_pos, ~range_len: Select a substring for conversion.
   * range_pos is the byte position of the beginning of the substring, and 
   * range_len its length in bytes. range_pos defaults to 0, and range_len 
   * defaults to the length of the rest of the string.
   *
   * The string to convert must consist of a whole number of characters.
   * It is illegal when the string ends with only the prefix of a
   * multi-byte character. The exception Malformed_code is raised in this
   * case.
   *)


val recode_string : in_enc:encoding -> 
                    out_enc:encoding ->
		    ?subst:(int -> string) ->
		    string ->
                    string 
  (* OBSOLETE since 0.96, use "convert" instead *)
  (* Recodes a complete string from in_enc to out_enc, and returns it.
   *
   * ~subst: This function is invoked for code points of in_enc that cannot
   * be represented in out_enc, and the result of the function invocation
   * is substituted.
   * Restriction: The string returned by subst must not be longer than 50
   * bytes.
   * If subst is missing, Not_found is raised in this case (and not
   * Cannot_represent as "convert" does).
   *)


val recode : in_enc:encoding -> 
             in_buf:string -> 
	     in_pos:int ->
	     in_len:int -> 
	     out_enc:encoding -> 
	     out_buf:string -> 
	     out_pos:int ->
	     out_len:int ->
	     max_chars:int ->
             subst:(int -> string) -> (int * int * encoding)
  (* 
   * let (in_n, out_n, in_enc') = 
   *     recode ~in_enc ~in_buf ~in_pos ~in_len ~out_enc ~out_buf ~out_pos
   *            ~out_len ~max_chars ~subst:
   * Converts the character sequence contained in the at most in_len bytes
   * of in_buf starting at position in_pos, and writes the result 
   * into at most out_len bytes of out_buf starting at out_pos.
   * At most max_chars characters are written into out_buf.
   *
   * The characters in in_buf are assumed to be encoded as in_enc, and the 
   * characters in out_buf will be encoded as out_enc.
   *
   * If there is a code point which cannot be represented in out_enc,
   * the function subst is called with the code point as argument, and the
   * resulting string (which must already be encoded as out_enc) is
   * inserted instead. 
   *
   * Note: It is possible that subst is called several times for the same
   * character.
   *
   * Restriction: The string returned by subst must not be longer than 50
   * bytes.
   *
   * Return values: 
   * - in_n is the actual number of bytes that have been converted from
   *   in_buf; in_n may be smaller than in_len because of incomplete
   *   multi-byte characters, or because the output buffer has less space
   *   for characters than the input buffer, or because of a change
   *   of the encoding variant.
   * - out_n is the actual number of bytes written into out_buf.
   *
   *   If there is at least one complete character in in_buf, and at least
   *   space for one complete character in out_buf, and max_chars >= 1, it is 
   *   guaranteed that in_n > 0 or out_n > 0.
   * - in_enc' is normally identical to in_enc. However, there are cases
   *   in which the encoding can be refined when looking at the byte
   *   sequence; for example whether a little endian or big endian variant
   *   of the encoding is used. in_enc' is the variant of in_enc that was
   *   used for the last character that has been converted.
   *)


class conversion_pipe : 
        ?subst:(int -> string) ->
        in_enc:encoding -> 
	out_enc:encoding -> 
	unit ->
	  Netchannels.io_obj_channel
  (* This pipeline class (see Netchannels for more information) can be used
   * to recode a netchannel while reading or writing. The data are
   * converted from in_enc to out_enc.
   *
   * ~subst: By default, Cannot_represent is raised for code points that
   * cannot be represented in out_enc (in contrast to the obsolete
   * recoding_pipe which raises Not_found).
   *
   * EXAMPLE: Convert ISO-8859-1 to UTF-8 while writing to the file
   * "output.txt":
   * 
   * let ch = new output_channel (open_out "output.txt") in
   * let encoder = 
   *   new conversion_pipe ~in_enc:`Enc_iso88591 ~out_enc:`Enc_utf8 () in
   * let ch' = new output_filter encoder ch in
   * ... (* write to ch' *)
   * ch' # close_out();
   * ch  # close_out();  (* you must close both channels! *)
   *)


class recoding_pipe : 
        ?subst:(int -> string) ->
        in_enc:encoding -> 
	out_enc:encoding -> 
	unit ->
	  Netchannels.io_obj_channel
  (* OBSOLETE since 0.96, use conversion_pipe instead *)
  (* The same kind of pipeline as conversion_pipe. The only difference:
   * the default for subst raises Not_found, and not Cannot_represent.
   *)

(**********************************************************************)
(* Cursors                                                            *)
(**********************************************************************)

type cursor
  (* A cursor denotes a character position in an encoded string. The type
   * is abstract; cursors are created by create_cursor.
   *)

exception End_of_string
  (* Raised when an attempt is made to access the character after the end
   * of the string.
   *)

exception Cursor_out_of_range
  (* Raised when an attempt is made to move the cursor beyond the beginning
   * of the string or beyond the end of the string. In the latter case, it
   * is legal to move the cursor to the position following the last
   * character, but it is not possible to move it further.
   *)


val create_cursor : ?range_pos:int -> ?range_len:int -> 
                    ?initial_rel_pos:int -> 
                    encoding -> string -> cursor
  (* Creates a new cursor for the passed string and the passed encoding.
   * By default, the cursor can move over the whole string, and the
   * cursor is initially positioned at the beginning of the string.
   *
   * ~range_pos and ~range_len: The cursor can be restricted to this
   * substring (both numbers count bytes). The position ~range_pos
   * is the logical beginning of the range that can be addressed by the
   * cursor, and ~range_pos+~range_len is the logical end of this
   * range. The default for ~range_pos is 0, and for ~range_len the
   * length until the end of the string.
   *
   * ~initial_rel_pos: The initial byte position of the cursor. This position
   * is given in bytes relative to ~range_pos. The character at this position
   * will be considered as the zeroth character of the string (as reported
   * by cursor_char_count).
   *
   * Restriction: It is not possible to create a cursor for the `Enc_utf16
   * encoding. `Enc_utf16_le and `Enc_utf16_be work, though.
   *)

val copy_cursor : ?enc:encoding -> cursor -> cursor
  (* Copies the cursor. The copy can be moved independently of the original
   * cursor, but it refers to the same string.
   *
   * ~enc: Optionally, the assumed encoding of the copy can be changed to
   * a different one.
   *)

val cursor_target : cursor -> string
  (* Returns the string the cursor refers to (the string that was passed
   * to create_cursor).
   *)

val cursor_range : cursor -> (int * int)
  (* Returns the valid range of the cursor as the pair
   * (~range_pos, ~range_len); both numbers count bytes.
   *)

val cursor_initial_rel_pos : cursor -> int
  (* Returns the initial relative byte position of the cursor, i.e. the
   * ~initial_rel_pos of create_cursor (in bytes relative to ~range_pos).
   *)

val cursor_char_count : cursor -> int
  (* Returns the character count of the cursor, i.e. its position counted
   * in characters. The initial position (when create_cursor was called)
   * has the number 0, positions to the right have positive numbers, and
   * positions to the left have negative numbers.
   *)

val cursor_pos : cursor -> int
  (* Returns the byte position of the cursor, i.e. the byte index of
   * the string that corresponds to the cursor position. The function
   * returns the absolute position (i.e. NOT relative to the beginning of
   * cursor_range).
   *)

val uchar_at : cursor -> int
  (* Returns the Unicode code point of the character at the cursor.
   * Raises End_of_string if the cursor is positioned at the end of the
   * string.
   *)

val cursor_byte_length : cursor -> int
  (* Returns the byte length of the representation of the character at the
   * cursor.
   * Raises End_of_string if the cursor is positioned at the end of the
   * string.
   *)

val cursor_at_end : cursor -> bool
  (* Returns whether the cursor is positioned at the end of the
   * string.
   *)

val move : ?num:int -> cursor -> unit
  (* Moves the cursor one character to the right, or if ~num is passed,
   * this number of characters to the right. ~num can be negative in
   * which case the cursor is moved to the left.
   *
   * If the move would place the cursor outside the valid range, the
   * exception Cursor_out_of_range is raised, and the cursor is moved to
   * the leftmost or rightmost position (depending on the direction).
   *)

val cursor_encoding : cursor -> encoding
  (* Returns the encoding that the cursor assumes for its string. *)

(**********************************************************************)
(* String functions                                                   *)
(**********************************************************************)

val ustring_length : 
        encoding -> ?range_pos:int -> ?range_len:int -> string -> int
  (* Returns the length of the string in characters (not in bytes).
   *
   * ~range_pos, ~range_len: Select the substring of the string to measure.
   *    These positions are byte positions.
   *
   * Further hints:
   * - PRE_EVAL(encoding)
   *)

val ustring_iter : 
       encoding ->
       (int -> unit) ->
       ?range_pos:int -> ?range_len:int ->
       string ->
	 unit
  (* Iterates over the characters of a string, and calls the passed function
   * for every character with its Unicode code point as argument.
   *
   * encoding: specifies the encoding of the string
   * ~range_pos, ~range_len: Restrict the range of the string to iterate.
   *   These positions are byte positions.
   *)

val ustring_map :
       encoding ->
       (int -> int list) ->
       ?range_pos:int -> ?range_len:int ->
       string ->
	 string
  (* Maps every character of a string to a list of characters, and returns
   * the concatenated string. 
   * The [encoding] argument determines the encoding of both the argument
   * and the result string.
   * The map function gets the character as its Unicode code point, and
   * must return the list of code points to map to.
   *
   * ~range_pos, ~range_len: Select the substring of the string to map.
   *    These positions are byte positions.
   *) 

val ustring_sub :
       encoding ->
       int ->
       int ->
       ?range_pos:int -> ?range_len:int ->
       string ->
	 string
  (* ustring_sub enc start length s: Returns the substring of s starting
   * at character count [start] and consisting of [length] characters. Note
   * that [start] and [length] are counted in (possibly multi-byte)
   * characters, not in bytes.
   *
   * ~range_pos, ~range_len: Restrict the range of the string to consider.
   *   These positions are byte positions.
   *)

val ustring_compare :
      encoding ->
      (int -> int -> int) ->
       ?range_pos:int -> ?range_len:int ->
      string ->
       ?range_pos:int -> ?range_len:int ->
      string ->
	int
  (* Compares two strings lexicographically. The first argument is the
   * encoding of both strings (which must be the same). The second argument
   * is the function that compares two Unicode code points. It must return
   * 0 if both characters are the same, a negative value if the first
   * character is the smaller one, and a positive value if the second
   * character is the smaller one.
   *
   * ~range_pos, ~range_len: Select a substring of the following string
   *   argument, i.e. the first pair of labels refers to the first string,
   *   and the second pair to the second string. These positions are byte
   *   positions.
   *)

val uarray_of_ustring : 
    encoding -> 
    ?range_pos:int -> ?range_len:int ->
    string -> 
      int array

  (* Decodes the string, and returns its characters as array of Unicode
   * code points.
   * 
   * ~range_pos, ~range_len: Select a substring of the following string
   *   argument. These positions are byte positions.
   *)


val ustring_of_uarray :
    ?subst:(int -> string) ->
    encoding ->
    ?pos:int -> ?len:int ->
    int array ->
      string

  (* Returns the array of Unicode code points as encoded string.
   * 
   * ~pos, ~len: Select a subarray of the array
   *
   * ~subst: This function is called when a code point cannot be represented
   *   in the chosen character encoding. It must return the (already encoded)
   *   string to substitute for this code point. By default (if ~subst is
   *   not passed), the exception Cannot_represent will be raised in this
   *   case.
   *)

exception Malformed_code_at of int
  (* Raised by verify when the checked string is not properly encoded.
   * The argument is the byte position where the problem occurs.
   *)

val verify : encoding -> ?range_pos:int -> ?range_len:int -> string -> unit
  (* Checks whether the string is properly encoded. If so, () is returned.
   * If not, the exception Malformed_code_at will be raised indicating 
   * the byte position where the problem occurs.
   *
   * NOTE(review): ~range_pos and ~range_len presumably select the
   * substring to check, given as byte positions like in the other string
   * functions - confirm against the implementation.
   *)


(* ======================================================================
 * History:
 * 
 * $Log: netconversion.mli,v $
 * Revision 2.7  2003/06/03 18:58:23  stolpmann
 * 	Fine-tuning
 *
 * Revision 2.6  2003/04/21 22:01:36  stolpmann
 * 	Change: subst functions raise Cannot_represent by default (only
 * in new functions)
 * 	Obsolete: recode_string, recoding_pipe, makechar
 * 	New: same_encoding, byte_order_mark, uchar_string, from_unicode,
 * available_input_encodings, available_output_encodings, convert,
 * conversion_pipe
 * 	New: cursor implementation
 * 	New: unicode string functions
 *
 * Revision 2.5  2002/07/06 16:23:24  stolpmann
 * 	Fix: It is now checked and documented that the substitution
 * strings must not be too long. The limit is multibyte_limit, currently
 * 50 bytes.
 *
 * Revision 2.4  2002/07/03 01:15:57  stolpmann
 * 	New: `Enc_subset
 * 	New: is_single_byte
 * 	New: to_unicode
 *
 * Revision 2.3  2002/06/23 20:58:59  stolpmann
 * 	Added: Support for ISO-8858-16 character encoding
 *
 * Revision 2.2  2002/06/23 19:47:03  stolpmann
 * 	New class: recoding_pipe.
 * 	Fix: A possible problem in recode_string has been solved.
 * 	Improved representation of character mappings.
 *
 * Revision 2.1  2001/09/14 14:22:34  stolpmann
 * 	Initial revision (sourceforge)
 *
 *
 * ======================================================================
 * Revision 1.1  2000/08/13 00:02:57  gerd
 * 	Initial revision.
 *
 *
 * ======================================================================
 * OLD LOGS FROM THE PXP PACKAGE (FILE NAME pxp_encoding.mli):
 *
 * Revision 1.4  2000/07/04 22:05:58  gerd
 * 	Enhanced version of 'recode'. Labeled arguments.
 * New function 'recode_string'.
 *
 * Revision 1.3  2000/05/29 23:48:38  gerd
 * 	Changed module names:
 * 		Markup_aux          into Pxp_aux
 * 		Markup_codewriter   into Pxp_codewriter
 * 		Markup_document     into Pxp_document
 * 		Markup_dtd          into Pxp_dtd
 * 		Markup_entity       into Pxp_entity
 * 		Markup_lexer_types  into Pxp_lexer_types
 * 		Markup_reader       into Pxp_reader
 * 		Markup_types        into Pxp_types
 * 		Markup_yacc         into Pxp_yacc
 * See directory "compatibility" for (almost) compatible wrappers emulating
 * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
 *
 * Revision 1.2  2000/05/29 21:14:57  gerd
 * 	Changed the type 'encoding' into a polymorphic variant.
 *
 * Revision 1.1  2000/05/20 20:30:50  gerd
 * 	Initial revision.
 *
 * 
 *)
