(* $Id: neturl.mli,v 1.3 2000/06/26 22:57:49 gerd Exp $
 * ----------------------------------------------------------------------
 *
 *)

(* This module already uses O'Caml-3 features. *)

(* Uniform Resource Locators (URLs):
 *
 * This module provides functions to parse URLs, to print URLs, to
 * store URLs, to modify URLs, and to apply relative URLs.
 *
 * URLs are strings formed according to pattern (1) or (2):
 *
 * (1) scheme://user:password@host:port/path;params?query#fragment
 * (2) scheme:other;params?query#fragment
 *
 * The word at the beginning of the URL identifies the URL scheme
 * (such as "http" or "file"). Depending on the scheme, not all of the
 * parts are allowed, or parts may be omitted. This module defines the
 * type 'url_syntax' whose values describe which parts are allowed/required/
 * not allowed for a concrete URL scheme (see below).
 *
 * Not all characters are allowed in a URL. Some characters are allowed,
 * but have the special task to separate the various parts of the URL
 * (reserved characters).
 * However, it is possible to include even invalid or reserved characters
 * as normal content by applying the '%'-encoding on these characters:
 * A '%' indicates that an encoded character follows, and the character
 * is denoted by a two-digit hexadecimal number (e.g. %2f for '/').
 * In the following descriptions, the term "encoded string" means a string
 * containing such %-encoded characters, and the "decoded string" means a
 * string not containing such characters.
 * See the module Netencoding.Url for functions encoding or decoding
 * strings.
 *
 * The type 'url' describes values storing the components of a URL,
 * and the 'url_syntax' for the URL. In general, the components are
 * stored as encoded strings; however, not for all components the
 * '%'-encoding is applicable.
 * For convenience, the functions creating, modifying, and accessing
 * URLs can handle both encoded and decoded strings. In order to
 * avoid errors, the functions pass strings even in their decoded form.
 *
 * Note that there is currently no function to compare URLs. The
 * canonical comparison ( = ) is not applicable because the same URL
 * may be written differently.
 *
 * Note that nothing is said about the character set/encoding of URLs.
 * Some protocols and standards prefer UTF-8 as fundamental encoding
 * and apply the '%'-encoding on top of it; i.e. the byte sequence
 * representing a character in UTF-8 is '%'-encoded. There is no special
 * support for this technique.
 *
 * For more information about URLs, see RFCs 1738 and 1808.
 *)

exception Malformed_URL
(* Raised by a number of functions of this module when they encounter a
 * badly formed URL.
 *)

val extract_url_scheme : string -> string
  (* Returns the URL scheme from the string representation of a URL.
   * E.g. extract_url_scheme "http://host/path" = "http".
   * The scheme name is always converted to lowercase characters, so
   * extract_url_scheme "HTTP://host/path" = "http", too.
   * Raises Malformed_URL if the scheme name is not found.
   *)

type url_syntax_option =
    Url_part_not_recognized
  | Url_part_allowed
  | Url_part_required
(* Describes the status of a single URL part within a 'url_syntax':
 *
 * Url_part_not_recognized: the part is not allowed
 * Url_part_allowed:        the part is recognized, and it is optional
 * Url_part_required:       the part is recognized, and it must be present
 *)


type url_syntax =
    { (* One switch per URL part; see pattern (1) and (2) at the beginning
       * of this file for the meaning of the parts:
       *)
      url_enable_scheme    : url_syntax_option;
      url_enable_user      : url_syntax_option;
      url_enable_password  : url_syntax_option;
      url_enable_host      : url_syntax_option;
      url_enable_port      : url_syntax_option;
      url_enable_path      : url_syntax_option;
      url_enable_param     : url_syntax_option;
      url_enable_query     : url_syntax_option;
      url_enable_fragment  : url_syntax_option;
      url_enable_other     : url_syntax_option;
      (* Whether the bytes with code 128 to 255 are legal: *)
      url_accepts_8bits    : bool;
      (* An additional, arbitrary validity criterion: *)
      url_is_valid         : url -> bool;
    }

and url
;;

(* Values of type 'url_syntax' describe which components of a URL are
 * recognized, which are allowed (and optional), and which are required.
 * Not all combinations are valid; the predicate expressed by the
 * function 'url_syntax_is_valid' must hold.
 * The function 'url_is_valid' is applied when a fresh URL is created
 * and must return 'true'. This function makes it possible to add an
 * arbitrary validity criterion to 'url_syntax'. (Note that the URL passed
 * to this function is not fully working; however, you can safely assume
 * that the accessor functions url_scheme etc. can be applied to it.)
 *
 * Switch 'url_accepts_8bits': If 'true', the bytes with code 128 to
 * 255 are treated like alphanumeric characters; if 'false' these bytes
 * are illegal (but it is still possible to include such bytes in their
 * encoded form: %80 to %FF).
 *
 * Values of type 'url' describe concrete URLs. The type is abstract.
 * Every URL must have a fundamental 'url_syntax', and it is only possible
 * to create URLs conforming to the syntax. See 'make_url' for further
 * information.
 *)


val url_syntax_is_valid : url_syntax -> bool
  (* Checks whether the passed url_syntax is valid, i.e. whether the
   * combination of its switches makes sense. This means:
   *
   * - If passwords are recognized, users (and hosts) must be recognized, too
   * - If ports are recognized, hosts must be recognized, too
   * - If users are recognized, hosts must be recognized, too
   * - Either the syntax recognizes one of the parts
   *   { user, password, host, port, path }, or the syntax recognizes
   *   the part 'other'.
   *
   * Returns 'true' iff the syntax is valid.
   *)


val partial_url_syntax : url_syntax -> url_syntax
  (* Transforms the syntax into another syntax where all required parts are
   * changed into optional parts. The resulting, weaker syntax is suitable
   * for partial/relative URLs, which may lack parts that a complete URL
   * of the same scheme must have.
   *)


(* Note that all following url_syntaxes do not allow 8bit bytes. *)

val null_url_syntax   : url_syntax
  (* The syntax in which nothing is recognized. It is the syntax of
   * 'null_url'.
   *)

val ip_url_syntax : url_syntax
  (* Maximum syntax for IP based protocols.
   * NOTE(review): "maximum" presumably means that every part that makes
   * sense for a host-based URL is allowed -- confirm against the
   * implementation.
   *)

val common_url_syntax : (string, url_syntax) Hashtbl.t
  (* Syntax descriptions for common URL schemes:
   *
   * null_url_syntax: nothing is recognized
   *
   * common_url_syntax: Hashtable mapping from URL scheme names to
   * definitions of syntaxes:
   *
   * "file":   scheme, host?, path
   * "ftp":    scheme, user?, password?, host, port?, path?, param?
   * "http":   scheme, user?, password?, host, port?, path?, query?
   * "mailto": scheme, other
   *
   * In this notation, a trailing '?' marks a part that is allowed but
   * optional, and a part without '?' is required. Parts that are not
   * listed are not recognized.
   *
   * Notes:
   * (1) These syntax descriptions can be weakened for partial/relative URLs
   *     by changing the required parts to optional parts: See the function
   *     'partial_url_syntax'.
   * (2) None of the descriptions allows fragments. These can be enabled by
   *     setting 'url_enable_fragment' to Url_part_allowed. E.g.
   *     let file_url_syntax = Hashtbl.find common_url_syntax "file" in
   *     { file_url_syntax with url_enable_fragment = Url_part_allowed }
   *)

val null_url : url
  (* A URL without any component; its syntax is 'null_url_syntax'.
   *)

val make_url :
      ?encoded:bool ->
      ?scheme:string ->
      ?user:string ->
      ?password:string ->
      ?host:string ->
      ?port:int ->
      ?path:string list ->
      ?param:string list ->
      ?query:string ->
      ?fragment:string ->
      ?other:string ->
      url_syntax ->
      url
  (* Creates a URL from components:
   *
   * - The components "scheme" and "host" are simple strings to which the
   *   '%'-encoding is not applicable.
   * - The component "port" is a simple number. Of course, the '%'-encoding
   *   is not applicable either.
   * - The components "user", "password", "query", "fragment", and "other"
   *   are strings which may contain '%'-encoded characters. By default,
   *   you can pass any string for these components, and problematic
   *   characters are automatically encoded. If you set ~encoded:true, the
   *   passed strings must already be encoded, but the function checks
   *   whether the encoding is correct.
   *   Note that for "query" even the characters '?' and '=' are encoded
   *   by default, so you need to set ~encoded:true to pass a reasonable
   *   query string.
   * - The components "path" and "param" are lists of strings which may
   *   contain '%'-encoded characters. Again, the default is to pass
   *   decoded strings to the function, and the function encodes them
   *   automatically, and by setting ~encoded:true the caller is responsible
   *   for encoding the strings.
   *   ~path:[] and ~param:[] mean that no path and no parameters are
   *   specified, respectively.
   *   See below for the representation of these components.
   *
   * Except for "path", the strings representing the components do not
   * contain the characters separating the components from each other.
   * The "path" component includes the '/' at the beginning of the path
   * (if present).
   *
   * The created URL must conform to the 'url_syntax', i.e.
   * - The URL must only contain components which are recognized by the
   *   syntax
   * - The URL must contain components which are required by the syntax
   * - The URL must fulfill the predicate expressed by the 'url_is_valid'
   *   function of the syntax.
   * NOTE(review): presumably Malformed_URL is raised if the URL does not
   * conform, or if an illegal path is passed -- confirm against the
   * implementation.
   *
   * The path of a URL is represented as a list of '/'-separated path
   * components, i.e.
   *   [ s1; s2; ...; sN ]  represents the path
   *                        s1 ^ "/" ^ s2 ^ "/" ^ ... ^ "/" ^ sN
   * As special cases:
   *   []                   is the non-existing path
   *   [ "" ]               is "/"
   *   [ "";"" ]            is illegal
   *
   * Except for s1 and sN, the path components must not be empty strings.
   * (s1 = "" denotes an absolute path, and sN = "" denotes a path ending
   * with a slash; compare with the examples for 'split_path'.)
   *
   * To avoid ambiguities, it is illegal to create URLs with both relative
   * paths (s1 <> "") and host components.
   *
   * Parameters of URLs are components beginning with ';'. The list
   * of parameters is represented as list of strings where the strings
   * contain the value following ';'.
   *)

val modify_url :
      ?syntax:url_syntax ->
      ?encoded:bool ->
      ?scheme:string ->
      ?user:string ->
      ?password:string ->
      ?host:string ->
      ?port:int ->
      ?path:string list ->
      ?param:string list ->
      ?query:string ->
      ?fragment:string ->
      ?other:string ->
      url ->
      url
  (* Modifies the passed components and returns the modified URL.
   * The modified URL shares unmodified components with the original
   * URL.
   *
   * The component arguments and ~encoded have the same names and types
   * as the arguments of 'make_url'. The argument ~syntax changes the
   * 'url_syntax' of the URL; e.g.
   *   modify_url ~syntax:(url_syntax_of_url v) u
   * is used in the examples at the end of this file to get a version
   * of u that has the syntax of v.
   *)

val remove_from_url :
      ?scheme:bool ->
      ?user:bool ->
      ?password:bool ->
      ?host:bool ->
      ?port:bool ->
      ?path:bool ->
      ?param:bool ->
      ?query:bool ->
      ?fragment:bool ->
      ?other:bool ->
      url ->
      url
  (* Removes those components from the URL for which 'true' is passed,
   * and returns the modified URL.
   * The modified URL shares unmodified components with the original
   * URL.
   *)

val default_url :
      ?encoded:bool -> 
      ?scheme:string ->
      ?user:string ->
      ?password:string ->
      ?host:string ->
      ?port:int ->
      ?path:string list ->
      ?param:string list ->
      ?query:string ->
      ?fragment:string ->
      ?other:string ->
      url ->
      url
  (* Adds missing components and returns the modified URL, i.e. the
   * passed values serve as defaults for the components the URL does
   * not have.
   * The modified URL shares unmodified components with the original
   * URL.
   *)

val undefault_url :
      ?scheme:string ->
      ?user:string ->
      ?password:string ->
      ?host:string ->
      ?port:int ->
      ?path:string list ->
      ?param:string list ->
      ?query:string ->
      ?fragment:string ->
      ?other:string ->
      url ->
      url
  (* Removes components from the URL if they have the passed value, and
   * returns the modified URL.
   * Note: The values must always be passed in _encoded_ form! (Unlike
   * 'default_url', this function has no ~encoded argument.)
   * The modified URL shares unmodified components with the original
   * URL.
   *)

val url_syntax_of_url : url -> url_syntax
  (* Returns the 'url_syntax' record of a URL, i.e. the syntax the URL
   * conforms to.
   *)

val url_of_string : url_syntax -> string -> url
  (* Parses the passed string according to the passed url_syntax, and
   * returns the URL.
   * E.g. url_of_string (Hashtbl.find common_url_syntax "http")
   *                    "http://g:pw@host/a/%62/"
   * NOTE(review): presumably Malformed_URL is raised if the string cannot
   * be parsed according to the syntax -- confirm against the
   * implementation.
   *)

val string_of_url : url -> string
  (* Returns the URL as string. The components are printed in their
   * encoded form, e.g. a fragment "near-the-#-character" is printed as
   * "#near-the-%23-character".
   *)

val url_provides :
      ?scheme:bool ->
      ?user:bool ->
      ?password:bool ->
      ?host:bool ->
      ?port:bool ->
      ?path:bool ->
      ?param:bool ->
      ?query:bool ->
      ?fragment:bool ->
      ?other:bool ->
      url ->
      bool
  (* Returns 'true' iff the URL has all of the components for which
   * 'true' is passed.
   * E.g. url_provides ~scheme:true ~host:true u checks whether u has
   * both a scheme and a host.
   *)

val url_scheme    :                  url -> string
val url_user      : ?encoded:bool -> url -> string
val url_password  : ?encoded:bool -> url -> string
val url_host      :                  url -> string
val url_port      :                  url -> int
val url_path      : ?encoded:bool -> url -> string list
val url_param     : ?encoded:bool -> url -> string list
val url_query     : ?encoded:bool -> url -> string
val url_fragment  : ?encoded:bool -> url -> string
val url_other     : ?encoded:bool -> url -> string
  (* Return components of the URL. The functions return decoded strings
   * unless ~encoded:true is set.
   * The accessors url_scheme, url_host, and url_port do not have the
   * ~encoded argument because the '%'-encoding is not applicable to
   * these components.
   * If the component does not exist, the exception Not_found
   * is raised. (Use 'url_provides' to test whether components exist.)
   *)

val split_path : string -> string list
  (* Splits a '/'-separated path into components (e.g. to set up the
   * ~path argument of make_url). A leading or a trailing '/' is
   * represented by an empty string at the beginning or at the end of
   * the list, respectively.
   * E.g. split_path "a/b/c" = [ "a"; "b"; "c" ],
   *      split_path "/a/b"  = [ ""; "a"; "b" ],
   *      split_path "a/b/"  = [ "a"; "b"; "" ]
   *)

val join_path : string list -> string
  (* Concatenates the path components to a '/'-separated path (inverse
   * function of split_path).
   * E.g. join_path [ ""; "a"; "b" ] = "/a/b"
   *)

val norm_path : string list -> string list
  (* Removes "." and ".." from the path if possible. Deletes double slashes.
   * Both the argument and the result are paths represented as lists of
   * path components (see 'split_path').
   *
   * EXAMPLES:
   *
   * norm_path ["."] = []
   *           means: "." = ""
   * norm_path ["."; ""] = []
   *           means: "./" = ""
   * norm_path ["a"; "."] = ["a"; ""]
   *           means: "a/." = "a/"
   * norm_path ["a"; "b"; "."] = ["a"; "b"; ""]
   *           means: "a/b/." = "a/b/"
   * norm_path ["a"; "."; "b"; "."] = ["a"; "b"; ""]
   *           means: "a/./b/." = "a/b/"
   * norm_path [".."] = [".."; ""]
   *           means: ".." = "../"
   * norm_path [".."; ""] = [".."; ""]
   *           means: "../" = "../"
   * norm_path ["a"; "b"; ".."; "c" ] = ["a"; "c"]
   *           means: "a/b/../c" = "a/c"
   * norm_path ["a"; "b"; ".."; "c"; ""] = ["a"; "c"; ""]
   *           means: "a/b/../c/" = "a/c/"
   * norm_path ["";"";"a";"";"b"] = [""; "a"; "b"]
   *           means: "//a//b" = "/a/b"
   * norm_path ["a"; "b"; ""; ".."; "c"; ""] = ["a"; "c"; ""]
   *           means: "a/b//../c/" = "a/c/"
   * norm_path ["a"; ".."] = []
   *           means: "a/.." = ""
   *)


val apply_relative_url : url -> url -> url
  (* apply_relative_url base rel:
   * Interprets 'rel' relative to 'base' and returns the new URL. This
   * function implements RFC 1808.
   * E.g. if base is "http://g:pw@host/a/b/" and rel is "../c", the
   * result is "http://g:pw@host/c".
   * NOTE(review): the examples at the end of this file first change the
   * syntax of the base URL such that it permits the fragment of 'rel';
   * presumably the result has the syntax of 'base' -- confirm against
   * the implementation.
   *)

val print_url : url -> unit
  (* Printer for the toploop.
   * NOTE(review): presumably intended to be installed with the toploop
   * directive #install_printer -- confirm.
   *)

(* ---------------------------------------------------------------------- *)

(* EXAMPLES:
 *
 * let http = Hashtbl.find common_url_syntax "http";;
 * let u = url_of_string http "http://g:pw@host/a/%62/";;
 * string_of_url u;;
 *   --> "http://g:pw@host/a/%62/"
 * url_scheme u;;
 *   --> "http"
 * url_user u;;
 *   --> "g"
 * url_password u;;
 *   --> "pw"
 * url_host u;;
 *   --> "host"
 * url_path u;;
 *   --> [ ""; "a"; "b"; "" ]          (* sic! *)
 * url_path ~encoded:true u;;
 *   --> [ ""; "a"; "%62"; "" ]
 * let v = make_url 
 *   ~path:[ ".."; "c" ]
 *   ~fragment:"near-the-#-character"
 *   { (partial_url_syntax http) with url_enable_fragment = Url_part_allowed };;
 * string_of_url v;;
 *   --> "../c#near-the-%23-character"
 * let u' = modify_url ~syntax:(url_syntax_of_url v) u;;
 *    (* u does not permit fragments *)
 * let w = apply_relative_url u' v;;
 * string_of_url w;;
 *   --> "http://g:pw@host/c#near-the-%23-character"
 *)

(* ======================================================================
 * History:
 * 
 * $Log: neturl.mli,v $
 * Revision 1.3  2000/06/26 22:57:49  gerd
 * 	Change: The record 'url_syntax' has an additional component
 * 'url_accepts_8bits'. Setting this option to 'true' causes that
 * the bytes >= 0x80 are no longer rejected.
 *
 * Revision 1.2  2000/06/25 22:55:47  gerd
 * 	Doc update.
 *
 * Revision 1.1  2000/06/24 20:19:59  gerd
 * 	Initial revision.
 *
 * 
 *)
