Version française
Home     About     Download     Resources     Contact us    
Browse thread
Correct way of programming a CGI script
[ Home ] [ Index: by date | by threads ]
[ Search: ]

[ Message by date: previous | next ] [ Message in thread: previous | next ] [ Thread: previous | next ]
Date: -- (:)
From: Julien Moutinho <julien.moutinho@g...>
Subject: Re: Warning on home-made functions dealing with UTF-8.
Here, I have reused some old code of mine to secure and extend J.Skaller's:
  unicode_of_utf8 ~ parse_utf8
  utf8_of_unicode ~ utf8_of_int
May it help, and may it not be too bugged.

exception Bad_utf8 of string * (string * int * int * int)
  (* raised with an error description and its location:
   *   bytes
   *   start (0 < start <= String.length bytes)
   *   size (0 < size <= String.length bytes)
   *   position (0 <= position <= size) *)
exception Insufficient of int
  (* raised when more bytes are needed.
   * The absolute value of the integer is the minimal amount of bytes needed.
   * A positive sign means that they have to be appended.
   * A negative sign means that they have to be prepended. *)

let in_bounds
  ~(size: int)
  ~(pos: int) =
    if size <> 0 then begin
        if pos < 0 then begin
            let i = size - ((- pos) mod size) in
            if i = size then 0 else i
        end else (pos mod size)
    end else 0

let position__char_size__offset
  (bytes: string)
  ?(start = 0)
  ?(size  = String.length bytes)
  ~(pos: int) : int * int * int =
    if size <= 0 then (0, 0, 0)
    else begin
        let pos        = in_bounds ~size ~pos      in
        let char_pos   = start + pos               in
        let char_start = ref char_pos              in
        let on_tail    = ref true                  in
        let loc        = (bytes, start, size, pos) in
        
        (* go backward to find a head *)
        while !on_tail do
            if char_pos - !char_start > 3
            then raise (Bad_utf8 ("cannot find a head nearby", loc))
            else if !char_start < start
            then raise (Insufficient (-1))
            else begin
                let cod = Char.code bytes.[!char_start] in
                if (cod land 0b1100_0000) = 0b1000_0000 (* on a trailing byte *)
                then decr char_start
                else on_tail := false
            end
        done;
        let char_start = !char_start in
        
        (* decode the head *)
        let head = Char.code bytes.[char_start] in
        let overlong boo =
            (* check for overlong forms (when a character uses more trailing bytes than needed),
             * see http://en.wikipedia.org/wiki/UTF-8#Overlong_forms.2C_invalid_input.2C_and_security_considerations *)
            if boo then raise (Bad_utf8 ("overlong form", loc))
        in
        let may_be_overlong = ref false in
        let char_size = (* get the size of the character *)
            (* 0zzzzzzz -> 0zzzzzzz                                              = 7  bits *)
            if      (head land 0b1_0000000) = 0b0_0000000 then      1
            (* 110YYYYy 10zzzzzz -> 00000yyy yyzzzzzz                            = 11 bits *)
            else if (head land 0b111_00000) = 0b110_00000
            then (overlong ((head land 0b000_11110) = 0);           2)
            (* 1110XXXX 10Yyyyyy 10zzzzzz -> xxxxyyyy yyzzzzzz                   = 16 bits *)
            else if (head land 0b1111_0000) = 0b1110_0000
            then (may_be_overlong := ((head land 0b0000_1111) = 0); 3)
            (* 11110WWW 10XXxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz = 21 bits *)
            else if (head land 0b1111_1000) = 0b1111_0000
            then (may_be_overlong := ((head land 0b00000_111) = 0); 4)
            (* 4 bytes is the maximun size of an UTF-8 character by now *)
            else raise (Bad_utf8 ("invalid head", loc))
        in
        
        (* decode the tail *)
        let off      = ref (char_start + 1)   in
        let t_end    = start + size           in
        let char_end = char_start + char_size in
        let max_off  = min char_end t_end     in
        (* check whether the trailing bytes of a character
         * are of the form 0b10_xxxxxx *)
        while !off < max_off do
            let cod = (Char.code bytes.[!off]) in
            if (cod land 0b11_000000) <> 0b10_000000
            then raise (Bad_utf8 ("invalid tail", loc));
            incr off
        done;
        (* complete the overlong check *)
        if max_off >= char_start + 1 (* if there is a second byte *)
        && !may_be_overlong
        then overlong
          (  (char_size = 3
          && ((Char.code bytes.[char_start + 1]) land 0b00_100000) = 0)
          || (char_size = 4
          && ((Char.code bytes.[char_start + 1]) land 0b00_110000) = 0) );
        (* check the tail length *)
        if char_end > t_end
        then raise (Insufficient (char_end - (char_pos + 1)));
        
        (pos, char_size, char_pos - char_start)
    end

let unicode_of_utf8
  (bytes: string)
  ?(start = 0)
  ?(size  = String.length bytes)
  (pos: int) : int * int =
    let pos, char_size, offset =
      position__char_size__offset bytes ~start ~size ~pos in
    let char_start = pos - offset in
    let unicode =
        match char_size with
        | 1 -> (* 0zzzzzzz -> 0zzzzzzz *)
            Char.code bytes.[char_start]
        | 2 -> (* 110yyyyy 10zzzzzz -> 00000yyy yyzzzzzz *)
            let cod0 = Char.code bytes.[char_start]     in
            let cod1 = Char.code bytes.[char_start + 1]
            in  ((cod0 land 0b000_11111) lsl 6)
            lor  (cod1 land 0b00_111111)
        | 3 -> (* 1110xxxx 10yyyyyy 10zzzzzz -> xxxxyyyy yyzzzzzz *)
            let cod0 = Char.code bytes.[char_start]     in
            let cod1 = Char.code bytes.[char_start + 1] in
            let cod2 = Char.code bytes.[char_start + 2]
            in  ((cod0 land 0b0000_1111) lsl 12)
            lor ((cod1 land 0b00_111111) lsl 6)
            lor  (cod2 land 0b00_111111)
        | 4 -> (* 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz *)
            let cod0 = Char.code bytes.[char_start]     in
            let cod1 = Char.code bytes.[char_start + 1] in
            let cod2 = Char.code bytes.[char_start + 2] in
            let cod3 = Char.code bytes.[char_start + 3]
            in  ((cod0 land 0b00000_111) lsl 18)
            lor ((cod1 land 0b00_111111) lsl 12)
            lor ((cod2 land 0b00_111111) lsl 6)
            lor  (cod3 land 0b00_111111)
        | _ -> assert false
    in
    match unicode with
    | cod when cod >= 0xD800 && cod <= 0xDFFF ->
        (* The definition of UTF-8 prohibits encoding character numbers between
         * U+D800 and U+DFFF, which are reserved for use with the UTF-16
         * encoding form (as surrogate pairs) and do not directly represent characters. *)
        raise (Bad_utf8 ("prohibited code point", (bytes, start, size, pos)))
    | cod when cod > 0x10FFFF ->
        raise (Bad_utf8 ("invalid code point", (bytes, start, size, pos)))
    | _ -> (unicode, (char_size - offset))

exception Bad_unicode of string * int
  (* raised with an error description and an integer
   * which is either a prohibited or an invalid unicode code point *)

let utf8_of_unicode :
  int -> string =
    function
    | cod when cod >= 0x00 && cod <= 0x7F -> (* 0zzzzzzz -> 0zzzzzzz *)
        String.make 1 (Char.chr cod)
    | cod when cod <= 0x07FF -> (* 00000yyy yyzzzzzz -> 110yyyyy 10zzzzzz *)
        let str = String.create 2 in
        str.[0] <- Char.chr (0b110_00000 lor  (cod lsr 6));
        str.[1] <- Char.chr (0b10_000000 lor  (cod         land 0b00_111111));
        str
    | cod when cod >= 0xD800 && cod <= 0xDFFF ->
        (* The definition of UTF-8 prohibits encoding character numbers between
         * U+D800 and U+DFFF, which are reserved for use with the UTF-16
         * encoding form (as surrogate pairs) and do not directly represent characters. *)
        raise (Bad_unicode ("prohibited code point", cod))
    | cod when cod <= 0xFFFF -> (* xxxxyyyy yyzzzzzz -> 1110xxxx 10yyyyyy 10zzzzzz *)
        let str = String.create 3 in
        str.[0] <- Char.chr (0b1110_0000 lor  (cod lsr 12));
        str.[1] <- Char.chr (0b10_000000 lor ((cod lsr 6)  land 0b00_111111));
        str.[2] <- Char.chr (0b10_000000 lor ( cod         land 0b00_111111));
        str
    | cod when cod <= 0x10FFFF -> (* 000wwwxx xxxxyyyy yyzzzzzz -> 11110www 10xxxxxx 10yyyyyy 10zzzzzz *)
        let str = String.create 4 in
        str.[0] <- Char.chr (0b11110_000 lor ( cod lsr 18));
        str.[1] <- Char.chr (0b10_000000 lor ((cod lsr 12) land 0b00_111111));
        str.[2] <- Char.chr (0b10_000000 lor ((cod lsr 6)  land 0b00_111111));
        str.[3] <- Char.chr (0b10_000000 lor ( cod         land 0b00_111111));
        str
    | cod -> raise (Bad_unicode ("invalid code point", cod))