Version française
Home     About     Download     Resources     Contact us    
Browse thread
Utf8 (and other) code for ocaml (fwd)
[ Home ] [ Index: by date | by threads ]
[ Search: ]

[ Message by date: previous | next ] [ Message in thread: previous | next ] [ Thread: previous | next ]
Date: -- (:)
From: Pierre Weis <Pierre.Weis@i...>
Subject: Utf8 (and other) code for ocaml (fwd)
Dear Caml list readers,

Would you please note that, for security reasons, no attachments are
allowed in messages sent to you from the Caml list. So, please, do not
send messages with attachments to the list, since I'm obliged to send
these messages back, or to remove attachments by hand, as I did for
this message from John Skaller. I will not do this again.

Thank you in advance.

[Pierre Weis as moderator of the Caml list]

----- Forwarded message from skaller -----

>From weis@pauillac.inria.fr  Thu Oct 21 06:52:19 1999
Sender: root@ruby
Message-ID: <380E9AA4.8AEC3665@maxtal.com.au>
Date: Thu, 21 Oct 1999 14:46:28 +1000
From: skaller <skaller@maxtal.com.au>
Organization: Maxtal
X-Mailer: Mozilla 4.51 [en] (X11; I; Linux 2.2.12 i586)
X-Accept-Language: en
MIME-Version: 1.0
CC: caml-list@inria.fr
Subject: Utf8 (and other) code for ocaml
References: <380CB30E.56D1A8A2@maxtal.com.au> <99102100543400.15513@ice>

John Skaller, mailto:skaller@maxtal.com.au
1/10 Toxteth Rd Glebe NSW 2037 Australia
homepage: http://www.maxtal.com.au/~skaller
downloads: http://www.triode.net.au/~skaller

let hexchar_of_int i = 
  if i < 10 
  then char_of_int (i + (int_of_char '0'))
  else char_of_int (i- 10 + (int_of_char 'A'))
;;

let hex4 i = 
  let j = ref i in 
  let s = String.create 4 in
  for k = 0 to 3 do s.[3-k]  <- hexchar_of_int (!j mod 16); j := !j / 16 done;
  s
;;

let escape_of_char quote ch =
  if ch = '\\' then "\\\\"
  else if ch = quote then "\\" ^ (String.make 1 quote)
  else if ch = '\n' then "\\n"
  else if ch < ' ' 
  or ch > char_of_int 126
  then "\\u" ^ (hex4 (Char.code ch))
  else String.make 1 ch
;;

let escape_of_string quote x =
  let esc = escape_of_char quote in
  let res = ref "" in
  for i = 0 to (String.length x -1) do
    res := !res ^ (esc x.[i])
  done;
  (String.make 1) quote ^ !res ^ (String.make 1 quote)
;;

let dquote_of_string = escape_of_string '"';;
let quote_of_string = escape_of_string '\'';;


let hex_char2int s =
  let c = Char.code s in 
  match s with
    _ when (s >= '0' & s <= '9') ->
      c - (Char.code '0') 
  | _ when (s >= 'a' & s <= 'f') ->
      (c - (Char.code 'a')) + 10 
  | _ when (s >= 'A' & s <= 'F') ->
      (c - (Char.code 'A')) + 10 
  | _ -> raise (Py_exceptions.LexError "in hexadecimal character")
;; 

let oct_char2int s =
  let c = Char.code s in 
  match s with
    _ when (s >= '0' & s <= '7') ->
      c - (Char.code '0') 
  | _ -> raise (Py_exceptions.LexError "in octal character")
;; 

let dec_char2int s =
  let c = Char.code s in 
  match s with
    _ when (s >= '0' & s <= '9') ->
      c - (Char.code '0') 
  | _ -> raise (Py_exceptions.LexError "in decimal character")
;; 

let len = String.length;;

let hexint_of_string s =
  let len = len s in
  let value = ref (hex_char2int s.[2]) in 
  for i = 3 to (len - 1) do
    value := !value * 16 + (hex_char2int s.[i])
  done;
  !value
;;

let hexbig_int_of_string s =
  let len = len s in
  let value = ref (Big_int.big_int_of_int (hex_char2int s.[2])) in 
  for i = 3 to (len - 1) do
    value := 
      Big_int.add_int_big_int 
        (hex_char2int s.[i]) 
        (Big_int.mult_int_big_int 16 !value)
  done;
  !value
;;

(* WARNING: THIS CODE WILL NOT WORK FOR THE HIGHER PLANES
  BECAUSE OCAML ONLY SUPPORTS 31 bit signed integers;
  THIS CODE REQUIRES 32 bits [This can be fixed by using 
  negative codes but hasn't been done]
*)

(* parse the first utf8 encoded character of a string s
  starting at index position i, return a pair
  consisting of the decoded integers, and the position 
  of the first character not decoded.

  If the first character is bad, it is returned,
  otherwise if the encoding is bad, the result is
  an unspecified value.

  COMPATIBILITY NOTE: if this function is called
  with a SINGLE character string, it will return
  the usual value for the character, in range
  0 .. 255
*)

let parse_utf8 (s:string)  (i:int) : int * int =
  let ord = int_of_char 
  and n = (String.length s)  - i
  in if n <= 0 then begin print_endline "FAILURE"; (-1),i end
  else let lead = ord (s.[i]) in
    if (lead land 0x80) = 0 then 
      lead land 0x7F,i+1 (* ASCII *)
    else if lead land 0xE0 = 0xC0 && n > 1 then
      ((lead land 0x1F)  lsl  6) lor
        (ord(s.[i+1]) land 0x3F),i+2
    else if lead land 0xF0 = 0xE0 && n > 2 then
      ((lead land 0x1F) lsl 12) lor
        ((ord(s.[i+1]) land 0x3F)  lsl 6) lor
        (ord(s.[i+2]) land 0x3F),i+3
    else if lead land 0xF8 = 0xF0 && n > 3 then
      ((lead land 0x1F) lsl 18) lor
        ((ord(s.[i+1]) land 0x3F)  lsl 12) lor
        ((ord(s.[i+2]) land 0x3F)  lsl 6) lor
        (ord(s.[i+3]) land 0x3F),i+4
    else if lead land 0xFC = 0xF8 && n > 4 then
      ((lead land 0x1F) lsl 24) lor 
        ((ord(s.[i+1]) land 0x3F)  lsl 18) lor
        ((ord(s.[i+2]) land 0x3F)  lsl 12) lor
        ((ord(s.[i+3]) land 0x3F)  lsl 6) lor
        (ord(s.[i+4]) land 0x3F),i+5
    else if lead land 0xFE = 0xFC && n > 5 then
      ((lead land 0x1F) lsl 30) lor
        ((ord(s.[i+1]) land 0x3F)  lsl 24) lor
        ((ord(s.[i+2]) land 0x3F)  lsl 18) lor
        ((ord(s.[i+3]) land 0x3F)  lsl 12) lor
        ((ord(s.[i+4]) land 0x3F)  lsl 6) lor
        (ord(s.[i+5]) land 0x3F),i+6
    else lead, i+1  (* error, just use bad character *)
;;

(* convert an integer into a utf-8 encoded string of bytes *)
let utf8_of_int i =
  let chr x = String.make 1 (Char.chr x) in
  if i < 0x80 then 
     chr(i)
  else if i < 0x800 then 
     chr(0xC0 lor ((i lsr 6) land 0x1F))  ^
      chr(0x80 lor (i land 0x3F))
  else if i < 0x10000 then 
     chr(0xE0 lor ((i lsr 12) land 0xF)) ^
      chr(0x80 lor ((i lsr 6) land 0x3F)) ^
      chr(0x80 lor (i land 0x3F))
  else if i < 0x200000 then 
     chr(0xF0 lor ((i lsr 18) land 0x7)) ^
      chr(0x80 lor ((i lsr 12) land 0x3F)) ^
      chr(0x80 lor ((i lsr 6) land 0x3F)) ^
      chr(0x80 lor (i land 0x3F))
  else if i < 0x4000000 then 
     chr(0xF8 lor ((i lsr 24) land 0x3)) ^
      chr(0x80 lor ((i lsr 18) land 0x3F)) ^
      chr(0x80 lor ((i lsr 12) land 0x3F)) ^
      chr(0x80 lor ((i lsr 6) land 0x3F)) ^
      chr(0x80 lor (i land 0x3F))
  else chr(0xFC lor ((i lsr 30) land 0x1)) ^
    chr(0x80 lor ((i lsr 24) land 0x3F)) ^
    chr(0x80 lor ((i lsr 18) land 0x3F)) ^
    chr(0x80 lor ((i lsr 12) land 0x3F)) ^
    chr(0x80 lor ((i lsr 6) land 0x3F)) ^
    chr(0x80 lor (i land 0x3F))
;;

let unescape s = 
  let hex_limit = 2 in
  let n = len s in
  let s' = ref "" in
  let tack_string ss = s':= !s' ^ ss in 
  let tack_char ch = tack_string (String.make 1 ch) in 
  let tack_utf8 code = tack_string (utf8_of_int code) in
  let i= ref 0 in 
  while !i< n do let ch = s.[!i] in
    if ch = '\\' then begin
      incr i;
      if !i = n then tack_char '\\'
      else match s.[!i] with
      | 'a'  -> tack_char  '\007'; incr i    (* 7 *)
      | 't'  -> tack_char  '\t'; incr i    (* 9 *)
      | 'n'  -> tack_char  '\n'; incr i    (* 10 *)
      | 'r'  -> tack_char  '\r'; incr i    (* 13 *)
      | 'v'  -> tack_char  '\011'; incr i
      | 'f'  -> tack_char  '\012'; incr i
      | 'b'  -> tack_char  '\008'; incr i
      | '\\' -> tack_char  '\\'; incr i
      | '"'  -> tack_char  '"'; incr i (* NOTE OCAMLLEX BUG: TWO SPACES REQUIRED *)
      | '\'' -> tack_char  '\''; incr i
      | 'x' -> 
        begin 
          incr i;
          let j = ref 0 and value = ref 0 in
          while 
            (!i < n) & 
            (!j < hex_limit) &
            (String.contains "0123456789ABCDEFabcdef" s.[!i]) do
            value := !value * 16 + (hex_char2int s.[!i]); 
            incr i;
            incr j
          done;
          tack_utf8 !value
        end
      | 'u' -> 
        begin 
          incr i;
          let j = ref 0 and value = ref 0 in
          while 
            (!i < n) & 
            (!j < 4) &
            (String.contains "0123456789ABCDEFabcdef" s.[!i]) do
            value := !value * 16 + (hex_char2int s.[!i]); 
            incr i;
            incr j
          done;
          tack_utf8 !value
        end
      | 'U' -> 
        begin 
          incr i;
          let j = ref 0 and value = ref 0 in
          while 
            (!i < n) & 
            (!j < 8) &
            (String.contains "0123456789ABCDEFabcdef" s.[!i]) do
            value := !value * 16 + (hex_char2int s.[!i]); 
            incr i;
            incr j
          done;
          tack_utf8 !value
        end
      | 'd' -> 
        begin 
          incr i;
          let j = ref 0 and value = ref 0 in
          while 
            (!i < n) & 
            (!j < 3) &
            (String.contains "0123456789" s.[!i]) do
            value := !value * 10 + (dec_char2int s.[!i]); 
            incr i;
            incr j
          done;
          tack_utf8 !value
        end
      | 'o' -> 
        begin 
          incr i;
          let j = ref 0 and value = ref 0 in
          while 
            (!i < n) & 
            (!j < 3) &
            (String.contains "01234567" s.[!i]) do
            value := !value * 8 + (oct_char2int s.[!i]); 
            incr i;
            incr j
          done;
          tack_utf8 !value
        end
      | '0' 
      | '1' 
      | '2' 
      | '3' 
      | '4' 
      | '5' 
      | '6' 
      | '7' ->
        begin 
          let j = ref 0 and value = ref 0 in
          while 
            (!i < n) & 
            (!j < 3) &
            (String.contains "01234567" s.[!i]) do
            value := !value * 8 + (oct_char2int s.[!i]); 
            incr i;
            incr j
          done;
          tack_utf8 !value
        end


      | x -> tack_char '\\'; tack_char x;
        incr i;
    end else (tack_char s.[!i];  incr i)
  done;
  !s'
;;


----- End of forwarded message from skaller -----