From: Pierre Weis <Pierre.Weis@inria.fr>
Message-Id: <199910211706.TAA32111@pauillac.inria.fr>
Subject: Utf8 (and other) code for ocaml (fwd)
To: caml-list@inria.fr
Date: Thu, 21 Oct 1999 19:06:08 +0200 (MET DST)
Dear Caml list readers,
Would you please note that, for security reasons, no attachments are
allowed in messages sent to you from the Caml list. So, please, do not
send messages with attachments to the list, since I'm obliged to send
these messages back, or to remove attachments by hand, as I did for
this message from John Skaller. I will not do this again.
Thank you in advance.
[Pierre Weis as moderator of the Caml list]
----- Forwarded message from skaller -----
>From weis@pauillac.inria.fr Thu Oct 21 06:52:19 1999
Sender: root@ruby
Message-ID: <380E9AA4.8AEC3665@maxtal.com.au>
Date: Thu, 21 Oct 1999 14:46:28 +1000
From: skaller <skaller@maxtal.com.au>
Organization: Maxtal
X-Mailer: Mozilla 4.51 [en] (X11; I; Linux 2.2.12 i586)
X-Accept-Language: en
MIME-Version: 1.0
CC: caml-list@inria.fr
Subject: Utf8 (and other) code for ocaml
References: <380CB30E.56D1A8A2@maxtal.com.au> <99102100543400.15513@ice>
Content-Type: multipart/mixed;
boundary="------------7909F09AC2B5B51EA3C68BFA"
-- John Skaller, mailto:skaller@maxtal.com.au 1/10 Toxteth Rd Glebe NSW 2037 Australia homepage: http://www.maxtal.com.au/~skaller downloads: http://www.triode.net.au/~skallerlet hexchar_of_int i = if i < 10 then char_of_int (i + (int_of_char '0')) else char_of_int (i- 10 + (int_of_char 'A')) ;;
let hex4 i = let j = ref i in let s = String.create 4 in for k = 0 to 3 do s.[3-k] <- hexchar_of_int (!j mod 16); j := !j / 16 done; s ;;
let escape_of_char quote ch = if ch = '\\' then "\\\\" else if ch = quote then "\\" ^ (String.make 1 quote) else if ch = '\n' then "\\n" else if ch < ' ' or ch > char_of_int 126 then "\\u" ^ (hex4 (Char.code ch)) else String.make 1 ch ;;
let escape_of_string quote x = let esc = escape_of_char quote in let res = ref "" in for i = 0 to (String.length x -1) do res := !res ^ (esc x.[i]) done; (String.make 1) quote ^ !res ^ (String.make 1 quote) ;;
let dquote_of_string = escape_of_string '"';; let quote_of_string = escape_of_string '\'';;
let hex_char2int s = let c = Char.code s in match s with _ when (s >= '0' & s <= '9') -> c - (Char.code '0') | _ when (s >= 'a' & s <= 'f') -> (c - (Char.code 'a')) + 10 | _ when (s >= 'A' & s <= 'F') -> (c - (Char.code 'A')) + 10 | _ -> raise (Py_exceptions.LexError "in hexadecimal character") ;;
let oct_char2int s = let c = Char.code s in match s with _ when (s >= '0' & s <= '7') -> c - (Char.code '0') | _ -> raise (Py_exceptions.LexError "in octal character") ;;
let dec_char2int s = let c = Char.code s in match s with _ when (s >= '0' & s <= '9') -> c - (Char.code '0') | _ -> raise (Py_exceptions.LexError "in decimal character") ;;
let len = String.length;;
let hexint_of_string s = let len = len s in let value = ref (hex_char2int s.[2]) in for i = 3 to (len - 1) do value := !value * 16 + (hex_char2int s.[i]) done; !value ;;
let hexbig_int_of_string s = let len = len s in let value = ref (Big_int.big_int_of_int (hex_char2int s.[2])) in for i = 3 to (len - 1) do value := Big_int.add_int_big_int (hex_char2int s.[i]) (Big_int.mult_int_big_int 16 !value) done; !value ;;
(* WARNING: THIS CODE WILL NOT WORK FOR THE HIGHER PLANES BECAUSE OCAML ONLY SUPPORTS 31 bit signed integers; THIS CODE REQUIRES 32 bits [This can be fixed by using negative codes but hasn't been done] *)
(* parse the first utf8 encoded character of a string s starting at index position i, return a pair consisting of the decoded integers, and the position of the first character not decoded.
If the first character is bad, it is returned, otherwise if the encoding is bad, the result is an unspecified value.
COMPATIBILITY NOTE: if this function is called with a SINGLE character string, it will return the usual value for the character, in range 0 .. 255 *)
let parse_utf8 (s:string) (i:int) : int * int = let ord = int_of_char and n = (String.length s) - i in if n <= 0 then begin print_endline "FAILURE"; (-1),i end else let lead = ord (s.[i]) in if (lead land 0x80) = 0 then lead land 0x7F,i+1 (* ASCII *) else if lead land 0xE0 = 0xC0 && n > 1 then ((lead land 0x1F) lsl 6) lor (ord(s.[i+1]) land 0x3F),i+2 else if lead land 0xF0 = 0xE0 && n > 2 then ((lead land 0x1F) lsl 12) lor ((ord(s.[i+1]) land 0x3F) lsl 6) lor (ord(s.[i+2]) land 0x3F),i+3 else if lead land 0xF8 = 0xF0 && n > 3 then ((lead land 0x1F) lsl 18) lor ((ord(s.[i+1]) land 0x3F) lsl 12) lor ((ord(s.[i+2]) land 0x3F) lsl 6) lor (ord(s.[i+3]) land 0x3F),i+4 else if lead land 0xFC = 0xF8 && n > 4 then ((lead land 0x1F) lsl 24) lor ((ord(s.[i+1]) land 0x3F) lsl 18) lor ((ord(s.[i+2]) land 0x3F) lsl 12) lor ((ord(s.[i+3]) land 0x3F) lsl 6) lor (ord(s.[i+4]) land 0x3F),i+5 else if lead land 0xFE = 0xFC && n > 5 then ((lead land 0x1F) lsl 30) lor ((ord(s.[i+1]) land 0x3F) lsl 24) lor ((ord(s.[i+2]) land 0x3F) lsl 18) lor ((ord(s.[i+3]) land 0x3F) lsl 12) lor ((ord(s.[i+4]) land 0x3F) lsl 6) lor (ord(s.[i+5]) land 0x3F),i+6 else lead, i+1 (* error, just use bad character *) ;;
(* convert an integer into a utf-8 encoded string of bytes *) let utf8_of_int i = let chr x = String.make 1 (Char.chr x) in if i < 0x80 then chr(i) else if i < 0x800 then chr(0xC0 lor ((i lsr 6) land 0x1F)) ^ chr(0x80 lor (i land 0x3F)) else if i < 0x10000 then chr(0xE0 lor ((i lsr 12) land 0xF)) ^ chr(0x80 lor ((i lsr 6) land 0x3F)) ^ chr(0x80 lor (i land 0x3F)) else if i < 0x200000 then chr(0xF0 lor ((i lsr 18) land 0x7)) ^ chr(0x80 lor ((i lsr 12) land 0x3F)) ^ chr(0x80 lor ((i lsr 6) land 0x3F)) ^ chr(0x80 lor (i land 0x3F)) else if i < 0x4000000 then chr(0xF8 lor ((i lsr 24) land 0x3)) ^ chr(0x80 lor ((i lsr 18) land 0x3F)) ^ chr(0x80 lor ((i lsr 12) land 0x3F)) ^ chr(0x80 lor ((i lsr 6) land 0x3F)) ^ chr(0x80 lor (i land 0x3F)) else chr(0xFC lor ((i lsr 30) land 0x1)) ^ chr(0x80 lor ((i lsr 24) land 0x3F)) ^ chr(0x80 lor ((i lsr 18) land 0x3F)) ^ chr(0x80 lor ((i lsr 12) land 0x3F)) ^ chr(0x80 lor ((i lsr 6) land 0x3F)) ^ chr(0x80 lor (i land 0x3F)) ;;
let unescape s = let hex_limit = 2 in let n = len s in let s' = ref "" in let tack_string ss = s':= !s' ^ ss in let tack_char ch = tack_string (String.make 1 ch) in let tack_utf8 code = tack_string (utf8_of_int code) in let i= ref 0 in while !i< n do let ch = s.[!i] in if ch = '\\' then begin incr i; if !i = n then tack_char '\\' else match s.[!i] with | 'a' -> tack_char '\007'; incr i (* 7 *) | 't' -> tack_char '\t'; incr i (* 9 *) | 'n' -> tack_char '\n'; incr i (* 10 *) | 'r' -> tack_char '\r'; incr i (* 13 *) | 'v' -> tack_char '\011'; incr i | 'f' -> tack_char '\012'; incr i | 'b' -> tack_char '\008'; incr i | '\\' -> tack_char '\\'; incr i | '"' -> tack_char '"'; incr i (* NOTE OCAMLLEX BUG: TWO SPACES REQUIRED *) | '\'' -> tack_char '\''; incr i | 'x' -> begin incr i; let j = ref 0 and value = ref 0 in while (!i < n) & (!j < hex_limit) & (String.contains "0123456789ABCDEFabcdef" s.[!i]) do value := !value * 16 + (hex_char2int s.[!i]); incr i; incr j done; tack_utf8 !value end | 'u' -> begin incr i; let j = ref 0 and value = ref 0 in while (!i < n) & (!j < 4) & (String.contains "0123456789ABCDEFabcdef" s.[!i]) do value := !value * 16 + (hex_char2int s.[!i]); incr i; incr j done; tack_utf8 !value end | 'U' -> begin incr i; let j = ref 0 and value = ref 0 in while (!i < n) & (!j < 8) & (String.contains "0123456789ABCDEFabcdef" s.[!i]) do value := !value * 16 + (hex_char2int s.[!i]); incr i; incr j done; tack_utf8 !value end | 'd' -> begin incr i; let j = ref 0 and value = ref 0 in while (!i < n) & (!j < 3) & (String.contains "0123456789" s.[!i]) do value := !value * 10 + (dec_char2int s.[!i]); incr i; incr j done; tack_utf8 !value end | 'o' -> begin incr i; let j = ref 0 and value = ref 0 in while (!i < n) & (!j < 3) & (String.contains "01234567" s.[!i]) do value := !value * 8 + (oct_char2int s.[!i]); incr i; incr j done; tack_utf8 !value end | '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' -> begin let j = ref 0 and value = ref 0 in while (!i < n) & (!j < 3) & (String.contains "01234567" s.[!i]) do value := !value * 8 + (oct_char2int s.[!i]); incr i; incr j done; tack_utf8 !value end
| x -> tack_char '\\'; tack_char x; incr i; end else (tack_char s.[!i]; incr i) done; !s' ;;
----- End of forwarded message from skaller -----
This archive was generated by hypermail 2b29 : Sun Jan 02 2000 - 11:58:27 MET