Sven LUTHER wrote:
>
> On Wed, Mar 22, 2000 at 09:22:15AM +1100, John Max Skaller wrote:
> > I have some code for processing ISO-10646 characters and UTF-8,
> > which uses caml integers. ISO-10646 has 2^31 code points, which
> > can be covered by caml integers on a 32bit machine. Using an
> > unboxed type is mandatory for performance.
> >
> > Unfortunately, caml integers are signed, which makes most of the
> > code I have written wrong (I haven't taken the care to handle
> > integers over 2^30 correctly).
> >
> > What is the best way to handle this problem?
> > Would a (standard?) library module (written in C), that treats
> > integers as unsigned be a reasonable solution?
> >
> > [This may require writing 'uint_add x y' instead of 'x+y',
> > but that doesn't matter in the above mentioned application,
> > since the integers are being used to represent characters]
>
> Just use the caml integer and ignore the fact that they are signed ?
>
> after the motto: that doesn't matter in the above mentioned application,
Perhaps my explanation was unclear. In my code, I must
calculate a UTF-8 encoding from an ISO-10646 code point,
and calculate an ISO-10646 code point from a UTF-8 encoding.
The code is below. The code works for values <2^30,
but fails when an int goes negative.
I would be happy to replace, in this code,
every use of 'lor', 'land', + - * < etc with
'ulor' 'uland' 'uplus' 'uminus' 'uless' etc, if only
I could define them. (I could do this in C .. but then,
I could write the below routines in C too)
Note these operations MUST be extremely fast,
and in particular, compact storage of ISO-10646
code points in arrays of integers is OK,
while arrays of boxed values is out of the question.
(So I can't use int32).
-------------------------------------------------------
(** [parse_utf8 s i] decodes one UTF-8 sequence (1 to 6 bytes, the original
    31-bit scheme) starting at byte offset [i] of [s].

    Returns [(code_point, next)], where [next] is the offset just past the
    bytes that were consumed.

    - If [i] is at or beyond the end of [s], prints ["FAILURE"] on standard
      output and returns [(-1, i)].
    - If the byte at [i] is not a valid lead byte, or [s] is too short for
      the sequence that the lead byte announces, the lead byte itself is
      returned as the value and [next] is [i + 1].

    Continuation bytes are not validated: only their low six bits are used.
    On a platform with 31-bit ints, a six-byte sequence whose top payload bit
    is set produces a negative result.

    @raise Invalid_argument if [i] is negative (from the bounds-checked
    string access). *)
let parse_utf8 (s : string) (i : int) : int * int =
  let n = String.length s - i in
  if n <= 0 then begin
    print_endline "FAILURE";
    (-1, i)
  end else begin
    let lead = int_of_char s.[i] in
    (* [cont k] is the 6-bit payload of the continuation byte at [i + k]. *)
    let cont k = int_of_char s.[i + k] land 0x3F in
    (* The payload mask of the lead byte shrinks by one bit for each extra
       byte in the sequence.  Using 0x1F everywhere would let the marker
       bits of 4-, 5- and 6-byte lead bytes leak into the code point. *)
    if lead land 0x80 = 0 then
      (lead, i + 1) (* ASCII *)
    else if lead land 0xE0 = 0xC0 && n > 1 then
      (((lead land 0x1F) lsl 6) lor cont 1, i + 2)
    else if lead land 0xF0 = 0xE0 && n > 2 then
      ( ((lead land 0x0F) lsl 12)
        lor (cont 1 lsl 6)
        lor cont 2,
        i + 3 )
    else if lead land 0xF8 = 0xF0 && n > 3 then
      ( ((lead land 0x07) lsl 18)
        lor (cont 1 lsl 12)
        lor (cont 2 lsl 6)
        lor cont 3,
        i + 4 )
    else if lead land 0xFC = 0xF8 && n > 4 then
      ( ((lead land 0x03) lsl 24)
        lor (cont 1 lsl 18)
        lor (cont 2 lsl 12)
        lor (cont 3 lsl 6)
        lor cont 4,
        i + 5 )
    else if lead land 0xFE = 0xFC && n > 5 then
      ( ((lead land 0x01) lsl 30)
        lor (cont 1 lsl 24)
        lor (cont 2 lsl 18)
        lor (cont 3 lsl 12)
        lor (cont 4 lsl 6)
        lor cont 5,
        i + 6 )
    else
      (lead, i + 1) (* error, just use bad character *)
  end
(** [utf8_of_int i] converts the ISO-10646 code point [i] into its UTF-8
    encoding, returned as a string of 1 to 6 bytes (the original 31-bit
    UTF-8 scheme).

    [i] is interpreted as an unsigned value: the length of the encoding is
    chosen with logical shifts rather than signed comparisons.  On a
    platform with 31-bit ints, a negative [i] therefore stands for a code
    point in the range 2^30 .. 2^31-1 and gets the six-byte form instead of
    failing in [Char.chr].

    @raise Invalid_argument if [i] has any bit set above bit 30, which can
    only happen when ints are wider than 31 bits (this includes every
    negative int on such a platform). *)
let utf8_of_int i =
  if i lsr 31 <> 0 then invalid_arg "utf8_of_int: code point out of range";
  (* Build the result in a single buffer rather than concatenating
     one-character strings. *)
  let buf = Buffer.create 6 in
  let put b = Buffer.add_char buf (Char.chr b) in
  (* [cont shift] emits a continuation byte carrying the six bits of [i]
     that start at bit [shift]. *)
  let cont shift = put (0x80 lor ((i lsr shift) land 0x3F)) in
  if i lsr 7 = 0 then
    put i
  else if i lsr 11 = 0 then begin
    put (0xC0 lor ((i lsr 6) land 0x1F));
    cont 0
  end else if i lsr 16 = 0 then begin
    put (0xE0 lor ((i lsr 12) land 0xF));
    cont 6;
    cont 0
  end else if i lsr 21 = 0 then begin
    put (0xF0 lor ((i lsr 18) land 0x7));
    cont 12;
    cont 6;
    cont 0
  end else if i lsr 26 = 0 then begin
    put (0xF8 lor ((i lsr 24) land 0x3));
    cont 18;
    cont 12;
    cont 6;
    cont 0
  end else begin
    put (0xFC lor ((i lsr 30) land 0x1));
    cont 24;
    cont 18;
    cont 12;
    cont 6;
    cont 0
  end;
  Buffer.contents buf
-- John (Max) Skaller at OTT [Open Telecommunications Ltd] mailto:maxs@in.ot.com.au -- at work mailto:skaller@maxtal.com.au -- at home
This archive was generated by hypermail 2b29 : Thu Mar 23 2000 - 13:54:56 MET