Version française
Home     About     Download     Resources     Contact us    
Browse thread
Re: Syntax for label
[ Home ] [ Index: by date | by threads ]
[ Search: ]

[ Message by date: previous | next ] [ Message in thread: previous | next ] [ Thread: previous | next ]
Date: -- (:)
From: Max Skaller <maxs@i...>
Subject: Re: Unsigned integers?
Sven LUTHER wrote:
> 
> On Wed, Mar 22, 2000 at 09:22:15AM +1100, John Max Skaller wrote:
> > I have some code for processing ISO-10646 characters and UTF-8,
> > which uses caml integers. ISO-10646 has 2^31 code points, which
> > can be covered by caml integers on a 32bit machine. Using an
> > unboxed type is mandatory for performance.
> >
> > Unfortunately, caml integers are signed, which makes most of the
> > code I have written wrong (I haven't taken the care to handle
> > integers over 2^30 correctly).
> >
> > What is the best way to handle this problem?
> > Would a (standard?) library module (written in C), that treats
> > integers as unsigned be a reasonable solution?
> >
> > [This may require writing 'uint_add x y' instead of 'x+y',
> > but that doesn't matter in the above mentioned application,
> > since the integers are being used to represent characters]
> 
> Just use the caml integer and ignore the fact that they are signed ?
> 
> after the moto :  that doesn't matter in the above mentioned application,

Perhaps my explanation was unclear. In my code, I must 
calculate a UTF-8 encoding from a ISO-10646 code point,
and calculate an ISO-10646 code point from a UTF-8 encoding.

The code is below. The code works for values <2^30,
but fails when and int goes negative.

I would be happy to replace, in this code,
evey use of 'lor', 'land', + - * < etc with
'ulor' 'uland' 'uplus' 'uminus' 'uless' etc, if only
I could define them. (I could do this in C .. but then,
I could write the below routines in C too)

Note these operations MUST be extremely fast,
and in particular, compact storage of ISO-10646
code points in arrays of integers is OK,
while arrays of boxed values is out of the question.
(So I can't use int32).

-------------------------------------------------------


let parse_utf8 (s : string)  (i : int) : int * int =
  let ord = int_of_char 
  and n = (String.length s)  - i
  in if n <= 0 then begin print_endline "FAILURE"; (-1),i end
  else let lead = ord (s.[i]) in
    if (lead land 0x80) = 0 then 
      lead land 0x7F,i+1 (* ASCII *)
    else if lead land 0xE0 = 0xC0 && n > 1 then
      ((lead land 0x1F)  lsl  6) lor
        (ord(s.[i+1]) land 0x3F),i+2
    else if lead land 0xF0 = 0xE0 && n > 2 then
      ((lead land 0x1F) lsl 12) lor
        ((ord(s.[i+1]) land 0x3F)  lsl 6) lor
        (ord(s.[i+2]) land 0x3F),i+3
    else if lead land 0xF8 = 0xF0 && n > 3 then
      ((lead land 0x1F) lsl 18) lor
        ((ord(s.[i+1]) land 0x3F)  lsl 12) lor
        ((ord(s.[i+2]) land 0x3F)  lsl 6) lor
        (ord(s.[i+3]) land 0x3F),i+4
    else if lead land 0xFC = 0xF8 && n > 4 then
      ((lead land 0x1F) lsl 24) lor 
        ((ord(s.[i+1]) land 0x3F)  lsl 18) lor
        ((ord(s.[i+2]) land 0x3F)  lsl 12) lor
        ((ord(s.[i+3]) land 0x3F)  lsl 6) lor
        (ord(s.[i+4]) land 0x3F),i+5
    else if lead land 0xFE = 0xFC && n > 5 then
      ((lead land 0x1F) lsl 30) lor
        ((ord(s.[i+1]) land 0x3F)  lsl 24) lor
        ((ord(s.[i+2]) land 0x3F)  lsl 18) lor
        ((ord(s.[i+3]) land 0x3F)  lsl 12) lor
        ((ord(s.[i+4]) land 0x3F)  lsl 6) lor
        (ord(s.[i+5]) land 0x3F),i+6
    else lead, i+1  (* error, just use bad character *)

(* convert an integer into a utf-8 encoded string of bytes *)
let utf8_of_int i =
  let chr x = String.make 1 (Char.chr x) in
  if i < 0x80 then 
     chr(i)
  else if i < 0x800 then 
     chr(0xC0 lor ((i lsr 6) land 0x1F))  ^
      chr(0x80 lor (i land 0x3F))
  else if i < 0x10000 then 
     chr(0xE0 lor ((i lsr 12) land 0xF)) ^
      chr(0x80 lor ((i lsr 6) land 0x3F)) ^
      chr(0x80 lor (i land 0x3F))
  else if i < 0x200000 then 
     chr(0xF0 lor ((i lsr 18) land 0x7)) ^
      chr(0x80 lor ((i lsr 12) land 0x3F)) ^
      chr(0x80 lor ((i lsr 6) land 0x3F)) ^
      chr(0x80 lor (i land 0x3F))
  else if i < 0x4000000 then 
     chr(0xF8 lor ((i lsr 24) land 0x3)) ^
      chr(0x80 lor ((i lsr 18) land 0x3F)) ^
      chr(0x80 lor ((i lsr 12) land 0x3F)) ^
      chr(0x80 lor ((i lsr 6) land 0x3F)) ^
      chr(0x80 lor (i land 0x3F))
  else chr(0xFC lor ((i lsr 30) land 0x1)) ^
    chr(0x80 lor ((i lsr 24) land 0x3F)) ^
    chr(0x80 lor ((i lsr 18) land 0x3F)) ^
    chr(0x80 lor ((i lsr 12) land 0x3F)) ^
    chr(0x80 lor ((i lsr 6) land 0x3F)) ^
    chr(0x80 lor (i land 0x3F))


	

-- 
John (Max) Skaller at OTT [Open Telecommications Ltd]
mailto:maxs@in.ot.com.au      -- at work
mailto:skaller@maxtal.com.au  -- at home