Re: Unsigned integers?

From: Max Skaller (maxs@in.ot.com.au)
Date: Thu Mar 23 2000 - 03:08:54 MET

  • Next message: Max Skaller: "Re: Unsigned integers?"

    Sven LUTHER wrote:
    >
    > On Wed, Mar 22, 2000 at 09:22:15AM +1100, John Max Skaller wrote:
    > > I have some code for processing ISO-10646 characters and UTF-8,
    > > which uses caml integers. ISO-10646 has 2^31 code points, which
    > > can be covered by caml integers on a 32bit machine. Using an
    > > unboxed type is mandatory for performance.
    > >
    > > Unfortunately, caml integers are signed, which makes most of the
    > > code I have written wrong (I haven't taken the care to handle
    > > integers over 2^30 correctly).
    > >
    > > What is the best way to handle this problem?
    > > Would a (standard?) library module (written in C), that treats
    > > integers as unsigned be a reasonable solution?
    > >
    > > [This may require writing 'uint_add x y' instead of 'x+y',
    > > but that doesn't matter in the above mentioned application,
    > > since the integers are being used to represent characters]
    >
    > Just use the caml integer and ignore the fact that they are signed ?
    >
    > after the moto : that doesn't matter in the above mentioned application,

    Perhaps my explanation was unclear. In my code, I must
    calculate a UTF-8 encoding from an ISO-10646 code point,
    and calculate an ISO-10646 code point from a UTF-8 encoding.

    The code is below. The code works for values <2^30,
    but fails when an int goes negative.

    I would be happy to replace, in this code,
    every use of 'lor', 'land', + - * < etc with
    'ulor' 'uland' 'uplus' 'uminus' 'uless' etc, if only
    I could define them. (I could do this in C .. but then,
    I could write the below routines in C too)

    Note these operations MUST be extremely fast,
    and in particular, compact storage of ISO-10646
    code points in arrays of integers is OK,
    while arrays of boxed values is out of the question.
    (So I can't use int32).

    -------------------------------------------------------

    (* [parse_utf8 s i] decodes the UTF-8 sequence starting at byte offset
       [i] of [s] and returns [(code, next)]: the decoded code point and the
       offset of the first byte after the sequence.

       - If [i] is at or past the end of [s], "FAILURE" is printed on stdout
         and [(-1, i)] is returned.
       - If the byte at [i] is not a valid lead byte, or [s] ends before the
         sequence is complete, the byte itself is returned as the code and
         exactly one byte is consumed.
       - Continuation bytes are not validated; only their low 6 bits are used.
       - A negative [i] is not checked here; the access [s.[i]] raises
         [Invalid_argument] when bounds checking is enabled.

       Sequences of 1 to 6 bytes are accepted.  Each lead byte is masked so
       that only its payload bits contribute: 5 bits for a 2-byte sequence,
       then 4, 3, 2 and 1 bits for 3 to 6 bytes.  Where [int] is 31 bits
       wide, a 6-byte sequence whose lead payload bit is set produces a
       negative [int] carrying the same bit pattern. *)
    let parse_utf8 (s : string) (i : int) : int * int =
      let ord = int_of_char
      and n = String.length s - i in
      if n <= 0 then begin print_endline "FAILURE"; ((-1), i) end
      else
        let lead = ord s.[i] in
        if lead land 0x80 = 0 then
          (* 0xxxxxxx: ASCII *)
          (lead, i + 1)
        else if lead land 0xE0 = 0xC0 && n > 1 then
          (* 110xxxxx: 5 payload bits in the lead byte *)
          ( ((lead land 0x1F) lsl 6)
            lor ((ord s.[i + 1]) land 0x3F),
            i + 2 )
        else if lead land 0xF0 = 0xE0 && n > 2 then
          (* 1110xxxx: 4 payload bits in the lead byte *)
          ( ((lead land 0x0F) lsl 12)
            lor (((ord s.[i + 1]) land 0x3F) lsl 6)
            lor ((ord s.[i + 2]) land 0x3F),
            i + 3 )
        else if lead land 0xF8 = 0xF0 && n > 3 then
          (* 11110xxx: 3 payload bits; a wider mask would leak marker bit 4 *)
          ( ((lead land 0x07) lsl 18)
            lor (((ord s.[i + 1]) land 0x3F) lsl 12)
            lor (((ord s.[i + 2]) land 0x3F) lsl 6)
            lor ((ord s.[i + 3]) land 0x3F),
            i + 4 )
        else if lead land 0xFC = 0xF8 && n > 4 then
          (* 111110xx: 2 payload bits in the lead byte *)
          ( ((lead land 0x03) lsl 24)
            lor (((ord s.[i + 1]) land 0x3F) lsl 18)
            lor (((ord s.[i + 2]) land 0x3F) lsl 12)
            lor (((ord s.[i + 3]) land 0x3F) lsl 6)
            lor ((ord s.[i + 4]) land 0x3F),
            i + 5 )
        else if lead land 0xFE = 0xFC && n > 5 then
          (* 1111110x: 1 payload bit in the lead byte *)
          ( ((lead land 0x01) lsl 30)
            lor (((ord s.[i + 1]) land 0x3F) lsl 24)
            lor (((ord s.[i + 2]) land 0x3F) lsl 18)
            lor (((ord s.[i + 3]) land 0x3F) lsl 12)
            lor (((ord s.[i + 4]) land 0x3F) lsl 6)
            lor ((ord s.[i + 5]) land 0x3F),
            i + 6 )
        else
          (* error: stray continuation byte, 0xFE/0xFF, or truncated
             sequence -- just hand back the offending byte *)
          (lead, i + 1)

    (* [utf8_of_int i] converts the code point [i] into its UTF-8 encoding,
       returned as a string of 1 to 6 bytes.

       The shortest form that fits is chosen: 1 byte below 0x80, 2 below
       0x800, 3 below 0x10000, 4 below 0x200000, 5 below 0x4000000, and 6
       bytes otherwise.  Only bits 0..30 of [i] are encoded; any higher
       bits are ignored.

       Negative arguments: where [int] is 31 bits wide, code points at or
       above 2^30 can only be stored as negative integers, so a negative
       [i] is read as an unsigned 31-bit pattern and encoded in 6 bytes.
       With a wider [int] a negative value is not a code point and
       [Invalid_argument] is raised. *)
    let utf8_of_int i =
      let chr x = String.make 1 (Char.chr x) in
      (* continuation byte carrying bits [shift .. shift+5] of [i] *)
      let cont shift = chr (0x80 lor ((i lsr shift) land 0x3F)) in
      (* 6-byte form; [lsr] is a logical shift, so bit 30 is extracted
         correctly even when it is the sign bit *)
      let six_bytes () =
        chr (0xFC lor ((i lsr 30) land 0x1))
        ^ cont 24 ^ cont 18 ^ cont 12 ^ cont 6 ^ cont 0
      in
      if i < 0 then begin
        (* [max_int = 0x3FFFFFFF] holds exactly when [int] has 31 bits *)
        if max_int = 0x3FFFFFFF then six_bytes ()
        else invalid_arg "utf8_of_int: negative code point"
      end
      else if i < 0x80 then
        chr i
      else if i < 0x800 then
        chr (0xC0 lor ((i lsr 6) land 0x1F)) ^ cont 0
      else if i < 0x10000 then
        chr (0xE0 lor ((i lsr 12) land 0xF)) ^ cont 6 ^ cont 0
      else if i < 0x200000 then
        chr (0xF0 lor ((i lsr 18) land 0x7)) ^ cont 12 ^ cont 6 ^ cont 0
      else if i < 0x4000000 then
        chr (0xF8 lor ((i lsr 24) land 0x3))
        ^ cont 18 ^ cont 12 ^ cont 6 ^ cont 0
      else
        six_bytes ()

            

    -- 
    John (Max) Skaller at OTT [Open Telecommications Ltd]
    mailto:maxs@in.ot.com.au      -- at work
    mailto:skaller@maxtal.com.au  -- at home
    



    This archive was generated by hypermail 2b29 : Thu Mar 23 2000 - 13:54:56 MET