Version française
Home     About     Download     Resources     Contact us    
Browse thread
Performance questions, -inline, ...
[ Home ] [ Index: by date | by threads ]
[ Search: ]

[ Message by date: previous | next ] [ Message in thread: previous | next ] [ Thread: previous | next ]
Date: -- (:)
From: Edgar Friendly <thelema314@g...>
Subject: Re: [Caml-list] Performance questions, -inline, ...

On Thu, 2008-01-03 at 11:28 -0500, Kuba Ober wrote:
> I haven't looked at assembly output yet, but I've run into some unexpected
> behavior in my benchmarks.
> 
> This was compiled by ocamlopt -inline 100 -unsafe, the results and code are 
> below (MIPS is obtained by dividing 50 million iterations by (Unix.times 
> ()) . Unix.tms_utime it took to run). I haven't included the timing etc. code 
> (it's part of a larger benchmark).
> 
> What I wonder is why vector-to-vector add is so much faster than (constant) 
> scalar to vector add. Vectors are preinitialized each time with a 1.0000, 
> 1.0001, ... sequence.
> 
> Also, the very bad performance from generic vector-to-vector *with* inlining 
> is another puzzler, whereas generic add of scalar-to-scalar performs 
> similarly to straight-coded one.
> 
> Cheers, Kuba
> 
> * add1: add scalar to scalar   120 MIPS
> * add3: add scalar to vector   250 MIPS
> * add5: add vector to vector   320 MIPS
> * add2: generic add scalar to scalar   100 MIPS
> * add4: generic add vector to vector   38 MIPS
> 
> let start = 1.3
> 
> (* generic scalar operation *)
> let op1 op const nloop =
> 	let accum = ref start in
> 	for i = 1 to nloop do
> 		accum := op !accum const
> 	done
> 
> (* generic vector operation *)
> let op2 op const a b (nloop : int) =
> 	let len = Array.length a in
> 	for j = 0 to len-1 do
> 		for i = 0 to len-1 do
> 			b.(i) <- op a.(i) b.(i)
> 		done;
> 	done
> 
> (** addition **)
> let add1 nloop =
> 	let accum = ref start in
> 	for i = 1 to nloop do
> 		accum := !accum +. addconst
> 	done
> let add2 = op1 ( +. ) addconst
> let add3 a b nloop =
> 	let len = Array.length a in
> 	for j = 0 to len-1 do
> 		for i = 0 to len-1 do
> 			b.(i) <- a.(i) +. addconst
> 		done;
> 	done
> let add4 = op2 ( +. ) addconst
> let add5 a b nloop =
> 	let len = Array.length a in
> 	for j = 0 to len-1 do
> 		for i = 0 to len-1 do
> 			b.(i) <- a.(i) +. b.(i)
> 		done;
> 	done
> 
how about:

(** Generic vector operation.

    [op2 op a b nloop] makes [nloop] passes over the arrays.  Each pass
    replaces every [b.(i)] with [op a.(i) b.(i)] for [i] from [0] to
    [Array.length a - 1].  [b] is updated in place and is indexed over the
    same range as [a], so it should be at least as long as [a].
    A non-positive [nloop] performs no passes. *)
let op2 op a b nloop =
	let len = Array.length a in
	(* Count 1 .. nloop so that exactly nloop passes run; starting the
	   counter at 0 would execute one extra pass. *)
	for _pass = 1 to nloop do
		for i = 0 to len - 1 do
			b.(i) <- op a.(i) b.(i)
		done
	done

(** [add4 a b nloop] is the float-addition instance of [op2]: it forwards
    [a], [b] and [nloop] to [op2] with [( +. )] as the element operation. *)
let add4 a b nloop = op2 ( +. ) a b nloop


Why does your code have the j loops?  As written, you add a constant (or
vector element) a number of times equal to the length of your vector.  Did
you mean "for j = 1 to nloop do ..."?  I'd also move that loop out of the
test function if I could, and into the testing harness.

Arrays of floats have some optimizations built into the compiler (no
boxing, even though floats are not 31-bit values), so you should get about
as good performance as is possible.

E.