Version française
Home     About     Download     Resources     Contact us    
Browse thread
Inlining bigarray access
[ Home ] [ Index: by date | by threads ]
[ Search: ]

[ Message by date: previous | next ] [ Message in thread: previous | next ] [ Thread: previous | next ]
Date: -- (:)
From: Dmitry Bely <dmitry.bely@g...>
Subject: Inlining bigarray access
Here is the small code snippet:

open Bigarray

(* One-dimensional bigarray with float32_elt element kind and C layout;
   elements are read back as OCaml [float] values. *)
type floatarray = (float, float32_elt, c_layout) Array1.t

(* Thin wrapper: [get a i] is [Array1.get a i], with [a] constrained to
   [floatarray]. *)
let get (a: floatarray) i = Array1.get a i

(* [test3 a i] is true iff the elements at indices i-1, i and i+1 are all
   strictly greater than 0.0. Calls Array1.get directly; [&&] evaluates
   left to right and stops at the first false operand. *)
let test3 (a:floatarray) i =
  Array1.get a (i-1) > 0.0 && Array1.get a i > 0.0 && Array1.get a (i+1) > 0.0

(* Same predicate as [test3], but each element is read through the local
   wrapper [get] instead of calling Array1.get directly. *)
let test3m (a:floatarray) i =
  get a (i-1) > 0.0 && get a i > 0.0 && get a (i+1) > 0.0

My impression was that test3 and test3m should be compiled to the same
machine code (x86). Strangely, this is not the case (for
simplicity, the code was generated with a patched ocamlopt that allows
-unsafe bigarray access and uses 686+ float comparison instructions):

	.CODE
	ALIGN	4
	PUBLIC	_camlBig__test3_187
; Compiled body of `test3`: three float32 loads, each compared against
; 0.0, leaving through an early exit as soon as one comparison fails.
; In:  eax, ebx - the two arguments (presumably the bigarray `a` and the
;      tagged index `i` of the OCaml source - confirm against the
;      ocamlopt x86 calling convention).
; Out: eax = 3 when all three comparisons succeed, 1 otherwise
;      (these look like OCaml's tagged true / false).
; This version contains no heap allocation and no calls.
_camlBig__test3_187:
	; Reserve 8 bytes of stack; this version never reads or writes them.
	sub	esp, 8
L108:
	; Keep the first argument in ecx for the whole function.
	mov	ecx, eax
	; Push 0.0; it becomes st(1) once the element is loaded on top.
	fldz
	; edx = (ebx - 2) >> 1: presumably the untagged value of i - 1.
	mov	edx, ebx
	add	edx, -2
	sar	edx, 1
	; eax = word at offset 4 of the first argument, used as the base
	; address of the 4-byte float load below.
	mov	eax, DWORD PTR [ecx+4]
	fld	REAL4 PTR [eax+edx*4]
	; Compare element (st(0)) with 0.0 (st(1)), pop both, and leave
	; unless element > 0.0 (jbe is also taken on an unordered result).
	fcomip	st(0), st(1)
	fstp	st(0)
	jbe	L104
	; Second test: same sequence with edx = ebx >> 1 (presumably i).
	fldz
	mov	edx, ebx
	sar	edx, 1
	mov	eax, DWORD PTR [ecx+4]
	fld	REAL4 PTR [eax+edx*4]
	fcomip	st(0), st(1)
	fstp	st(0)
	jbe	L105
	; Third test: ebx is updated in place, ebx = (ebx + 2) >> 1
	; (presumably i + 1); the original index is not needed afterwards.
	fldz
	add	ebx, 2
	sar	ebx, 1
	mov	eax, DWORD PTR [ecx+4]
	fld	REAL4 PTR [eax+ebx*4]
	fcomip	st(0), st(1)
	fstp	st(0)
	jbe	L107
	; Materialise the last comparison as 1 / 0 ...
	mov	eax, 1
	jmp	L106
L107:
	xor	eax, eax
L106:
	; ... then eax = 2*eax + 1, giving 3 or 1.
	lea	eax, DWORD PTR [eax+eax+1]
	add    esp, 8
	ret
; Early exits for a failed second (L105) and first (L104) comparison:
; both return 1.
L105:
	mov	eax, 1
	add    esp, 8
	ret
L104:
	mov	eax, 1
	add    esp, 8
	ret
	.CODE
	ALIGN	4
	PUBLIC	_camlBig__test3m_190
; Compiled body of `test3m`: same three comparisons as the test3 listing,
; but the first and third element reads go through a 12-byte block
; carved out at _caml_young_ptr and an 8-byte stack slot before being
; compared. The second read is compared directly, as in test3.
; In:  eax, ebx - the two arguments (presumably the bigarray `a` and the
;      tagged index `i` - confirm against the ocamlopt x86 convention).
; Out: eax = 3 when all three comparisons succeed, 1 otherwise.
; May call _caml_call_gc (twice at most per successful pass through each
; allocation site is not guaranteed - each site simply retries until
; the limit check passes).
_camlBig__test3m_190:
	; Reserve 8 bytes of stack, used below as a REAL8 scratch slot.
	sub	esp, 8
L113:
	mov	ecx, eax
	; edx = ebx - 2; the shift that completes the index is done after
	; the allocation below.
	mov	edx, ebx
	add	edx, -2
	; Allocation site 1: lower _caml_young_ptr by 12 bytes; if the new
	; pointer is below _caml_young_limit, go to L115 (GC, then retry).
L114:	mov	eax, _caml_young_ptr
	sub	eax, 12
	mov	_caml_young_ptr, eax
	cmp	eax, _caml_young_limit
	jb	L115
	; esi = address just past the first word of the new block; that
	; first word is set to 2301 = (2 << 10) + 253, which matches an
	; OCaml header for a 2-word block with tag 253 (Double_tag) -
	; i.e. presumably a boxed float.
	lea	esi, [eax+4]
	mov	DWORD PTR [esi-4],2301
	sar	edx, 1
	mov	eax, DWORD PTR [ecx+4]
	; Load the 4-byte element, store it as an 8-byte double into the
	; new block, reload it, and copy it to the stack slot.
	fld	REAL4 PTR [eax+edx*4]
	fstp	REAL8 PTR [esi]
	fld	REAL8 PTR [esi]
	fstp	REAL8 PTR 0[esp]
	; Compare the stack copy (st(0)) with 0.0 (st(1)); leave unless
	; it is > 0.0 (jbe is also taken on an unordered result).
	fldz
	fld	REAL8 PTR 0[esp]
	fcomip	st(0), st(1)
	fstp	st(0)
	jbe	L109
	; Second test: element loaded and compared directly, with no
	; allocation and no stack round trip.
	fldz
	mov	edx, ebx
	sar	edx, 1
	mov	eax, DWORD PTR [ecx+4]
	fld	REAL4 PTR [eax+edx*4]
	fcomip	st(0), st(1)
	fstp	st(0)
	jbe	L110
	add	ebx, 2
	; Allocation site 2: same 12-byte allocation and limit check as
	; site 1, retrying via L118; the block address goes to edx here.
L117:	mov	eax, _caml_young_ptr
	sub	eax, 12
	mov	_caml_young_ptr, eax
	cmp	eax, _caml_young_limit
	jb	L118
	lea	edx, [eax+4]
	mov	DWORD PTR [edx-4],2301
	sar	ebx, 1
	mov	eax, DWORD PTR [ecx+4]
	; Third test: same store / reload / spill / compare sequence as
	; the first test.
	fld	REAL4 PTR [eax+ebx*4]
	fstp	REAL8 PTR [edx]
	fld	REAL8 PTR [edx]
	fstp	REAL8 PTR 0[esp]
	fldz
	fld	REAL8 PTR 0[esp]
	fcomip	st(0), st(1)
	fstp	st(0)
	jbe	L112
	; Materialise the last comparison as 1 / 0 ...
	mov	eax, 1
	jmp	L111
L112:
	xor	eax, eax
L111:
	; ... then eax = 2*eax + 1 (shift left, then increment): 3 or 1.
	sal	eax, 1
	inc	eax
	add    esp, 8
	ret
; Early exits for a failed second (L110) and first (L109) comparison:
; both return 1.
L110:
	mov	eax, 1
	add    esp, 8
	ret
L109:
	mov	eax, 1
	add    esp, 8
	ret
; Out-of-line paths for the two allocation sites: call _caml_call_gc,
; then jump back and redo the allocation from the start.
L118:	call	_caml_call_gc
L119:	jmp	L117
L115:	call	_caml_call_gc
L116:	jmp	L114

Could you explain why the code is different and why test3m allocates
data on the heap? (Don't blame my unsafe/686 patch - without it
the assembly is still different.)

Another strange thing is that

(* Binds [get] directly to Array1.get, with its type constrained to
   floatarray -> int -> float; no parameters are named on the left. *)
let get: (floatarray -> int -> float) = Array1.get

is not inlined at all and compiled as a C call. Why?

- Dmitry Bely