Forum > Windows
sources with Orcas
_heinz:
I now have some modified code versions under measurement and tuning.
AKFCOMP Alex modified compact code
PFCASE modified case construct of pulsefind.cpp
PFLOOP compact loop construct of pulsefind.cpp
FPUCOMP compact construct of opt_FPU.cpp
heinz
_heinz:
A look at the asm code shows SIMD instructions are used and
how opt_v_GetPowerSpectrum performs as a part of FPUCOMP
3 SSE registers (XMM0, XMM1, XMM2) are used to handle powerful SSE instructions (the XMM registers belong to SSE, not MMX; this listing uses the scalar forms movss/mulss/addss)
but keep in mind, all code must be measured..... ;)
heinz
---------------------------------------------------------------------------------------------------------------------------------------
;-----------------------------------------------------------------------
; opt_v_GetPowerSpectrum -- MSVC-generated 32-bit x86 listing (MASM syntax,
; compile flags /Ogtpy as recorded below).
;
; The decorated name appears to decode to
;   void __cdecl opt_v_GetPowerSpectrum(float (*FreqData)[2],
;                                       float *PowerSpectrum,
;                                       int this_fft_len,
;                                       int bin_off, int bin_len)
; -- TODO confirm against opt_fpu.cpp. The plain `ret 0` at the end is
; consistent with a caller-cleaned stack.
;
; What the instructions do: for each i in [0, this_fft_len) the two floats
; at FreqData + 8*i are squared and summed, and the sum is stored twice:
;   * at FreqData + 4*i (in place; source line 37 aliases workBuf to
;     FreqData), and
;   * at PowerSpectrum + 4*(bin_off + i*bin_len).
;
; Structure: a 4x unrolled loop ($LL10) runs when this_fft_len >= 4, then
; a one-element-per-pass loop ($LC3) handles whatever is left.
; Only scalar SSE arithmetic is used (movss/mulss/addss); the movaps
; instructions are register-to-register copies.
;
; Registers: ebx, ebp, esi are saved/restored on every path, edi only on
; the unrolled path. eax, ecx, edx and xmm0-xmm2 are used as scratch.
; Stack: `sub esp, 8` reserves two dword temporaries (tv1161 and _i$).
;
; NOTE(review): the quoted C source lines ("; 44 :" and "; 48 :") look as if
; their "[i]" subscripts were stripped (possibly by forum markup). The
; generated code is consistent with
;   PowerSpectrum[bin_off + bin] = workBuf[i] =
;       FreqData[i][0]*FreqData[i][0] + FreqData[i][1]*FreqData[i][1];
; -- confirm against the real opt_fpu.cpp.
;-----------------------------------------------------------------------
PUBLIC ?opt_v_GetPowerSpectrum@@YAXPAY01MPAMHHH@Z ; opt_v_GetPowerSpectrum
EXTRN __fltused:DWORD
; Function compile flags: /Ogtpy
; File c:\i\sc\pultimb_5\optimizer\opt_fpu.cpp
; COMDAT ?opt_v_GetPowerSpectrum@@YAXPAY01MPAMHHH@Z
_TEXT SEGMENT
; Offsets below are relative to the frame the compiler tracks; every use
; adds an esp correction for the pushes performed so far.
tv1161 = -8 ; size = 4
_i$ = -4 ; size = 4
_FreqData$ = 8 ; size = 4
_PowerSpectrum$ = 12 ; size = 4
_this_fft_len$ = 16 ; size = 4
_bin_off$ = 20 ; size = 4
_bin_len$ = 24 ; size = 4
?opt_v_GetPowerSpectrum@@YAXPAY01MPAMHHH@Z PROC ; opt_v_GetPowerSpectrum, COMDAT
; 36 : register int i, bin; //seti_britta: register
; 37 : float *workBuf = (float *)FreqData;
; 38 : // float psNum; //seti_britta: no longer necessary
; 39 :
; 40 : ALIGNED_YES( FreqData );
; 41 : ALIGNED_YES( PowerSpectrum );
; 42 : for ( i = 0, bin = 0; i < this_fft_len; i++, bin += bin_len)
mov eax, DWORD PTR _FreqData$[esp-4] ; eax = FreqData
sub esp, 8 ; reserve the tv1161 and _i$ slots
push ebx
push ebp
mov ebp, DWORD PTR _this_fft_len$[esp+12] ; ebp = this_fft_len
xor ecx, ecx ; i = 0
xor ebx, ebx ; bin = 0
cmp ebp, 4
push esi
mov esi, DWORD PTR _bin_len$[esp+16] ; esi = bin_len
jl $LC9@opt_v_GetP ; this_fft_len < 4 (flags from the cmp above): skip the unrolled loop
mov edx, DWORD PTR _PowerSpectrum$[esp+16] ; edx = PowerSpectrum
mov ecx, DWORD PTR _bin_off$[esp+16] ; ecx = bin_off
; ebp = ((this_fft_len - 4) >> 2) + 1 = number of unrolled passes
add ebp, -4 ; fffffffcH
shr ebp, 2
inc ebp
mov DWORD PTR tv1161[esp+20], ebp ; remember the pass count
lea ecx, DWORD PTR [edx+ecx*4] ; ecx = &PowerSpectrum[bin_off] (output cursor)
add ebp, ebp
lea edx, DWORD PTR [eax+8] ; edx = FreqData + 8 bytes (in-place output cursor)
add eax, 12 ; 0000000cH
add ebp, ebp ; ebp = 4 * pass count
push edi
mov DWORD PTR _i$[esp+24], ebp ; value i will have after the unrolled loop
mov ebp, DWORD PTR tv1161[esp+24] ; ebp = pass count (loop counter)
lea edi, DWORD PTR [esi*4] ; edi = bin_len * 4 = byte stride in PowerSpectrum
npad 1
; Unrolled loop: four elements per pass.
;   eax = input cursor (FreqData + 12 on entry, +32 bytes per pass)
;   edx = in-place output cursor (FreqData + 8 on entry, +16 bytes per pass)
;   ecx = PowerSpectrum output cursor (+edi after every store)
;   ebx = bin (+esi four times per pass; only consumed after the loop)
; Each in-place store lands on a location whose input was already loaded.
$LL10@opt_v_GetP:
; 43 : {
; 44 : // psNum = FreqData[0] * FreqData[0] + FreqData[1] * FreqData[1];
; 45 : // PowerSpectrum[bin_off + bin] = // Large cache miss here...can it be fixed?
; 46 : // workBuf = psNum;
; 47 : //seti_britta: new statement
; 48 : PowerSpectrum[bin_off + bin] = workBuf = (FreqData[0] * FreqData[0]) + (FreqData[1] * FreqData[1]);
; -- element 0 of this pass --
movss xmm1, DWORD PTR [eax-8]
movss xmm0, DWORD PTR [eax-12]
mulss xmm0, xmm0
movaps xmm2, xmm1
mulss xmm2, xmm1
addss xmm0, xmm2 ; xmm0 = sum of the two squares
movss DWORD PTR [edx-8], xmm0 ; in-place result
movss DWORD PTR [ecx], xmm0 ; PowerSpectrum result
; -- element 1 --
movss xmm1, DWORD PTR [eax-4]
movss xmm0, DWORD PTR [eax]
mulss xmm0, xmm0
add ecx, edi ; advance PowerSpectrum cursor by bin_len floats
movaps xmm2, xmm1
mulss xmm2, xmm1
addss xmm0, xmm2
movss DWORD PTR [edx-4], xmm0
movss DWORD PTR [ecx], xmm0
; -- element 2 --
movss xmm1, DWORD PTR [eax+4]
movss xmm0, DWORD PTR [eax+8]
mulss xmm0, xmm0
add ecx, edi
movaps xmm2, xmm1
mulss xmm2, xmm1
addss xmm0, xmm2
movss DWORD PTR [edx], xmm0
movss DWORD PTR [ecx], xmm0
; -- element 3 (the four bin += bin_len updates are interleaved here) --
movss xmm1, DWORD PTR [eax+12]
movss xmm0, DWORD PTR [eax+16]
add ebx, esi
add ebx, esi
add ecx, edi
movaps xmm2, xmm1
mulss xmm0, xmm0
mulss xmm2, xmm1
addss xmm0, xmm2
add ebx, esi
movss DWORD PTR [edx+4], xmm0
movss DWORD PTR [ecx], xmm0
add ebx, esi
add ecx, edi
add edx, 16 ; 00000010H
add eax, 32 ; 00000020H
sub ebp, 1
jne $LL10@opt_v_GetP
; Reload state for the remainder loop (edi is still on the stack here,
; hence the +20/+24 corrections).
mov ebp, DWORD PTR _this_fft_len$[esp+20] ; ebp = this_fft_len
mov eax, DWORD PTR _FreqData$[esp+20] ; eax = FreqData (base again)
mov ecx, DWORD PTR _i$[esp+24] ; ecx = i = 4 * pass count
pop edi
; Remainder entry: ecx = i, ebx = bin, eax = FreqData, ebp = this_fft_len,
; esi = bin_len (both the short-input path and the fall-through arrive so).
$LC9@opt_v_GetP:
; 36 : register int i, bin; //seti_britta: register
; 37 : float *workBuf = (float *)FreqData;
; 38 : // float psNum; //seti_britta: no longer necessary
; 39 :
; 40 : ALIGNED_YES( FreqData );
; 41 : ALIGNED_YES( PowerSpectrum );
; 42 : for ( i = 0, bin = 0; i < this_fft_len; i++, bin += bin_len)
cmp ecx, ebp
jge SHORT $LN8@opt_v_GetP ; nothing left to process
mov edx, DWORD PTR _bin_off$[esp+16] ; edx = bin_off
add esi, esi
add esi, esi ; esi = bin_len * 4 (byte stride)
add ebx, edx ; ebx = bin + bin_off
mov edx, DWORD PTR _PowerSpectrum$[esp+16]
lea edx, DWORD PTR [edx+ebx*4] ; edx = &PowerSpectrum[bin_off + bin]
npad 9
; Remainder loop: one element per pass, indexed by ecx = i.
$LC3@opt_v_GetP:
; 43 : {
; 44 : // psNum = FreqData[0] * FreqData[0] + FreqData[1] * FreqData[1];
; 45 : // PowerSpectrum[bin_off + bin] = // Large cache miss here...can it be fixed?
; 46 : // workBuf = psNum;
; 47 : //seti_britta: new statement
; 48 : PowerSpectrum[bin_off + bin] = workBuf = (FreqData[0] * FreqData[0]) + (FreqData[1] * FreqData[1]);
movss xmm1, DWORD PTR [eax+ecx*8+4] ; second float of element i
movss xmm0, DWORD PTR [eax+ecx*8] ; first float of element i
movaps xmm2, xmm1
mulss xmm0, xmm0
mulss xmm2, xmm1
addss xmm0, xmm2 ; xmm0 = sum of the two squares
movss DWORD PTR [eax+ecx*4], xmm0 ; in-place result at float index i
movss DWORD PTR [edx], xmm0 ; PowerSpectrum result
inc ecx ; i++
add edx, esi ; advance by bin_len floats
cmp ecx, ebp
jl SHORT $LC3@opt_v_GetP
; Epilogue: restore saved registers in reverse push order, drop the locals.
$LN8@opt_v_GetP:
pop esi
pop ebp
pop ebx
; 49 :
; 50 : }
; 51 : }
add esp, 8
ret 0
?opt_v_GetPowerSpectrum@@YAXPAY01MPAMHHH@Z ENDP ; opt_v_GetPowerSpectrum
Jason G:
Hmmm, time to get out the P4 optimisation reference. Some of the things in those SSE loops might be really good on a Core/Core 2, I don't know, but I'm a little bit weirded out by a few things :o. Some of the ordering of the instructions could be improved (I'm not entirely convinced out-of-order execution would fix that). I also think that, whereas the Core 2 seems designed to like moderately tight loops, the P4 might appreciate a further manual unroll (or maybe the other way round! Both are worth a try because of the different architecture generations).
Did you use any optimisation yet? (Or is that the cleaned-up version of what I did with QxN? Nope, different function. Whew.) I'm a bit surprised at some of the code generated.
Jason
_heinz:
No hurry with all this...
Everything must be measured and run against the original code version to see if there is real progress. It is easy to destroy a well-performing loop with some simple changes. As Joe mentioned, it's not a good idea to use the pultime project to measure code built with the MS compiler (I do not own the whole Intel Performance package, with the Intel compiler, VTune and so on).
Using now the etimer-project to see any differences.
In this way we can better test short code-pieces.
And if we found any progress it is at least necessary to compile with Intel-compiler to see if it really rocks.
Do you like my point of view of seeing systems of equations when looking at the code?
Resolutions can be found by using mathematical methods.
Therefore some of my code-constructs are a little bit crazy.
But you know mathematicians are crazy people.
And I'm a mathematician. ;D
Regards heinz
Jason G:
No, No hurry, my holidays are coming in a few weeks, I still consider this "Orientation".
------------------------Detour------------
--- Quote ---And I'm a mathematician.
--- End quote ---
I think the Mathematician's Anonymous meeting is three doors down ...
Mathematicians may be crazy or not, But I always wondered where my lecturers did 'stash their flagons' before class, and who knitted them the stylish brown vest that is two sizes too small :D (jokes).
If you have a formula to get crazy, I can examine its computational complexity well enough (comp sci) though a little out of practice, then build it in hardware to IPC class 3 military standards (Electronic Engineering).
It is a shame the algorithm for sanity is O(n^3) and requires too many connections to implement on an FPGA, so sorry I can't help you there :D [Though I can't offer any sanity, I do have a drawer full of high speed logic I can let you dig through ....]
-------------------------------------------------------------------------
My P4 selects PwrSpectrumOnly_ptt (sse_GetPSO_sc16_npr); take a look, it is nice. No author's name is listed :(. Maybe it is Ben & Joe?
- It has care with the SSE pipelines (Even is laid out showing them :D)
- It Is using many more registers (Those aren't variable assignments really, no ;) )
- It looks to be unrolled to help the pipelines/cache/prefetch, very pretty :D.
I'm more impressed with that function, It may help you to compare to that so you can see how to use the hardware better.
Jason
Navigation
[0] Message Index
[#] Next page
[*] Previous page
Go to full version