Forum > Windows

sources with Orcas

<< < (7/10) > >>

_heinz:
I now have some modified code versions that are being measured and tuned:
AKFCOMP Alex modified compact code
PFCASE  modified  case construct of pulsefind.cpp
PFLOOP compact loop construct of pulsefind.cpp
FPUCOMP compact construct of opt_FPU.cpp

heinz

_heinz:
A look at the asm code shows that SSE instructions are used and
how opt_v_GetPowerSpectrum performs as a part of FPUCOMP.
3 SSE registers (XMM0, XMM1, XMM2) are used to handle the scalar SSE instructions (movss, mulss, addss),
but keep in mind, all code must be measured..... ;)
heinz
 ---------------------------------------------------------------------------------------------------------------------------------------
; -----------------------------------------------------------------------------
; Compiler-generated listing (MASM syntax, 32-bit x86) for
; opt_v_GetPowerSpectrum, interleaved with the C++ source lines it came from.
;
; What the code does: for each i in [0, this_fft_len) it computes
;     FreqData[i][0]^2 + FreqData[i][1]^2
; and stores the result twice: into workBuf[i] (workBuf is FreqData viewed as
; a flat float array, so the input buffer is overwritten in place) and into
; PowerSpectrum[bin_off + i*bin_len].
;
; Shape of the generated code:
;   * $LL10: main loop, unrolled 4x, entered only when this_fft_len >= 4.
;   * $LC3 : remainder loop, one element per pass.
; Only scalar SSE arithmetic (movss/mulss/addss) on xmm0-xmm2 is used; movaps
; appears solely as a register-to-register copy.
;
; Stack addressing: there is no frame pointer. Arguments are addressed as
; sym[esp+k] with k = (bytes pushed/reserved so far) - 4, and the two locals
; with k = (bytes pushed/reserved so far). This is why the displacement
; changes after every push/pop.
;
; Callee-saved ebx, ebp, esi (and edi on the unrolled path) are saved and
; restored. `ret 0` leaves the arguments on the stack for the caller.
; NOTE(review): the decorated name appears to decode to
;   void __cdecl opt_v_GetPowerSpectrum(float (*)[2], float *, int, int, int)
; -- confirm against the declaration in opt_fpu.cpp.
; -----------------------------------------------------------------------------
PUBLIC   ?opt_v_GetPowerSpectrum@@YAXPAY01MPAMHHH@Z   ; opt_v_GetPowerSpectrum
EXTRN   __fltused:DWORD
; (__fltused is declared but not referenced by any instruction in this listing)
; Function compile flags: /Ogtpy
; File c:\i\sc\pultimb_5\optimizer\opt_fpu.cpp
;   COMDAT ?opt_v_GetPowerSpectrum@@YAXPAY01MPAMHHH@Z
; (the matching `_TEXT ENDS` is not part of this excerpt)
_TEXT   SEGMENT
; Locals (inside the 8 bytes reserved by `sub esp, 8`):
tv1161 = -8                  ; size = 4 ; compiler temp: number of 4x-unrolled passes
_i$ = -4                  ; size = 4 ; value i will have once the unrolled loop is done
; Arguments, in stack order:
_FreqData$ = 8                  ; size = 4
_PowerSpectrum$ = 12               ; size = 4
_this_fft_len$ = 16               ; size = 4
_bin_off$ = 20                  ; size = 4
_bin_len$ = 24                  ; size = 4
?opt_v_GetPowerSpectrum@@YAXPAY01MPAMHHH@Z PROC      ; opt_v_GetPowerSpectrum, COMDAT

; 36   :    register int   i, bin; //seti_britta: register
; 37   :    float *workBuf = (float *)FreqData;
; 38   : //   float psNum; //seti_britta: no longer necessary
; 39   :
; 40   :    ALIGNED_YES( FreqData );
; 41   :    ALIGNED_YES( PowerSpectrum );
; 42   :    for   ( i   = 0, bin = 0; i < this_fft_len; i++, bin += bin_len)
; (no alignment-check instructions are visible in the code below)

   mov   eax, DWORD PTR _FreqData$[esp-4]   ; eax = FreqData (fetched before the frame is set up)
   sub   esp, 8                              ; reserve the two locals _i$ and tv1161
   push   ebx
   push   ebp
   mov   ebp, DWORD PTR _this_fft_len$[esp+12]   ; ebp = this_fft_len
   xor   ecx, ecx                            ; ecx = i = 0
   xor   ebx, ebx                            ; ebx = bin = 0
   cmp   ebp, 4                              ; flags consumed by the jl below (push/mov leave them intact)
   push   esi
   mov   esi, DWORD PTR _bin_len$[esp+16]   ; esi = bin_len
   jl   $LC9@opt_v_GetP                      ; this_fft_len < 4 (signed): skip the unrolled loop
   mov   edx, DWORD PTR _PowerSpectrum$[esp+16]   ; edx = PowerSpectrum
   mov   ecx, DWORD PTR _bin_off$[esp+16]   ; ecx = bin_off
   add   ebp, -4               ; fffffffcH
   shr   ebp, 2
   inc   ebp                                 ; ebp = ((this_fft_len - 4) >> 2) + 1 = unrolled pass count
   mov   DWORD PTR tv1161[esp+20], ebp      ; spill pass count
   lea   ecx, DWORD PTR [edx+ecx*4]         ; ecx = &PowerSpectrum[bin_off] (output write pointer)
   add   ebp, ebp
   lea   edx, DWORD PTR [eax+8]             ; edx = workBuf write pointer, biased by +8 bytes
   add   eax, 12               ; 0000000cH ; eax = FreqData read pointer, biased by +12 bytes
   add   ebp, ebp                            ; ebp = pass count * 4
   push   edi
   mov   DWORD PTR _i$[esp+24], ebp         ; _i$ = pass count * 4 = i after the unrolled loop
   mov   ebp, DWORD PTR tv1161[esp+24]      ; ebp = pass count (loop counter)
   lea   edi, DWORD PTR [esi*4]             ; edi = bin_len * 4 = byte stride between outputs
   npad   1
$LL10@opt_v_GetP:
; Unrolled body: each pass handles elements i..i+3, reading 32 bytes of
; FreqData and writing 16 bytes of workBuf plus 4 strided PowerSpectrum slots.

; 43   :       {
; 44   : //      psNum = FreqData[i][0] * FreqData[i][0] + FreqData[i][1] * FreqData[i][1];
; 45   : //      PowerSpectrum[bin_off + bin] =   // Large cache miss here...can it be fixed?
; 46   : //      workBuf[i] = psNum;
; 47   : //seti_britta: new statement
; 48   :       PowerSpectrum[bin_off + bin] = workBuf[i] = (FreqData[i][0] * FreqData[i][0]) + (FreqData[i][1] * FreqData[i][1]);
; (the [i] subscripts above were missing from the posted listing, presumably
;  stripped by forum markup; restored here to match the addressing in the code)

   ; --- element i ---
   movss   xmm1, DWORD PTR [eax-8]          ; second float of the pair
   movss   xmm0, DWORD PTR [eax-12]         ; first float of the pair
   mulss   xmm0, xmm0
   movaps   xmm2, xmm1                       ; register copy so xmm1 is squared into xmm2
   mulss   xmm2, xmm1
   addss   xmm0, xmm2                        ; xmm0 = sum of the two squares
   movss   DWORD PTR [edx-8], xmm0          ; workBuf[i]
   movss   DWORD PTR [ecx], xmm0            ; PowerSpectrum[bin_off + bin]
   ; --- element i+1 (xmm0/xmm1 hold the pair in the opposite order; the sum is the same) ---
   movss   xmm1, DWORD PTR [eax-4]
   movss   xmm0, DWORD PTR [eax]
   mulss   xmm0, xmm0
   add   ecx, edi                            ; advance output pointer by one bin stride
   movaps   xmm2, xmm1
   mulss   xmm2, xmm1
   addss   xmm0, xmm2
   movss   DWORD PTR [edx-4], xmm0          ; workBuf[i+1]
   movss   DWORD PTR [ecx], xmm0
   ; --- element i+2 ---
   movss   xmm1, DWORD PTR [eax+4]
   movss   xmm0, DWORD PTR [eax+8]
   mulss   xmm0, xmm0
   add   ecx, edi
   movaps   xmm2, xmm1
   mulss   xmm2, xmm1
   addss   xmm0, xmm2
   movss   DWORD PTR [edx], xmm0            ; workBuf[i+2]
   movss   DWORD PTR [ecx], xmm0
   ; --- element i+3 (the four `add ebx, esi` below total bin += 4*bin_len) ---
   movss   xmm1, DWORD PTR [eax+12]
   movss   xmm0, DWORD PTR [eax+16]
   add   ebx, esi
   add   ebx, esi
   add   ecx, edi
   movaps   xmm2, xmm1
   mulss   xmm0, xmm0
   mulss   xmm2, xmm1
   addss   xmm0, xmm2
   add   ebx, esi
   movss   DWORD PTR [edx+4], xmm0          ; workBuf[i+3]
   movss   DWORD PTR [ecx], xmm0
   add   ebx, esi
   add   ecx, edi
   add   edx, 16               ; 00000010H ; 4 floats of workBuf written
   add   eax, 32               ; 00000020H ; 4 float pairs of FreqData read
   sub   ebp, 1
   jne   $LL10@opt_v_GetP                    ; loop until the pass count reaches 0
   ; Restore the register roles the remainder loop expects.
   mov   ebp, DWORD PTR _this_fft_len$[esp+20]   ; ebp = this_fft_len
   mov   eax, DWORD PTR _FreqData$[esp+20]  ; eax = FreqData (unbiased base again)
   mov   ecx, DWORD PTR _i$[esp+24]         ; ecx = i = pass count * 4
   pop   edi                                 ; edi was only needed by the unrolled loop
$LC9@opt_v_GetP:
; Here: eax = FreqData, ecx = i, ebx = bin, ebp = this_fft_len, esi = bin_len.

; 36   :    register int   i, bin; //seti_britta: register
; 37   :    float *workBuf = (float *)FreqData;
; 38   : //   float psNum; //seti_britta: no longer necessary
; 39   :
; 40   :    ALIGNED_YES( FreqData );
; 41   :    ALIGNED_YES( PowerSpectrum );
; 42   :    for   ( i   = 0, bin = 0; i < this_fft_len; i++, bin += bin_len)

   cmp   ecx, ebp
   jge   SHORT $LN8@opt_v_GetP               ; i >= this_fft_len: nothing left to do
   mov   edx, DWORD PTR _bin_off$[esp+16]   ; edx = bin_off
   add   esi, esi
   add   esi, esi                            ; esi = bin_len * 4 = byte stride between outputs
   add   ebx, edx                            ; ebx = bin_off + bin
   mov   edx, DWORD PTR _PowerSpectrum$[esp+16]
   lea   edx, DWORD PTR [edx+ebx*4]         ; edx = &PowerSpectrum[bin_off + bin]
   npad   9
$LC3@opt_v_GetP:
; Remainder loop: one element per pass, indexed by ecx = i.

; 43   :       {
; 44   : //      psNum = FreqData[i][0] * FreqData[i][0] + FreqData[i][1] * FreqData[i][1];
; 45   : //      PowerSpectrum[bin_off + bin] =   // Large cache miss here...can it be fixed?
; 46   : //      workBuf[i] = psNum;
; 47   : //seti_britta: new statement
; 48   :       PowerSpectrum[bin_off + bin] = workBuf[i] = (FreqData[i][0] * FreqData[i][0]) + (FreqData[i][1] * FreqData[i][1]);

   movss   xmm1, DWORD PTR [eax+ecx*8+4]    ; FreqData[i][1]
   movss   xmm0, DWORD PTR [eax+ecx*8]      ; FreqData[i][0]
   movaps   xmm2, xmm1
   mulss   xmm0, xmm0
   mulss   xmm2, xmm1
   addss   xmm0, xmm2                        ; xmm0 = sum of the two squares
   movss   DWORD PTR [eax+ecx*4], xmm0      ; workBuf[i] (in place, inside the FreqData buffer)
   movss   DWORD PTR [edx], xmm0            ; PowerSpectrum[bin_off + bin]
   inc   ecx                                 ; i++
   add   edx, esi                            ; next output slot (bin += bin_len)
   cmp   ecx, ebp
   jl   SHORT $LC3@opt_v_GetP                ; while i < this_fft_len (signed)
$LN8@opt_v_GetP:
   ; Epilogue: restore callee-saved registers in reverse push order.
   pop   esi
   pop   ebp
   pop   ebx

; 49   :
; 50   :       }
; 51   :     }

   add   esp, 8                              ; release the two locals
   ret   0                                   ; no argument bytes popped by the callee
?opt_v_GetPowerSpectrum@@YAXPAY01MPAMHHH@Z ENDP      ; opt_v_GetPowerSpectrum

Jason G:
Hmmm, time to get out the P4 optimisation reference. Some of the things in those SSE loops might be really good on a Core/Core 2, I don't know, but I'm a little bit weirded out by a few things :o. Some of the ordering of the instructions could be improved (I'm not entirely convinced out-of-order execution would fix that). I also think that, whereas the Core 2 seems designed to like moderately tight loops, the P4 might appreciate a further manual unroll (or maybe the other way around! Both are worth a try due to the different architecture generations).

Did you use any optimisation yet? (Or is that the cleaned-up version of what I did with QxN? Nope, different function. Whew.) I'm a bit surprised at some of the code generated.

Jason

_heinz:
No hurry with all this...
Everything must be measured and run against the original code version to see if there is real progress. It is easy to destroy a well-performing loop with some simple changes. As Joe mentioned, it's not a good idea to use the pultime project to measure code built with the MS compiler (I don't own the whole Intel performance package, with the Intel compiler, VTune and so on).
Using now the etimer-project to see any differences.
In this way we can better test short code-pieces.
And if we find any progress, it is still necessary to compile with the Intel compiler to see if it really rocks.

Do you like my point of view of seeing systems of equations when looking at the code?
Solutions can be found by using mathematical methods.
Therefore some of my code constructs are a little bit crazy.
But you know mathematicians are crazy people.
And I'm a mathematician.  ;D

Regards heinz 

 
 

Jason G:
No, No hurry, my holidays are coming in a few weeks, I still consider this "Orientation". 

------------------------Detour------------

--- Quote ---And I'm a mathematician.
--- End quote ---
I think the Mathematician's Anonymous meeting is three doors down ...

Mathematicians may be crazy or not, But I always wondered where my lecturers did 'stash their flagons' before class, and who knitted them the stylish brown vest that is two sizes too small :D (jokes).

If you have a formula to get crazy, I can examine its computational complexity well enough (comp sci) though a little out of practice, then build it in hardware to IPC class 3 military standards (Electronic Engineering). 

It is a shame the algorithm for sanity  is O(n^3) and  requires too many connections to implement on an FPGA, so sorry I can't help you there :D [Though I can't offer any sanity, I do have a drawer full of high speed logic I can let you dig through ....]
-------------------------------------------------------------------------

My p4 selects     PwrSpectrumOnly_ptt( sse_GetPSO_sc16_npr )    take a look it is nice, No Author's name is listed  :(.  Maybe it is  Ben & Joe?
  - It has care with the SSE pipelines (Even is laid out showing them :D)
  - It Is using many more registers (Those aren't variable assignments really, no ;) )
  - It looks to be unrolled to help the pipelines/cache/prefetch, very pretty  :D.
I'm more impressed with that function, It may help you to compare to that so you can see how to use the hardware better.

Jason





Navigation

[0] Message Index

[#] Next page

[*] Previous page

Go to full version