Forum > Windows

optimized sources

<< < (40/179) > >>

_heinz:
The problem of s_put1_NC

Although it is already resolved, I will give you a short summary. If you try to compile opt_SS2.cpp with the MSC compiler, it will not succeed. The problem is the statement s_put1_NC(p, sum1 );
die Typdefinitionen:
-----------------------------------------
// Vector type used for the float SSE intrinsics (e.g. _mm_store_ss), quoted
// here from the MSC headers. Being a union, every member is a different view
// of the same storage: 4 x float, or 2/4/8/16 integer lanes of 64/32/16/8 bits.
// _CRT_ALIGN(16) presumably requests 16-byte alignment -- TODO confirm against
// the MSC header that defines it.
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128 {
     float               m128_f32[4];   // float view of the four lanes
     unsigned __int64    m128_u64[2];
     __int8              m128_i8[16];
     __int16             m128_i16[8];
     __int32             m128_i32[4];
     __int64             m128_i64[2];
     unsigned __int8     m128_u8[16];
     unsigned __int16    m128_u16[8];
     unsigned __int32    m128_u32[4];
 } __m128;


// Integer counterpart of __m128, quoted from the MSC headers. It is a distinct
// union type with integer-only views (signed and unsigned lanes of 8, 16, 32
// and 64 bits); it has no float member.
// _CRT_ALIGN(16) presumably requests 16-byte alignment -- TODO confirm against
// the MSC header that defines it.
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i {
    __int8              m128i_i8[16];
    __int16             m128i_i16[8];
    __int32             m128i_i32[4];   
    __int64             m128i_i64[2];
    unsigned __int8     m128i_u8[16];
    unsigned __int16    m128i_u16[8];
    unsigned __int32    m128i_u32[4];
    unsigned __int64    m128i_u64[2];
} __m128i;

   // Project aliases: VEC names the float vector type, VEC_I the integer one.
   // They alias two different union types.
   typedef __m128 VEC;
   typedef __m128i VEC_I;







der Zeiger workBuf ist ein Zeiger auf float und erhält hier seinen Wert der auf FreqData zeigt

   float *workBuf = (float *)FreqData;

es wird sum1 definiert:
      VEC sum1, sum2;
Achtung es wird keine Variable für VEC_I definiert !!!! denke das ist der Fehler

der Zeiger p : ist ein Zeiger auf int und zeigt auf PowerSpectrum[bin_off + bin]

      int *p = (int *)(&PowerSpectrum[bin_off + bin]);



      s_put1(&workBuf, sum1);   //    workBuf = psNum;
wir lösen auf:
    #define s_put1( addr, bbbb )            _mm_store_ss( addr, bbbb )
extern void _mm_store_ss(float *_V, __m128 _A);

und setzen ein
      _mm_store_ss(&workBuf, sum1);   // alles OK soweit

      s_put1_NC(p, sum1 ); <--- Fehler
------------------------------------------------
wir lösen das Macro auf:
wir finden:
    #define s_put1_NC(ptr, aaaa)     _mm_stream_si32(ptr, s_extract_32bits(aaaa) );

wir finden:
    #define s_extract_32bits(aaaa)   _mm_cvtsi128_si32((VEC_I) aaaa)

der Befehl nach Auflösung:
----------------------------------------------
   _mm_stream_si32(p, _mm_cvtsi128_si32((VEC_I) sum1)); <-- VEC kann nicht in VEC_I konvertiert werden


----------------------------------------------------------------------------------------------------------------------------------------------------------
the resolution:

// ----------------------------------------------------------------------------
//   Function:    v_convert_f(int k, int *p_i, float *p_f)
//   Type:        void
//   Purpose:     convert sum1 and write it back to PowerSpectrum;
//                solves the s_put1_NC problem for MSC
//   Parameters:  int k, int *p_i, float *p_f
//   Last update: 28.05.2007         by: seti_britta ~heinz
// ----------------------------------------------------------------------------
#ifdef _MSC_VER
// Copies the bit patterns of four consecutive floats at p_f into the four
// int-sized slots starting at p_i.
//
// This stands in for s_put1_NC, whose expansion stores the raw 32 bits taken
// from the vector via _mm_cvtsi128_si32 -- it moves bits and performs no
// numeric float->int conversion. The previous body used `(int) *p_f`, which
// truncates the value (and is undefined for values outside the int range), so
// the destination received a different bit pattern than the intrinsic path.
//
// Parameters:
//   k   - unused; kept only so existing callers keep compiling. It is passed
//         by value, so the caller's variable was never affected by this call.
//   p_i - destination, room for at least 4 elements.
//   p_f - source, at least 4 floats (e.g. sum1.m128_f32).
//
// NOTE(review): the element count of 4 is kept from the original helper.
// s_put1_NC stores a single 32-bit value, so callers replacing s_put1_NC may
// only want the first element -- confirm before relying on elements 1..3.
// Assumes sizeof(int) == sizeof(float), as the original element-wise loop did.
void v_convert_f(int k, int *p_i, float *p_f)
{
   const unsigned char *src = (const unsigned char *)p_f;
   unsigned char *dst = (unsigned char *)p_i;
   unsigned int n;

   (void)k;   // see header: parameter retained for interface compatibility

   // Byte-wise copy: well-defined for any object type and needs no headers.
   for (n = 0; n < 4 * sizeof(float); n++)
   {
      dst[n] = src[n];
   }
}
#endif

// =============================================================================
//     v_GetPowerSpectrum
// seti_britta: comments for understanding, some small changes
//            problem of s_put1_NC for MSC solved
// =============================================================================

// Power-spectrum routine: on each loop iteration it squares two loaded values,
// adds them, and writes the result towards PowerSpectrum[bin_off + bin].
//
// The function header is produced by the GetPowerSpectrum_ptt macro, which is
// defined outside this view. FreqData, PowerSpectrum, this_fft_len, bin_off and
// bin_len are not declared here and presumably come from that macro -- TODO
// confirm. The s_* helpers, ZERO and ALIGNED_YES are likewise defined elsewhere.
GetPowerSpectrum_ptt( sse2_v_GetPowerSpectrum )
{
   float *workBuf = (float *)FreqData;   // float-pointer view of FreqData
   register int   i, bin;   //seti_britta: hold var in register
   int *p; //seti_britta: declared outside the loop
   
   VEC sum1, sum2;      //seti_britta: moved to here
   sum1=sum2= ZERO;   //seti_britta: initialised so the compiler does not warn

   ALIGNED_YES( FreqData );
   ALIGNED_YES( PowerSpectrum );

#if defined( _MSC_VER )
   // MSC-only locals for the v_convert_f path used instead of s_put1_NC.
   float *p_f1 = (float *)(sum1.m128_f32);  //seti_britta:new - points at the float lanes of sum1
   register int *p_i;
   register int k;
   k = 0;
#endif
   // seti_britta: let the loop run to the value of this_fft_len
   // i advances by 1 and bin by bin_len on every iteration.
   for   ( i   = 0, bin = 0; i < this_fft_len; i++, bin += bin_len)
   {
      p = (int *)(&PowerSpectrum[bin_off + bin]); //seti_britta: int *p declared outside the loop
#if defined( _MSC_VER )
      p_i = (int *)(&PowerSpectrum[bin_off + bin]); //seti_britta:new - same address as p
#endif
      // s_fetch is defined elsewhere; presumably a prefetch hint for the data
      // 16 rows ahead of i -- TODO confirm.
      s_fetch( &FreqData[i+16][0] );
      // NOTE(review): both loads below use the fixed indices 0 and 1 and do not
      // depend on i -- confirm this is intended.
      sum1 = s_get1(&FreqData[0]);   // load from FreqData[0] into sum1
      sum1 = s_mult(sum1, sum1);      // square sum1 in place
      sum2 = s_get1(&FreqData[1]);   // load from FreqData[1] into sum2
      sum2 = s_mult(sum2, sum2);      // square sum2 in place
      sum1 = s_add(sum1, sum2);      // sum1 = sum1 + sum2
         // WARNING: !! this store overwrites FreqData[0], so loop must go bottom to top !!
         //  reusing buffer - not needed after our psNum compute.
         // NOTE(review): the loop above counts i upwards from 0 -- check this
         // against the warning.
      // NOTE(review): the first argument is the address of the workBuf pointer
      // variable itself, not the buffer it points to -- confirm intended.
      s_put1(&workBuf, sum1);   //    workBuf = psNum; store sum1 to workBuf
#if defined( _MSC_VER )
      // v_convert_f writes four int-sized elements starting at p_i, taken from
      // the four floats of sum1 (via p_f1). k is passed by value, so the k
      // declared above is not changed by the call.
      // NOTE(review): the non-MSC branch goes through s_put1_NC, which
      // presumably stores a single 32-bit value -- confirm both paths agree.
      v_convert_f(k, p_i, p_f1);   //seti_britta: new, convert function with write back to PowerSpectrum
#else
      s_put1_NC(p, sum1 );
#endif
   }

      // When using non-caching (non-temporal) writes, you should always force
      // the writes to be "globally visible" to possible other CPUs
   s_fence_writes();
}
-----------------------------------------------------------------------
and so on, analogously, for sum1 through sum4 .......

If any of you has a better solution, let me know
heinz  ;)

Jason G:
Any luck with that s_put1_NC(p,  sum1 ) call?  I haven't looked at this code, but the types are local; what does it break if you just change them... VEC_I sum1, sum2 and VEC_I * p ? Is PowerSpectrum not aligned?

_heinz:
@ Jason,
I had success; it compiled and linked successfully. But you do not need to do anything here, since you have the Intel compiler.
If you look into the code you can see that  Powerspectrum is aligned --->
  ALIGNED_YES( PowerSpectrum );
I moved the variable definitions out of the block to the beginning, which reduces the prologue and epilogue of the block.

heinz   ;)

_heinz:
Compiler Option /LTCG 
how you can use it to optimize your app
click here

heinz

Jason G:
Yeah, works good sometimes, but you have to look carefully at the output because sometimes it does silly things .... and that is hard for link time because there is no source.

Navigation

[0] Message Index

[#] Next page

[*] Previous page

Go to full version