Forum > Windows
optimized sources
_heinz:
The problem of s_put1_NC
Although this is already resolved, I will give you a short summary. If you try to compile opt_SS2.cpp with the MSC compiler, you will not succeed. The problem is the statement s_put1_NC(p, sum1);
Die Typdefinitionen:
-----------------------------------------
// 128-bit vector type as quoted from the MSC headers. Every member is a
// 16-byte array, so the members are different element-wise views of the same
// storage (four floats, or 8/16/32/64-bit signed and unsigned integers).
// __declspec(intrin_type) and _CRT_ALIGN(16) are MSC-specific annotations.
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128 {
float m128_f32[4];
unsigned __int64 m128_u64[2];
__int8 m128_i8[16];
__int16 m128_i16[8];
__int32 m128_i32[4];
__int64 m128_i64[2];
unsigned __int8 m128_u8[16];
unsigned __int16 m128_u16[8];
unsigned __int32 m128_u32[4];
} __m128;
// 128-bit integer vector type as quoted from the MSC headers. Every member is
// a 16-byte array giving a signed or unsigned 8/16/32/64-bit view of the same
// storage; unlike __m128 there is no float member.
// Being a distinct union type, it is not convertible from __m128 by a C cast.
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i {
__int8 m128i_i8[16];
__int16 m128i_i16[8];
__int32 m128i_i32[4];
__int64 m128i_i64[2];
unsigned __int8 m128i_u8[16];
unsigned __int16 m128i_u16[8];
unsigned __int32 m128i_u32[4];
unsigned __int64 m128i_u64[2];
} __m128i;
// Project aliases: VEC is the float vector type, VEC_I the integer vector type.
typedef __m128 VEC;
typedef __m128i VEC_I;
Der Zeiger workBuf ist ein Zeiger auf float und erhält hier seinen Wert, der auf FreqData zeigt:
float *workBuf = (float *)FreqData;
es wird sum1 definiert:
VEC sum1, sum2;
Achtung: Es wird keine Variable vom Typ VEC_I definiert! Ich denke, das ist der Fehler.
Der Zeiger p ist ein Zeiger auf int und zeigt auf PowerSpectrum[bin_off + bin]:
int *p = (int *)(&PowerSpectrum[bin_off + bin]);
s_put1(&workBuf, sum1); // workBuf = psNum;
wir lösen auf:
#define s_put1( addr, bbbb ) _mm_store_ss( addr, bbbb )
extern void _mm_store_ss(float *_V, __m128 _A);
und setzen ein
_mm_store_ss(&workBuf, sum1); // alles OK soweit
s_put1_NC(p, sum1 ); <--- Fehler
------------------------------------------------
wir lösen das Macro auf:
wir finden:
#define s_put1_NC(ptr, aaaa) _mm_stream_si32(ptr, s_extract_32bits(aaaa) );
wir finden:
#define s_extract_32bits(aaaa) _mm_cvtsi128_si32((VEC_I) aaaa)
der Befehl nach Auflösung:
----------------------------------------------
_mm_stream_si32(p, _mm_cvtsi128_si32((VEC_I) sum1)); <-- VEC kann nicht in VEC_I konvertiert werden
----------------------------------------------------------------------------------------------------------------------------------------------------------
the resolution:
// ----------------------------------------------------------------------------
// Function: v_convert_f(int k, int *p_i, float *p_f)
// Typ : void
// Inhalt : convert of sum1 and write back to PowerSpectrum
// problem of s_put1_NC solved for MSC
// parameter: int k, int *p_i, float *p_f
// last update:28.05.2007 by:seti_britta ~heinz
// ----------------------------------------------------------------------------
#ifdef _MSC_VER
// Copies four consecutive 32-bit values from p_f to p_i, preserving their bit
// patterns exactly.
//
// This helper replaces s_put1_NC on MSC. s_put1_NC expands to
// _mm_stream_si32(ptr, _mm_cvtsi128_si32(...)), i.e. it stores the raw bits of
// the vector lane. The earlier body used `(int) *p_f`, which converts the
// numeric value instead (1.0f was written as the integer 1), so the MSC build
// stored different data than the other compilers. A byte-wise copy keeps the
// bits unchanged and is free of aliasing problems.
//
// k   : ignored on entry; kept for call compatibility.
// p_i : destination, must have room for four ints.
// p_f : source, must point to four floats (e.g. sum1.m128_f32).
//
// NOTE(review): s_put1_NC stores a single 32-bit value while this helper
// stores four -- confirm the intended element count against the caller.
void v_convert_f(int k, int *p_i, float *p_f)
{
    const unsigned char *src = (const unsigned char *) p_f;
    unsigned char *dst = (unsigned char *) p_i;
    const int n_bytes = (int) (4 * sizeof(float));

    for (k = 0; k < n_bytes; k++) {
        dst[k] = src[k];
    }
}
#endif
// =============================================================================
// v_GetPowerSpectrum
// seti_britta: comments for understanding, some small changes
// problem of s_put1_NC for MSC solved
//
// Per loop iteration: loads two vectors from FreqData, squares each, adds the
// squares, stores the sum through s_put1 and writes it to
// PowerSpectrum[bin_off + bin].
// The function header is generated by the GetPowerSpectrum_ptt macro (not
// shown here); FreqData, PowerSpectrum, this_fft_len, bin_len and bin_off are
// presumably supplied by it -- TODO confirm.
// =============================================================================
GetPowerSpectrum_ptt( sse2_v_GetPowerSpectrum )
{
// float view of FreqData, used as the target of the s_put1 store below
float *workBuf = (float *)FreqData;
register int i, bin; //seti_britta: hold var in register
int *p; //seti_britta: declared once, outside the loop
VEC sum1, sum2; //seti_britta: moved to here
sum1=sum2= ZERO; //seti_britta: initialised so the compiler emits no warnings
ALIGNED_YES( FreqData );
ALIGNED_YES( PowerSpectrum );
#if defined( _MSC_VER )
// MSC only: p_f1 points at the float lanes of sum1, so v_convert_f reads
// whatever sum1 currently holds.
float *p_f1 = (float *)(sum1.m128_f32); //seti_britta:new
register int *p_i;
register int k;
k = 0;
#endif
// seti_britta: let the loop run to the value of this_fft_len
// i counts FFT entries, bin advances by bin_len per entry.
for ( i = 0, bin = 0; i < this_fft_len; i++, bin += bin_len)
{
p = (int *)(&PowerSpectrum[bin_off + bin]); //seti_britta: int *p declared outside the loop
#if defined( _MSC_VER )
// same address as p; passed to v_convert_f as the destination
p_i = (int *)(&PowerSpectrum[bin_off + bin]); //seti_britta:new
#endif
// s_fetch on the entry 16 ahead of i; presumably a prefetch hint -- macro not shown, confirm
s_fetch( &FreqData[i+16][0] );
// NOTE(review): the two loads below use the constant indices 0 and 1, not i,
// so every iteration reads the same locations -- confirm against the
// original source.
sum1 = s_get1(&FreqData[0]); // load from FreqData[0] into sum1
sum1 = s_mult(sum1, sum1); // square sum1 in place
sum2 = s_get1(&FreqData[1]); // load from FreqData[1] into sum2
sum2 = s_mult(sum2, sum2); // square sum2 in place
sum1 = s_add(sum1, sum2); // sum of the two squares, kept in sum1
// WARNING: !! this store overwrites FreqData[0], so loop must go bottom to top !!
// reusing buffer - not needed after our psNum compute.
// NOTE(review): the loop above counts i upward, which does not match the
// "bottom to top" requirement stated here -- confirm.
// NOTE(review): &workBuf is the address of the pointer variable (float **),
// not of the buffer it points to; s_put1 expands to
// _mm_store_ss(float *, __m128), so this looks like it should be workBuf
// -- confirm.
s_put1(&workBuf, sum1); // workBuf = psNum; store sum1 via s_put1
#if defined( _MSC_VER )
v_convert_f(k, p_i, p_f1); //seti_britta: new, writes the lanes of sum1 to PowerSpectrum via p_i
#else
s_put1_NC(p, sum1 ); // non-temporal store of sum1 to PowerSpectrum via p
#endif
}
// When using non-caching (non-temporal) writes, you should always force
// the writes to be "globally visible" to possible other CPUs.
s_fence_writes();
}
-----------------------------------------------------------------------
and so on for sum1 till sum4 analog .......
If any of you has a better solution, let me know.
heinz ;)
Jason G:
Any luck with that s_put1_NC(p, sum1) call? I haven't looked at this code, but the types are local; what does it break if you just change them, i.e. VEC_I sum1, sum2 and VEC_I *p? Is PowerSpectrum not aligned?
_heinz:
@ Jason,
I had success; it compiled and linked successfully. But you need to do nothing here, since you have the Intel compiler.
If you look into the code you can see that Powerspectrum is aligned --->
ALIGNED_YES( PowerSpectrum );
I moved the variable definitions out of the block to the beginning, which reduces the prologue and epilogue of the block.
heinz ;)
_heinz:
Compiler Option /LTCG
how you can use it to optimize your app
click here
heinz
Jason G:
Yeah, works good sometimes, but you have to look carefully at the output because sometimes it does silly things .... and that is hard for link time because there is no source.
Navigation
[0] Message Index
[#] Next page
[*] Previous page
Go to full version