+- +-
Say hello if visiting :) by Gecko
11 Jan 2023, 07:43:05 pm

Seti is down again by Mike
09 Aug 2017, 10:02:44 am

Some considerations regarding OpenCL MultiBeam app tuning from algorithm view by Raistmer
11 Dec 2016, 06:30:56 am

Loading APU to the limit: performance considerations by Mike
05 Nov 2016, 06:49:26 am

Better sleep on Windows - new round by Raistmer
26 Aug 2016, 02:02:31 pm

Author Topic: optimized sources  (Read 615535 times)

Offline _heinz

  • Volunteer Developer
  • Knight who says 'Ni!'
  • *****
  • Posts: 2117
Re: optimized sources
« Reply #195 on: 30 Sep 2007, 03:35:29 pm »
The problem of s_put1_NC

Although it is already resolved, I will give you a short summary. If you try to compile opt_SS2.cpp with the MSC compiler, you will not succeed. The problem is the statement s_put1_NC(p, sum1 );
die typdefinitionen:
-----------------------------------------
// SSE vector type as quoted from the MSVC headers: a union that exposes the
// same storage either as four float lanes or as integer lanes of several widths.
// NOTE(review): __declspec(intrin_type) and _CRT_ALIGN(16) are MSVC-specific;
// presumably they mark the type as compiler-intrinsic and 16-byte aligned -- confirm.
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128 {
     float               m128_f32[4];   // four float lanes
     unsigned __int64    m128_u64[2];   // two unsigned 64-bit lanes
     __int8              m128_i8[16];   // sixteen signed 8-bit lanes
     __int16             m128_i16[8];   // eight signed 16-bit lanes
     __int32             m128_i32[4];   // four signed 32-bit lanes
     __int64             m128_i64[2];   // two signed 64-bit lanes
     unsigned __int8     m128_u8[16];   // sixteen unsigned 8-bit lanes
     unsigned __int16    m128_u16[8];   // eight unsigned 16-bit lanes
     unsigned __int32    m128_u32[4];   // four unsigned 32-bit lanes
 } __m128;


// Integer SSE vector type as quoted from the MSVC headers: a union of signed
// and unsigned integer lane views over the same storage. Unlike __m128 it has
// no float member.
// NOTE(review): __declspec(intrin_type) and _CRT_ALIGN(16) are MSVC-specific;
// presumably they mark the type as compiler-intrinsic and 16-byte aligned -- confirm.
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i {
    __int8              m128i_i8[16];   // sixteen signed 8-bit lanes
    __int16             m128i_i16[8];   // eight signed 16-bit lanes
    __int32             m128i_i32[4];   // four signed 32-bit lanes
    __int64             m128i_i64[2];   // two signed 64-bit lanes
    unsigned __int8     m128i_u8[16];   // sixteen unsigned 8-bit lanes
    unsigned __int16    m128i_u16[8];   // eight unsigned 16-bit lanes
    unsigned __int32    m128i_u32[4];   // four unsigned 32-bit lanes
    unsigned __int64    m128i_u64[2];   // two unsigned 64-bit lanes
} __m128i;

   // Project-level aliases: VEC is the float vector type, VEC_I the integer one.
   // They are distinct union types, so one cannot be cast directly to the other.
   typedef __m128 VEC;
   typedef __m128i VEC_I;







der Zeiger workBuf ist ein Zeiger auf float und erhält hier seinen Wert der auf FreqData zeigt

   float *workBuf = (float *)FreqData;

es wird sum1 definiert:
      VEC sum1, sum2;
Achtung es wird keine Variable für VEC_I definiert !!!! denke das ist der Fehler

der Zeiger p : ist ein Zeiger auf int und zeigt auf PowerSpectrum[bin_off + bin]

      int *p = (int *)(&PowerSpectrum[bin_off + bin]);



      s_put1(&workBuf, sum1);   //    workBuf = psNum;
wir lösen auf:
    #define s_put1( addr, bbbb )            _mm_store_ss( addr, bbbb )
extern void _mm_store_ss(float *_V, __m128 _A);

und setzen ein
      _mm_store_ss(&workBuf, sum1);   // alles OK soweit

      s_put1_NC(p, sum1 ); <--- Fehler
------------------------------------------------
wir lösen das Macro auf:
wir finden:
    #define s_put1_NC(ptr, aaaa)     _mm_stream_si32(ptr, s_extract_32bits(aaaa) );

wir finden:
    #define s_extract_32bits(aaaa)   _mm_cvtsi128_si32((VEC_I) aaaa)

der Befehl nach Auflösung:
----------------------------------------------
   _mm_stream_si32(p, _mm_cvtsi128_si32((VEC_I) sum1) ); <-- VEC kann nicht in VEC_I konvertiert werden


----------------------------------------------------------------------------------------------------------------------------------------------------------
the resolution:

// ----------------------------------------------------------------------------
//   Function:   v_convert_f(int k, int *p_i, float *p_f)
//   Typ      :   void
//   Inhalt   :   convert of sum1 and write back to PowerSpectrum
//            problem of s_put1_NC solved for MSC
//   parameter:   int k, int *p_i, float *p_f
//   last update:28.05.2007         by:seti_britta ~heinz
// ----------------------------------------------------------------------------
#ifdef _MSC_VER
// Copies the four float lanes at p_f to the four ints at p_i bit-for-bit.
//
// MSVC stand-in for s_put1_NC. That macro stores the raw 32-bit pattern of the
// vector (_mm_cvtsi128_si32 + _mm_stream_si32) into PowerSpectrum, which holds
// floats; the int pointer only exists to move the bits. The copy therefore has
// to preserve the bit pattern: a numeric (int) cast would turn 1.5f into the
// integer 1 and leave integer bit patterns inside the float array.
//
//   k   : ignored; kept in the signature so existing callers still compile.
//   p_i : destination with room for 4 ints (points into PowerSpectrum).
//   p_f : source of 4 floats (e.g. sum1.m128_f32).
//
// NOTE(review): s_put1_NC writes a single 32-bit value while this helper
// writes four -- confirm the intended lane count against the callers.
void v_convert_f(int k, int *p_i, float *p_f)
{
   const unsigned char *src = (const unsigned char *)p_f;
   unsigned char *dst = (unsigned char *)p_i;
   unsigned int n;

   (void)k;   // the caller's value was never used; the old loop reset it to 0

   // Byte-wise copy: keeps the float bit patterns intact and does not violate
   // aliasing rules (character-type access is always allowed).
   for (n = 0; n < 4 * sizeof(float); n++)
   {
      dst[n] = src[n];
   }
}
#endif

// =============================================================================
//     v_GetPowerSpectrum
// seti_britta: comments for understanding, some small changes
//            problem of s_put1_NC for MSC solved
// =============================================================================

// sse2_v_GetPowerSpectrum, declared through the GetPowerSpectrum_ptt macro.
// The macro is defined elsewhere, so the real parameter list is not visible
// here; FreqData, PowerSpectrum, this_fft_len, bin_len and bin_off are used
// below without being declared in this block.
//
// Each of the this_fft_len loop iterations loads two values from FreqData,
// multiplies each by itself, adds the products, stores the sum through s_put1
// and writes it to PowerSpectrum[bin_off + bin]: through v_convert_f() when
// built with MSVC, through the s_put1_NC store otherwise.
GetPowerSpectrum_ptt( sse2_v_GetPowerSpectrum )
{
   float *workBuf = (float *)FreqData;
   register int   i, bin;   //seti_britta: hold var in register
   int *p; //seti_britta: declared once, outside the loop
   
   VEC sum1, sum2;      //seti_britta: moved to here
   sum1=sum2= ZERO;   //seti_britta: initialised so the compiler emits no warnings

   ALIGNED_YES( FreqData );
   ALIGNED_YES( PowerSpectrum );

#if defined( _MSC_VER )
   // MSVC only: scalar views used by v_convert_f() in place of s_put1_NC.
   float *p_f1 = (float *)(sum1.m128_f32);  //seti_britta:new, points at the lanes of sum1
   register int *p_i;
   register int k;
   k = 0;
#endif
   // seti_britta: let the loop run to the value of this_fft_len
   // bin advances by bin_len per iteration, so consecutive results are written
   // bin_len elements apart in PowerSpectrum.
   for   ( i   = 0, bin = 0; i < this_fft_len; i++, bin += bin_len)
   {
      p = (int *)(&PowerSpectrum[bin_off + bin]); //seti_britta: int *p declared outside the loop
#if defined( _MSC_VER )
      p_i = (int *)(&PowerSpectrum[bin_off + bin]); //seti_britta:new, same address as p
#endif
      // NOTE(review): s_fetch is given the row 16 ahead of i; it looks like a
      // prefetch hint rather than a load -- confirm against the macro.
      s_fetch( &FreqData[i+16][0] );
      // NOTE(review): both loads use fixed indices 0 and 1 and do not depend on
      // i -- confirm this is intended.
      sum1 = s_get1(&FreqData[0]);   // load from FreqData[0] into sum1
      sum1 = s_mult(sum1, sum1);      // sum1 * sum1, result overwrites sum1
      sum2 = s_get1(&FreqData[1]);   // load from FreqData[1] into sum2
      sum2 = s_mult(sum2, sum2);      // sum2 * sum2, result overwrites sum2
      sum1 = s_add(sum1, sum2);      // sum1 = sum1 + sum2
         // WARNING: !! this store overwrites FreqData[0], so loop must go bottom to top !!
         //  reusing buffer - not needed after our psNum compute.
      // NOTE(review): &workBuf is the address of the pointer variable (a float **),
      // not of the buffer it points to -- confirm against the s_put1 macro.
      s_put1(&workBuf, sum1);   //    workBuf = psNum; store sum1 to workBuf
#if defined( _MSC_VER )
      v_convert_f(k, p_i, p_f1);   //seti_britta: new, convert function with write back to PowerSpectrum
#else
      s_put1_NC(p, sum1 );
#endif
   }

      // When using non-caching (non-temporal) writes, you should always force
      // the writes to be "globally visible" to possible other CPUs
   s_fence_writes();
}
-----------------------------------------------------------------------
and so on for sum1 till sum4 analog .......

If any of you has a better solution, let me know.
heinz  ;)

Offline Jason G

  • Construction Fraggle
  • Knight who says 'Ni!'
  • *****
  • Posts: 8980
Re: optimized sources
« Reply #196 on: 04 Oct 2007, 05:26:49 am »
Any luck with that s_put1_NC(p,  sum1 ) call?  I haven't looked at this code, but the types are local; what does it break if you just change them to VEC_I sum1, sum2 and VEC_I *p? Is PowerSpectrum not aligned?
« Last Edit: 04 Oct 2007, 05:30:36 am by j_groothu »

Offline _heinz

  • Volunteer Developer
  • Knight who says 'Ni!'
  • *****
  • Posts: 2117
Re: optimized sources
« Reply #197 on: 04 Oct 2007, 07:45:00 pm »
@ Jason,
I had success; it compiled and linked successfully. But you don't need to do anything here, since you have the Intel compiler.
If you look into the code you can see that  Powerspectrum is aligned --->
  ALIGNED_YES( PowerSpectrum );
I moved the variable definitions out of the block to the beginning, which reduces the prolog and epilog of the block.

heinz   ;)
« Last Edit: 04 Oct 2007, 08:28:19 pm by seti_britta »

Offline _heinz

  • Volunteer Developer
  • Knight who says 'Ni!'
  • *****
  • Posts: 2117
Re: optimized sources
« Reply #198 on: 08 Oct 2007, 08:33:05 pm »
Compiler Option /LTCG 
how you can use it to optimize your app
klick here

heinz

Offline Jason G

  • Construction Fraggle
  • Knight who says 'Ni!'
  • *****
  • Posts: 8980
Re: optimized sources
« Reply #199 on: 08 Oct 2007, 09:01:49 pm »
Yeah, works good sometimes, but you have to look carefully at the output because sometimes it does silly things .... and that is hard for link time because there is no source.

Offline Jason G

  • Construction Fraggle
  • Knight who says 'Ni!'
  • *****
  • Posts: 8980
Re: optimized sources
« Reply #200 on: 08 Oct 2007, 11:39:08 pm »
Any tips at all on handling cache thrashing on an early (non-HT) P4? I'm waiting on my second profile set, up to run 7 of 19  ::)

Offline _heinz

  • Volunteer Developer
  • Knight who says 'Ni!'
  • *****
  • Posts: 2117
Re: optimized sources
« Reply #201 on: 11 Oct 2007, 07:02:00 pm »
As you all know, a well-structured data set can be handled better and faster than a poorly structured one.
Aligned data is very important for optimal performance. We can help the compiler in its work if we follow some rules for data definitions:
1.) write all structures together, one after the other
2.) write all double together, one after the other
3.) write all float together, one after the other
4.) write all int together, one after the other
5.) write all char together, one after the other
6.) write all bool together, one after the other

don´t mix definitions and codepieces
don´t make definitions in loops
hold vars global to reduce prolog and epilog of blocks
avoid type-convert

organize  same data-fields as vectors and n-dimensional matrices.
Use structures for vars and datas because it is easy to align structures.

Use structured programming methods for the code.

regards heinz



Offline Jason G

  • Construction Fraggle
  • Knight who says 'Ni!'
  • *****
  • Posts: 8980
Re: optimized sources
« Reply #202 on: 11 Oct 2007, 07:13:43 pm »
HAHAHA,  I never saw some of those bad things before this week. I know exactly what you mean.

Offline _heinz

  • Volunteer Developer
  • Knight who says 'Ni!'
  • *****
  • Posts: 2117
Re: optimized sources
« Reply #203 on: 11 Oct 2007, 07:53:13 pm »
@jason,
sure, this is not the answer to your question... it is common

Offline Jason G

  • Construction Fraggle
  • Knight who says 'Ni!'
  • *****
  • Posts: 8980
Re: optimized sources
« Reply #204 on: 11 Oct 2007, 08:05:05 pm »
Sadly true, It takes a long time to understand the code this way.

Offline _heinz

  • Volunteer Developer
  • Knight who says 'Ni!'
  • *****
  • Posts: 2117
Re: optimized sources
« Reply #205 on: 11 Oct 2007, 08:39:56 pm »
the most important part is analyzeFuncs.cpp
I did my best to give it a little more structure for better reading and understanding
here are some short comments
// here all inline functions
// ============================================================================
// seti_britta: set inline functions direct before the main fkt
// hint:  you find the other functions  behind the closing brace of main fkt
// ----------------------------------------------------------------------------
//      order to find               used in:
//      ------------------------      -----------------------------
//      getMTFL                     do_generate_chirp_fft_pairs
//      load_wisdom                  do_generate_fft_coeff
//      save_wisdom                  do_generate_fft_coeff
//                              do_generate_chirp_fft_pairs
//      notify_user                  seti_analyze
//                              do_generate_fft_coeff
//                              do_generate_chirp_fft_pairs
//                              do_chirping_data
//                              do_return_best_of_signals
//      do_generate_fft_coeff         seti_analyze
//      do_generate_chirp_fft_pairs      seti_analyze
//      do_chirping_data            seti_analyze
//      do_transpose               seti_analyze
//      process_data               seti_analyze
//      do_analyse_pot               seti_analyze
//      do_return_best_of_signals      seti_analyze
//      stats_output               do_generate_chirp_fft_pairs
//
//
// ============================================================================
all functions have now heads like this ----->
// ----------------------------------------------------------------------------
//   Function:   getMTFL
//   Typ      :   int
//   Inhalt   :   Find maximum FFT length for which transpose of PowerSpectrum
//            is needed
//   parameter:   int maxFFTLen
//   last update:         by:
// ----------------------------------------------------------------------------

and that is my actual main loop ---->
// ----------------------------------------------------------------------------
//   Function:   seti_analyze
//   Typ      :   int
//   Inhalt   :   seti_analyze
//         The main analysis function. Args: state pointer to data, # of
//         points, starting chirp/fftlen Must be called with unchirped data;
//         this function modifies (chirps) the data in place swi parsed WU header         
//   parameter:   ANALYSIS_STATE &state
//
//   last update:18.06.2007   by:seti_britta
// ----------------------------------------------------------------------------
// Part 1   allocation and init
// Part 2   generate fft coefficients, save into wisdom
// Part 3   generate chirp/fft pairs, do different calcs in preparation analyze
// Part 4   loop through chirp/fft pairs - this is the top level analysis loop.
// Part 4.1 chirping data
// Part 4.2 do transpose if needed
// Part 4.3 process data
// Part 4.4 analyze power over time (POT), set checkpoint
// Part 5.   return the "best of" signals and do the rest
//
// ----------------------------------------------------------------------------
int seti_analyze( ANALYSIS_STATE &state )
{
// Part 1   allocation and init
    bitfield    = swi.analysis_cfg.analysis_fft_lengths;
    DataIn      = state.savedWUData;
    NumDataPoints = state.npoints;
   ChirpedData   = NULL;
    WorkData      = NULL;
    PowerSpectrum = NULL;
    num_cfft = retval = 0;
    MinChirpStep  = 0.0;
    last_chirp_ind = -1 << 20;
    cputime0 = 0;
   int have_transpose = false;   // seti_britta: used in: do_transpose(); process_data();
   d_log2 = log ( 2.0 );
    #if defined( USE_IPP )
        ippStaticInit();        // initialization of IPP library
    #elif defined( USE_FFTWF )
        // plan space for fftw
        // fftwf_plan  analysis_plans[MAX_NUM_FFTS]; //now out external
    #else
        // fields need by the ooura fft logic
        int         *BitRevTab[MAX_NUM_FFTS];
        float       *CoeffTab[MAX_NUM_FFTS];
    #endif
    ChirpedData = state.data;
    PowerSpectrum = ( float * ) calloc_a(NumDataPoints, sizeof(float), MEM_ALIGN);
    if (PowerSpectrum == NULL) SETIERROR(MALLOC_FAILED, "PowerSpectrum == NULL");
    notify_user( "Choosing optimal functions" );   
    CacheChirpCalc  = optimize_init(); // choose fastest function
// end Part 1   allocation and init
   do_generate_fft_coeff();// Part 2 generate fft coefficients, save into wisdom
   do_generate_chirp_fft_pairs();   // Part 3 generate chirp/fft pairs
// Part 4   loop through chirp/fft pairs - this is the top level analysis loop.
   chirp_units = 0;
    for ( icfft = state.icfft; icfft < num_cfft; icfft++ )// the big loop
    {
      do_chirping_data();   // Part 4.1 chirping data
        if (fftlen <= MaxTransposeFftLen)
         do_transpose();   // Part 4.2 do tanspose, use strips of 4
      process_data();      // Part 4.3 process data
      do_analyse_pot();   // Part 4.4 do analyze pot
   }// end loop over chirp/fftlen paris
   do_return_best_of_signals();// Part 5 return "best of" signals and do the rest
return retval; // finish seti_analyze
}   // end of seti_analyze
// ============================================================================
// seti_britta: here after the closing brace of the main fkt are the functions
// you find it in the following order:            used in:
//      enough_ram                           not found
//      v_BaseLineSmooth                     do_generate_chirp_fft_pairs
//      GetPowerSpectrum_ptt                  not found
//      PwrSpectrumOnly_ptt                     not found
//      TransposeStrip_ptt                     not found
//      v_subTranspose                        TransposeStrip_ptt
//      TransposeStrip_ptt( orig_v_Transpose2 )      not found
//      TransposeStrip_ptt( orig_v_Transpose4 )      not found
//
//
//
//hint: functions which are not found will be used in other sourcefiles.
// ----------------------------------------------------------------------------


hoping that helps
regards heinz  ;)

Offline Jason G

  • Construction Fraggle
  • Knight who says 'Ni!'
  • *****
  • Posts: 8980
Re: optimized sources
« Reply #206 on: 11 Oct 2007, 08:43:59 pm »
Much nicer thank you :D

Offline Jason G

  • Construction Fraggle
  • Knight who says 'Ni!'
  • *****
  • Posts: 8980
Re: optimized sources
« Reply #207 on: 12 Oct 2007, 11:26:09 am »
Quote
// Part 1   allocation and init
// Part 2   generate fft coefficients, save into wisdom
// Part 3   generate chirp/fft pairs, do different calcs in preparation analyze
// Part 4   loop through chirp/fft pairs - this is the top level analysis loop.
// Part 4.1 chirping data
// Part 4.2 do transpose if needed
// Part 4.3 process data
// Part 4.4 analyze power over time (POT), set checkpoint
// Part 5.   return the "best of" signals and do the rest

Here you are starting to see some encapsulation of the underlying processes, within which are the optimised routines.  I am starting to think some classes will help for those,  instead of the pointer juggling table.

Just initial thoughts,
    optimalChirpFunc = chirpFuncCollection.bestChirp(...);
    cerr << "Optimal Chirping Function Chosen: " << optimalChirpFunc.name() << endl;
    ...
    optimalChirpFunc.doChirp(...);
 

Jason
« Last Edit: 12 Oct 2007, 11:36:39 am by j_groothu »

Offline _heinz

  • Volunteer Developer
  • Knight who says 'Ni!'
  • *****
  • Posts: 2117
Re: optimized sources
« Reply #208 on: 22 Oct 2007, 09:08:11 pm »
For better understanding the benchmark and the complexity of the optimizing process I compiled FFTW-3.1.2 for Windows using VS2005. See attachment ---> FFTW.7z
here is a first result ---> benchf_sse.exe -opatient 64 128 256 512 1024 2048 4096
fftw-3.1.2 benchfsse started
Problem: 64, setup: 29.83 ms, time: 2.26 us, ``mflops'': 849.14
Problem: 128, setup: 68.43 ms, time: 6.43 us, ``mflops'': 697.23
Problem: 256, setup: 192.97 ms, time: 14.04 us, ``mflops'': 729.44
Problem: 512, setup: 383.39 ms, time: 30.87 us, ``mflops'': 746.36
Problem: 1024, setup: 886.80 ms, time: 70.96 us, ``mflops'': 721.55
Problem: 2048, setup: 2.18 s, time: 155.05 us, ``mflops'': 726.49
Problem: 4096, setup: 5.75 s, time: 339.71 us, ``mflops'': 723.44
fftw-3.1.2 benchfsse ended.
------------------------------------------------------------------------------------------------------------------------
If you want to know what is going on here you can read the manual there
have fun
regards heinz   ;D

[attachment deleted by admin]

Offline Jason G

  • Construction Fraggle
  • Knight who says 'Ni!'
  • *****
  • Posts: 8980
Re: optimized sources
« Reply #209 on: 26 Oct 2007, 08:16:21 am »
Nice one, I'll be taking a look at that too soon, as the FFTs (using Intel IPP at the moment) and the pulse folding (which mostly selects AK varieties in tests) are generating many cache issues on my old machines.  As I have worked with custom FFTs before, it'll be interesting to poke around in there (difficult to try with IPP  ;) ) to see how far things have come. 

The more Intel literature I read, the more it suggests that significant speedups will be possible for my old P4s.  With specific optimising techniques, a 3+ times speedup of certain types of loops is possible.  I'll set my goals low and settle for a 5 to 10% crunch time improvement across angle ranges :)

The problems I've managed to identify so far in the  inner loops in the FFT and folding are p4 specific, but apparently apply to some (or all)  of the p4 based xeons as well (and of course p4 based celerons too) .  Looking at BoincStats, If that's anything to go by,  that's one heck of a lot of active machines. :o

  I am a bit surprised that Intel's own IPP is doing this, of course they've moved on to newer faster architectures, perhaps there are some implementation specific aspects of IPP I haven't come across yet, that'll all be fun to find out ....

I still will examine the costly memcopies further but may try integrating them using some of the processing methods put forward by Joe Segur, If I get the chance by christmas time.  There are limited options for further parallelisation on my old single core beasts, and they look like a good one.

Keep going, I'm still paying attention when I can :D  Even though you are working a different platform the approach still help my very gradual understanding.

Jason

 

Welcome, Guest.
Please login or register.
 
 
 
Forgot your password?
Members
Total Members: 97
Latest: ToeBee
New This Month: 0
New This Week: 0
New Today: 0
Stats
Total Posts: 59559
Total Topics: 1672
Most Online Today: 355
Most Online Ever: 983
(20 Jan 2020, 03:17:55 pm)
Users Online
Members: 0
Guests: 33
Total: 33
Powered by EzPortal