//***************************************************************************************
// Administrative routines. 
//***************************************************************************************
//
// Author: Rob F. Van der Wijngaart
//         Intel Corporation
// Date:   008/30/2010
//
//***************************************************************************************
//
// 
// Copyright 2010 Intel Corporation
// 
//    Licensed under the Apache License, Version 2.0 (the "License");
//    you may not use this file except in compliance with the License.
//    You may obtain a copy of the License at
// 
//        http://www.apache.org/licenses/LICENSE-2.0
// 
//    Unless required by applicable law or agreed to in writing, software
//    distributed under the License is distributed on an "AS IS" BASIS,
//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//    See the License for the specific language governing permissions and
//    limitations under the License.
// 
#include "RCCE_lib.h"
#ifdef RC_POWER_MANAGEMENT
  #include "RCCE_lib_pwr.h"
#endif

#ifdef COPPERRIDGE
#ifndef SCC
#define SCC
#endif
#endif

#ifdef SCC
  #include <unistd.h>
  #include <stdlib.h>
  #include <stdint.h>
  #include <limits.h>
#ifndef __hermit__
  #include <sys/mman.h>
  #include "SCC_API.h"
#else
  #define RCCE_SESSION_ID	42
  #include "syscall.h"
  extern unsigned int get_cpufreq();
#endif
#endif
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <fcntl.h>

// Enable or disable debug prints...
#define DEBUG 1
#define LOCKDEBUG 1

#undef SHMDBG

#ifdef __hermit__
// Atomic test-and-set on a single byte.
// Exchanges the byte at `lock` with the value 1 using the x86 `lock xchgb`
// instruction and returns the byte that was stored there before the exchange.
static inline int tas(t_vcharp lock)
{
	// value to be swapped in; receives the previous content of *lock
	register unsigned char _res = 1;

	asm volatile(
                "lock; xchgb %0,%1"
		: "=q"(_res), "=m"(*lock)
		: "0"(_res));
	return (int) _res;
}
#define Test_and_Set(a) tas(virtual_lockaddress[a])
#elif defined(SCC)
// Test and Set method
#define Test_and_Set(a) ((*(virtual_lockaddress[a])) & 0x01)
#endif
#define BACKOFF_MIN 8
#define BACKOFF_MAX 256

#ifdef __hermit__
// Ticket-style lock state shared through `rcce_lock`.
// islelock_lock() atomically increments `queue` to draw a ticket and spins
// until `dequeue` equals that ticket; islelock_unlock() increments `dequeue`.
typedef struct islelock {
	// Internal queue: incremented by every caller that wants the lock
	int32_t queue;
	// Internal dequeue: incremented on every release
	int32_t dequeue;
} islelock_t;

extern islelock_t* rcce_lock;

/*
 * Use our own implementation of "atomic_add_return" to guarantee
 * that the lock prefix is used.
 *
 * Atomically adds i to *d with `lock xaddl` and returns the resulting
 * value, i.e. the content of *d before the addition plus i.
 */
inline static int _hermit_atomic_add(int32_t *d, int i)
{
	// remember the addend; after xaddl, i holds the previous value of *d
	int res = i;
	asm volatile("lock; xaddl %0, %1" : "=r"(i) : "m"(*d), "0"(i) : "memory", "cc");
	return res+i;
}

/*
 * Acquire the global ticket lock referenced by `rcce_lock`.
 *
 * A ticket is drawn by atomically incrementing `queue`; the caller then
 * spins until `dequeue` reaches the ticket value. Always returns 0.
 */
static inline int islelock_lock(void)
{
	int ticket;

	ticket = _hermit_atomic_add(&rcce_lock->queue, 1);
	while(rcce_lock->dequeue != ticket) {
		/*
		 * The "memory" clobber makes this statement a compiler barrier, so
		 * `dequeue` (a plain, non-volatile field) is re-read from memory on
		 * every iteration instead of being hoisted out of the loop.
		 */
		asm volatile ("pause" ::: "memory");
	}

	return 0;
}

/*
 * Release the global ticket lock: atomically advance the `dequeue`
 * counter of `rcce_lock` by one. Always returns 0.
 */
static inline int islelock_unlock(void)
{
	int32_t *const now_serving = &rcce_lock->dequeue;

	(void) _hermit_atomic_add(now_serving, 1);
	return 0;
}
#endif

//......................................................................................
// GLOBAL VARIABLES USED BY THE LIBRARY
//......................................................................................
unsigned int next;
int       RCCE_NP;               // number of participating cores
int       RCCE_DEVICE_NR;             // device number of the scc board
int       RCCE_NUM_DEVICES;      // total number of scc boards involved
int       RCCE_NUM_UES_DEVICE[RCCE_MAX_BOARDS]; // number of participating cores per board
int       RCCE_UE_TO_DEVICE[RCCE_MAXNP]; // device id of each core
int       RCCE_DEVICE_LOCAL_UE;  // device-local core id
double    RC_REFCLOCKGHZ;        // baseline CPU frequency (GHz)
int       RC_MY_COREID;          // physical ID of calling core
int       RC_COREID[RCCE_MAXNP]; // array of physical core IDs for all participating 
                                 // cores, sorted by rank
int       RCCE_IAM=-1;           // rank of calling core (invalid by default)
RCCE_COMM RCCE_COMM_WORLD;       // predefined global communicator
int       RCCE_BUFF_SIZE;        // available MPB size
t_vcharp  RCCE_comm_buffer[RCCE_MAXNP]; // starts of MPB, sorted by rank
#ifndef __hermit__
//#ifdef USE_FLAG_EXPERIMENTAL
t_vcharp  RCCE_flag_buffer[RCCE_MAXNP];
//#endif
#endif
#ifndef GORY
  // ......................... non-GORY communication mode .............................
  // synchronization flags are predefined and maintained by the library
  RCCE_FLAG RCCE_sent_flag[RCCE_MAXNP], RCCE_ready_flag[RCCE_MAXNP];
#ifdef USE_PIPELINE_FLAGS
  RCCE_FLAG RCCE_sent_flag_pipe[RCCE_MAXNP], RCCE_ready_flag_pipe[RCCE_MAXNP];
#endif
#ifdef USE_PROBE_FLAGS
  RCCE_FLAG RCCE_probe_flag[RCCE_MAXNP];
#endif
  RCCE_FLAG RCCE_barrier_flag[RCCE_MAXNP];
  RCCE_FLAG RCCE_barrier_release_flag;
  // payload part of the MPBs starts at a specific address, not malloced space
  t_vcharp RCCE_buff_ptr;
  // maximum chunk size of message payload is also specified
  size_t RCCE_chunk;
  // synchronization flags will be allocated at this address
  t_vcharp  RCCE_flags_start;

#ifndef USE_REMOTE_PUT_LOCAL_GET
  // send request queue
  RCCE_SEND_REQUEST* RCCE_send_queue;
  // recv request queue
  RCCE_RECV_REQUEST* RCCE_recv_queue[RCCE_MAXNP];
#else
  // send request queue
  RCCE_SEND_REQUEST* RCCE_send_queue[RCCE_MAXNP];
  // recv request queue
  RCCE_RECV_REQUEST* RCCE_recv_queue;
#endif

#endif // !GORY

#ifndef __hermit__
t_vcharp RCCE_fool_write_combine_buffer;
#endif
// int air_counter = 0;

#ifdef SCC
  // virtual addresses of test&set registers
  t_vcharp virtual_lockaddress[RCCE_MAXNP];
#endif
//......................................................................................
// END GLOBAL VARIABLES USED BY THE LIBRARY
//......................................................................................

#ifdef SCC
#ifdef __hermit__
// Read the x86 time stamp counter with `rdtsc`.
// The instruction's EAX output is captured in `lo` and its EDX output in `hi`;
// the two are combined into one 64-bit value (hi in the upper 32 bits).
inline volatile uint64_t _rdtsc() {
	register uint64_t lo, hi;
	asm volatile ("rdtsc" : "=a"(lo), "=d"(hi) );
	return ((uint64_t)hi << 32ULL | (uint64_t)lo);
}
#elif defined(__INTEL_COMPILER)
    // Intel-compiler variant: emits the raw opcode bytes 15, 49 (0x0f 0x31)
    // and returns the variable that is bound to register eax.
    // NOTE(review): TSC is declared long long but bound to "eax"; how the
    // compiler maps the upper half (edx?) is not visible here - confirm.
    inline volatile long long _rdtsc() {
      register long long TSC __asm__("eax");
      __asm__ volatile (".byte 15, 49" : : : "eax", "edx");
      return TSC;
    }
#endif
#endif

//--------------------------------------------------------------------------------------
// FUNCTION: RC_cache_invalidate
//--------------------------------------------------------------------------------------
// invalidate (not flush!) lines in L1 that map to MPB lines
//--------------------------------------------------------------------------------------
#ifndef __hermit__
void RC_cache_invalidate() {
  // Only SCC builds emit the two-byte opcode 0x0f 0x0a (CL1FLUSHMB);
  // in every other configuration this function has an empty body.
#ifdef SCC
  __asm__ volatile ( ".byte 0x0f; .byte 0x0a;\n" ); // CL1FLUSHMB
#endif
}
#endif

// Busy-wait for `wait` iterations of a `nop`/`loop` pair.
//
// wait: number of loop iterations; values <= 0 return immediately (the x86
//       `loop` instruction decrements before testing, so a zero count would
//       otherwise spin for a full wrap-around of the count register).
//
// The count is handed to the asm directly in the count register ("+c") using
// a register-wide type, so the same code is correct on 32-bit and 64-bit
// targets. The numeric local label `1:` / `1b` may be emitted any number of
// times in one assembly file, which keeps the function safe to inline at
// several call sites (a named label such as `L1` would be defined twice).
static inline void RC_wait(int wait) {
  size_t count;

  if (wait <= 0)
    return;

  count = (size_t) wait;
  __asm__ volatile(   "1: nop\n\t"
                      "loop 1b"
                      : "+c" (count) /* count register is consumed */
                      : /* no further inputs */
                      );
}

//--------------------------------------------------------------------------------------
// FUNCTION: RC_COMM_BUFFER_SIZE
//--------------------------------------------------------------------------------------
// return total available MPB size on chip
//--------------------------------------------------------------------------------------
int RC_COMM_BUFFER_SIZE() {
  // per-core maximum MPB size multiplied by the maximum number of cores
  const int total_mpb_bytes = RCCE_BUFF_SIZE_MAX*RCCE_MAXNP;

  return total_mpb_bytes;
}

//--------------------------------------------------------------------------------------
// FUNCTION: RC_COMM_BUFFER_START
//--------------------------------------------------------------------------------------
// return (virtual) start address of MPB for UE with rank ue
//--------------------------------------------------------------------------------------
t_vcharp RC_COMM_BUFFER_START(int ue){
#ifdef __hermit__
  // HermitCore: ask the kernel for the buffer of the core with rank ue;
  // a NULL result is fatal (finalize, then exit with status 1)
  t_vcharp mpb_start = (t_vcharp) sys_rcce_malloc(RCCE_SESSION_ID, RC_COREID[ue]);

  if (!mpb_start) {
    fprintf(stderr, "rcce_malloc failed\n");
    RCCE_finalize();
    exit(1);
  }
  return mpb_start;
#elif defined(SCC)
  // "Allocate" MPB, using memory mapping of physical addresses
  t_vcharp mpb_start;
  // non-zero when the target core has the same x and y coordinates as the caller
  const int same_xy = (X_PID(RC_COREID[ue]) == X_PID(RC_COREID[RCCE_IAM])) &&
                      (Y_PID(RC_COREID[ue]) == Y_PID(RC_COREID[RCCE_IAM]));

#ifdef SCC_COUPLED_SYSTEMS
  MPBalloc(&mpb_start,
           X_PID(RC_COREID[ue]), Y_PID(RC_COREID[ue]), Z_PID(RC_COREID[ue]),
           RC_COREID[ue] / RCCE_MAXNP_PER_BOARD, RCCE_DEVICE_NR,
           same_xy);
#else
  MPBalloc(&mpb_start,
           X_PID(RC_COREID[ue]), Y_PID(RC_COREID[ue]), Z_PID(RC_COREID[ue]),
           same_xy);
#endif
  return mpb_start;
#else
  // even in functional emulation mode we leave gaps in the global MPB
  return RC_comm_buffer + RC_COREID[ue]*RC_COMM_BUFFER_SIZE()/RCCE_MAXNP;
#endif
}

#ifndef __hermit__
//#ifdef USE_FLAG_EXPERIMENTAL
// Return the (virtual) start address of the flag buffer of the UE with rank
// ue, obtained from FLAGalloc().
//
// ue: rank of the core whose flag buffer is requested.
//
// The last FLAGalloc() argument is non-zero when the target core has the same
// x and y coordinates as the calling core. In coupled-system builds the board
// index of the target core and the local device number are passed as well.
// The build variant is selected with #ifdef, like every other
// SCC_COUPLED_SYSTEMS test in this file, so a macro defined without a value
// is handled too.
t_vcharp RC_FLAG_BUFFER_START(int ue){
  // "Allocate" MPB, using memory mapping of physical addresses
  t_vcharp retval;
#ifdef SCC_COUPLED_SYSTEMS
  FLAGalloc(&retval, X_PID(RC_COREID[ue]), Y_PID(RC_COREID[ue]), Z_PID(RC_COREID[ue]), RC_COREID[ue] / RCCE_MAXNP_PER_BOARD, RCCE_DEVICE_NR, (X_PID(RC_COREID[ue]) == X_PID(RC_COREID[RCCE_IAM])) &&
             (Y_PID(RC_COREID[ue]) == Y_PID(RC_COREID[RCCE_IAM]))
           );
#else
  FLAGalloc(&retval, X_PID(RC_COREID[ue]), Y_PID(RC_COREID[ue]), Z_PID(RC_COREID[ue]),(X_PID(RC_COREID[ue]) == X_PID(RC_COREID[RCCE_IAM])) &&
             (Y_PID(RC_COREID[ue]) == Y_PID(RC_COREID[RCCE_IAM]))
           );
#endif
  return retval;
}
//#endif
#endif

//--------------------------------------------------------------------------------------
// FUNCTION: RC_SHM_BUFFER_START
//--------------------------------------------------------------------------------------
// return (virtual) start address of off-chip shared memory
//--------------------------------------------------------------------------------------
#ifndef __hermit__
#ifndef SCC_COUPLED_SYSTEMS
t_vcharp RC_SHM_BUFFER_START(){
#ifndef SCC
  // functional emulation: the shared-memory buffer is a plain global
  return RC_shm_buffer;
#else
  t_vcharp shm_base;

  SHMalloc(&shm_base); //SHMalloc() is in SCC_API.c
  return shm_base;
#endif
}
#else
t_vcharp RC_SHM_BUFFER_START(int device){
  t_vcharp shm_base;

  // the local board is served by SHMalloc(), any other board by RMalloc()
  if (device != RCCE_DEVICE_NR) {
    RMalloc(&shm_base, device);
  } else {
    SHMalloc(&shm_base);
  }

  return shm_base;
}
#endif
#endif

extern int isle_id(void);

//--------------------------------------------------------------------------------------
// FUNCTION: MYCOREID
//--------------------------------------------------------------------------------------
// return physical core ID of calling core
//--------------------------------------------------------------------------------------
int MYCOREID() {
#ifdef __hermit__
  return isle_id();
#elif defined(SCC)
  // decode the tile-ID configuration register of the calling core
  const int tile_reg = ReadConfigReg(CRB_OWN+MYTILEID);
  const int z = (tile_reg     ) & 0x07; // bits 02:00
  const int x = (tile_reg >> 3) & 0x0f; // bits 06:03
  const int y = (tile_reg >> 7) & 0x0f; // bits 10:07
  int core_id = ( ( x + ( 6 * y ) ) * 2 ) + z; // True Processor ID!

#ifdef SCC_COUPLED_SYSTEMS
  // coupled systems: shift the ID by the core count of the preceding boards
  core_id += RCCE_MAXNP_PER_BOARD * RCCE_DEVICE_NR;
#endif
  return core_id;
#else
  // The core IDs are read by the main program in potentially random order.
  // Each core can access its own core ID; the emulator mimics that by picking
  // the list entry that matches the calling OpenMP thread number.
  return RC_COREID[omp_get_thread_num()];
#endif // SCC
}

#if defined(SCC)
//--------------------------------------------------------------------------------------
// FUNCTIONS: Locksuite for test-purpose
//--------------------------------------------------------------------------------------
// acquire lock corresponding to core with rank ID
//--------------------------------------------------------------------------------------
int RCCE_try_lock(int ID) {
  // a single, non-blocking attempt: success only if Test_and_Set() is non-zero
  return Test_and_Set(ID) ? (RCCE_SUCCESS) : (RCCE_PENDING);
}

int RCCE_TNS_barrier(RCCE_COMM* comm) {

// Barrier built from one T&S register per core, using two round trips:
//
// 1. every UE searches for the first free T&S register and spins on it
// 2. the last arriving UE wakes up the first waiter and keeps waiting locally
// 3. each woken waiter wakes up the next one by releasing that UE's lock
// Afterwards every used T&S register is 0 and no UE can overtake a barrier.

  const int last_slot = comm->size - 1;
  int slot = 0;
  int is_last;

  // claim registers in ascending order until one is already taken;
  // only one UE gets as far as T&S register last_slot
  while( !Test_and_Set(slot) ) ++slot;

  is_last = (slot == last_slot);

  // the last arrival starts the release chain at register 0
  if(is_last)
    *(virtual_lockaddress[0]) = 0x0;

  // wait for our own register to be released, then clear it again
  while(!Test_and_Set(slot));
  *(virtual_lockaddress[slot]) = 0x0;

  // everybody but the last arrival passes the release on to the next slot
  if(!is_last)
    *(virtual_lockaddress[slot+1]) = 0x0;

  return RCCE_SUCCESS;
}

// Non-blocking variant of RCCE_TNS_barrier.
// Instead of spinning on its T&S register, the function returns RCCE_PENDING
// when the register is not yet available and stores its progress in the
// communicator (comm->step: register index, comm->label: resume point), so
// that the next call with the same communicator continues where it left off.
// Returns RCCE_SUCCESS once the barrier has been passed (comm->label is then
// reset to 0).
int RCCE_nb_TNS_barrier(RCCE_COMM* comm) {

// two roundtrips to realize a barrier using a T&S Register for each core.

// 1. search first free T&S Register to spin
// 2. last waiter wakes up first waiter and continues local wait
// 3. first waiter wakes up second waiter by releasing its lock ...
// At least every used T&S Register is 0 and no UE can overtake a barrier.

  int num = comm->size;
  int step = 0;
  //fprintf(stderr,"%d:\t enter barrier \n",id);

  // resume a previously pending call at the point where it returned
  if(comm->label == 1) goto label1;
  if(comm->label == 2) goto label2;

  while( !Test_and_Set(step) ) ++step;
  // only one UE runs until T&S # num-1

  //fprintf(stderr,"%d:\t step %d\n",id,step);

  if(step == num-1) {
    //fprintf(stderr,"%d:\t I am the last one\n",id);
    // last arrival: start the release chain at register 0
    *(virtual_lockaddress[0]) = 0x0;
    comm->step = step;
  label1:
    // (re)load the register index saved in the communicator
    step = comm->step;
    if(!Test_and_Set(step))
    {
      // not released yet: remember to resume at label1
      comm->label = 1;
      return RCCE_PENDING;
    }
    *(virtual_lockaddress[step]) = 0x0;
  } else {
    comm->step = step;
  label2:
    // (re)load the register index saved in the communicator
    step = comm->step;
    if(!Test_and_Set(step))
    {
      // not released yet: remember to resume at label2
      comm->label = 2;
      return RCCE_PENDING;
    }
    // clear own register and pass the release on to the next one
    *(virtual_lockaddress[step]) = 0x0;
    *(virtual_lockaddress[step+1]) = 0x0;
  }
  //fprintf(stderr,"released barrier! step: %d\n", step);
  comm->label = 0;
  return RCCE_SUCCESS;
}

#ifdef AIR
RCCE_AIR RCCE_atomic_inc_regs[2*RCCE_MAXNP];

// Barrier on RCCE_COMM_WORLD based on the atomic increment registers, with a
// timing heuristic that delays re-polling for communicators larger than 16.
// For any other communicator the call is forwarded to RCCE_barrier().
//
// A static index alternates between register set 0 and 1 on every call.
// The `.counter` register is read once; if the value read is below size-1 the
// caller polls `.init` until it is no longer in the open range (0, size),
// otherwise the caller writes 0 to `.init`.
// NOTE(review): presumably reading `.counter` atomically increments it and the
// write to `.init` releases the waiters - confirm against the register spec.
// NOTE(review): time, time1 and time2 are unsigned long long while the values
// derived from RCCE_wtime() are compared with fractional constants such as
// 0.0050; if RCCE_wtime() returns seconds as a floating-point value these
// assignments truncate - verify.
int RCCE_AIR_barrier2(RCCE_COMM *comm)
{
  static int idx = 0;
  unsigned long long time, time1, time2;
  float ran = 0;
  int id, val = 0, val2 = 0;
  int window = comm->size;
  int ue = RCCE_ue();
  int x = X_PID(ue), y = Y_PID(ue);
  int win = 1000000;

  // ++air_counter;
  if (comm == &RCCE_COMM_WORLD) {
    time = RCCE_wtime();
    if ((id = *RCCE_atomic_inc_regs[idx].counter) < (comm->size-1)) 
    {
      if(window > 16) {
        val = id;
        val2 = val;
        time1 = RCCE_wtime();;

        if(window > 26)
        {
          // position-dependent offset plus a window derived from elapsed time
          ran = ((y+x)%8)*window*window/24000000.0;
          window = (RCCE_wtime() - time)*win;//(RCCE_wtime() - time)*1000000.0;
        }
        else
          window = 1;
        // add a pseudo-random component (libc rand()) to the delay factor
        ran = ran+(rand()%(window))/(win*100.0);
        do
        {
          time = RCCE_wtime() - time;
          time2 = RCCE_wtime()-time1-time/2;
          time1 = RCCE_wtime();
          // delay before the next poll; the inner check caps the delay
          while(RCCE_wtime()-time1 < (((0.424+ran)*(comm->size-val)*(time2)/(val-val2+1)-time/2)))
          {
            if(RCCE_wtime()-time1>0.0050)
              break;
          }
          val2 = val;
          time = RCCE_wtime();
          // ++air_counter;
        } while ((val = *RCCE_atomic_inc_regs[idx].init) > 0 && (val < comm->size));
      }
      else
      {
        // small communicator: poll `.init` without any delay
        do
        {
          // ++air_counter;
        }
        while ((val = *RCCE_atomic_inc_regs[idx].init) > 0 && (val < comm->size));
      }
      
    }
    else
    {
      *RCCE_atomic_inc_regs[idx].init = 0;	
    }
    // use the other register set for the next barrier
    idx = !idx;
    return(RCCE_SUCCESS);
  }
  else
  {
    return RCCE_barrier(comm);
  }
}

#ifndef GORY
// Dissemination-style barrier over all UEs using the predefined barrier flags.
//
// comm: accepted for interface compatibility; the rank and UE count are taken
//       from RCCE_ue() and RCCE_num_ues().
//
// In each round the distance k doubles (1, 2, 4, ...) while k < num_ues. The
// caller sets its own barrier flag at UE (ue+k) mod num_ues, waits until UE
// (ue-k) mod num_ues has set its flag here, and clears that flag again.
// Always returns RCCE_SUCCESS.
int RCCE_dissemination_barrier(RCCE_COMM *comm)
{
  int k, max_rounds;
  int ue, num_ues, ue_signal;
  ue = RCCE_ue();
  num_ues = RCCE_num_ues();
  // The former expression num_ues*(1+(num_ues%2)?1:0) parsed as
  // num_ues*((1+(num_ues%2))?1:0) because ?: binds weaker than +, so it always
  // evaluated to num_ues. State that bound explicitly.
  max_rounds = num_ues;

  for(k = 1; k < max_rounds; k = k*2 )
  {
    /* signal the partner k ranks ahead */
    ue_signal = (ue+k)%num_ues;
    RCCE_flag_write(&RCCE_barrier_flag[RCCE_IAM], RCCE_FLAG_SET, ue_signal);
    /* wait for the partner k ranks behind, then reset its flag locally */
    ue_signal = (ue-k+num_ues+num_ues)%num_ues;
    RCCE_wait_until(RCCE_barrier_flag[ue_signal], RCCE_FLAG_SET);
    RCCE_flag_write(&RCCE_barrier_flag[ue_signal], RCCE_FLAG_UNSET, RCCE_IAM);
  }

  return(RCCE_SUCCESS);
}
#endif

// Build a tree with num_children children per node over all UEs of
// RCCE_COMM_WORLD and copy the calling UE's node into *tree.
// Children are numbered consecutively: node 0 gets ranks 1..num_children,
// node 1 the following ranks, and so on; node 0 has parent -1.
// Returns !RCCE_SUCCESS for any communicator other than RCCE_COMM_WORLD.
int RCCE_tree_init(RCCE_COMM *comm, tree_t *tree, int num_children) {
  tree_t layout[RCCE_MAXNP];
  int my_ue, total_ues;
  int parent, slot;
  int next_rank = 1;  // next rank that still needs a parent

  if(comm != &RCCE_COMM_WORLD)
    return(!RCCE_SUCCESS);
  my_ue = RCCE_ue();
  total_ues = RCCE_num_ues();

  layout[0].parent = -1;

  for(parent = 0; parent < total_ues; ++parent)
  {
    layout[parent].num_children = 0;
    slot = 0;
    while(slot < num_children && next_rank < total_ues)
    {
      layout[parent].child[slot] = next_rank;
      layout[next_rank].parent = parent;
      ++(layout[parent].num_children);
      ++slot;
      ++next_rank;
    }
  }
  // hand the caller's own node back
  memcpy(tree, &layout[RCCE_IAM], sizeof(tree_t));

  return(RCCE_SUCCESS);
}

#ifndef GORY
// Tree barrier: gather arrival flags from all children, report to the parent
// and wait for its release, then release the children.
// Always returns RCCE_SUCCESS.
int RCCE_tree_barrier(RCCE_COMM *comm, tree_t *tree)
{
  int c;

  /* Gather: wait for every child's flag and reset it locally */
  c = 0;
  while(c < tree->num_children)
  {
    RCCE_wait_until(RCCE_barrier_flag[tree->child[c]], RCCE_FLAG_SET);
    RCCE_flag_write(&RCCE_barrier_flag[tree->child[c]], RCCE_FLAG_UNSET, RCCE_IAM);
    ++c;
  }

  /* Non-root nodes: announce arrival to the parent, then wait for release */
  if(tree->parent != -1)
  {
    RCCE_flag_write(&RCCE_barrier_flag[RCCE_IAM], RCCE_FLAG_SET, tree->parent);

    RCCE_wait_until(RCCE_barrier_release_flag, RCCE_FLAG_SET);
    RCCE_flag_write(&RCCE_barrier_release_flag, RCCE_FLAG_UNSET, RCCE_IAM);
  }

  /* Release: set the release flag at every child */
  c = 0;
  while(c < tree->num_children)
  {
    RCCE_flag_write(&RCCE_barrier_release_flag, RCCE_FLAG_SET, tree->child[c]);
    ++c;
  }

  return(RCCE_SUCCESS);
}
#endif

// Placeholder: performs no synchronization and reports success.
int RCCE_tournament_barrier(RCCE_COMM *comm)
{
  (void) comm; // not used
  return RCCE_SUCCESS;
}

// Placeholder: performs no synchronization and reports success.
int RCCE_tournament_fixed_barrier(RCCE_COMM *comm)
{
  (void) comm; // not used
  return RCCE_SUCCESS;
}

// Barrier on RCCE_COMM_WORLD based on the atomic increment registers, polling
// with randomized exponential backoff. Any other communicator is forwarded to
// RCCE_barrier().
//
// A static index alternates between register set 0 and 1 on every call. The
// `.counter` register is read once; if the value read is below size-1 the
// caller polls `.init` until it is no longer positive, otherwise the caller
// writes 0 to `.init`.
//
// The backoff window starts at BACKOFF_MIN and doubles after each poll until
// it reaches BACKOFF_MAX, the same rule RCCE_backoff_lock() uses. Bounding
// the exponent on the window (instead of on the randomly drawn wait time)
// keeps the shift from growing without limit.
// Returns RCCE_SUCCESS, or the result of RCCE_barrier() for other
// communicators.
int RCCE_AIR_barrier(RCCE_COMM *comm)
{
  static int idx = 0;
  static unsigned int lcg_state = 0; // state of the linear congruential generator
  int backoff = BACKOFF_MIN, wait, i = 0;

  if (comm != &RCCE_COMM_WORLD)
    return RCCE_barrier(comm);

  if (*RCCE_atomic_inc_regs[idx].counter < (comm->size-1)) 
  {
    while (*RCCE_atomic_inc_regs[idx].init > 0)
    {
      lcg_state = lcg_state * 1103515245u + 12345u;
      wait = BACKOFF_MIN + (lcg_state % (backoff << i));
      RC_wait(wait);
      // widen the window only while it is still below the maximum
      if ((backoff << i) < BACKOFF_MAX) i++;
    }
  }
  else
  {
    *RCCE_atomic_inc_regs[idx].init = 0;
  }
  // use the other register set for the next barrier
  idx = !idx;
  return(RCCE_SUCCESS);
}

// Non-blocking variant of RCCE_AIR_barrier.
// On RCCE_COMM_WORLD the `.counter` register is read once; if the value read
// is below size-1 and `.init` is still positive, the function stores resume
// point 1 in comm->label and returns RCCE_PENDING. A later call with
// comm->label == 1 jumps straight to the `.init` check. Once the barrier is
// passed, the register set index is toggled, comm->label is reset to 0 and
// RCCE_SUCCESS is returned. Other communicators are forwarded to RCCE_barrier().
int RCCE_nb_AIR_barrier(RCCE_COMM *comm)
{
  static int idx = 0;
  static unsigned int rand = 0;
  int backoff = BACKOFF_MIN, wait, i = 0;

  // resume a pending barrier without touching the counter register again
  if(comm->label == 1) goto label1;

  if (comm == &RCCE_COMM_WORLD) {
    if (*RCCE_atomic_inc_regs[idx].counter < (comm->size-1)) 
    {
#if 0 // NO BACKOFF in Non-Blocking case ???
      while (*RCCE_atomic_inc_regs[idx].init > 0)
      {
        rand = rand * 1103515245u + 12345u;
        wait = BACKOFF_MIN + (rand % (backoff << i));
        RC_wait(wait);
        if (wait < BACKOFF_MAX) i++;
      }
#else
    label1:
      // single poll: report PENDING while the release has not happened yet
      if(*RCCE_atomic_inc_regs[idx].init > 0)
      {
	comm->label = 1;
	return RCCE_PENDING;
      }
#endif
    }
    else
    {
      *RCCE_atomic_inc_regs[idx].init = 0;	
    }
    // use the other register set for the next barrier
    idx = !idx;
    comm->label = 0;
    return(RCCE_SUCCESS);
  }
  else
  {
    return RCCE_barrier(comm);
  }
}
#endif

// Acquire a hierarchy of T&S locks: starting with groups of two ranks, the
// caller takes the lock of its group's (leftmost) middle member, then doubles
// the group size and repeats until the group covers the whole communicator.
// Always returns RCCE_SUCCESS.
int RCCE_acquire_treelock(RCCE_COMM* comm) {
  const int shift = 1; // concurrency factor
  const int rank = comm->my_rank;
  int span;
  int node;

  for (span = (1 << shift); ; span <<= shift) {
    // first rank within group + mid of group (leftmost)
    node = ( rank - ( rank % span ) ) + ( ( span - 1 ) >> 1 );

    // spin until this level's lock has been obtained
    while(!Test_and_Set(comm->member[node]));

    // stop once the group spans the communicator (span is a power of two)
    if(span >= comm->size) break;
  }

  return(RCCE_SUCCESS);
}

// Release the T&S locks of the lock hierarchy from the largest group down to
// groups of two ranks. Always returns RCCE_SUCCESS.
int RCCE_release_treelock(RCCE_COMM* comm) {
  const int rank = comm->my_rank;
  int span = comm->size;
  int node;

  // round the communicator size up to the next power of 2
  span--;
  span |= span >> 1;
  span |= span >> 2;
  span |= span >> 4;
  span |= span >> 8;
  span |= span >> 16;
  span++;

  // write 0 to the lock of each level's middle member, halving the group
  // size each time; the body runs at least once
  do {
    node = ( rank - ( rank % span ) ) + ( ( span - 1 ) >> 1 );
    *(virtual_lockaddress[(comm->member[node])]) = 0x0;
    span >>= 1;
  } while(span >= 2);

  return(RCCE_SUCCESS);
}

// Acquire the T&S lock with index ID, using randomized exponential backoff
// between attempts. The backoff window starts at BACKOFF_MIN and doubles
// until it reaches BACKOFF_MAX.
// With LOCKDEBUG enabled the accumulated wait count (saturated at INT_MAX) is
// returned; otherwise RCCE_SUCCESS.
int RCCE_backoff_lock(int ID) {
  int shift = 0;                 // current exponent of the backoff window
  unsigned int total_wait = 0;   // sum of all waits, saturating at INT_MAX

  while (!Test_and_Set(ID)) {
    const int window = BACKOFF_MIN << shift;
    int delay;

    // linear congruential generator on the global seed `next`
    next = ( next * 1103515245 + 12345 ) % ( INT_MAX );

    delay = BACKOFF_MIN + ( next % window );

    total_wait += delay;
    if( total_wait > INT_MAX ) total_wait = INT_MAX;

    RC_wait(delay);
    if ( window < BACKOFF_MAX ) shift++;
  }

#if (LOCKDEBUG)
  return (int)total_wait;
#else
  return(RCCE_SUCCESS);
#endif
}
#endif

//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_acquire_lock
//--------------------------------------------------------------------------------------
// acquire lock corresponding to core with rank ID
//--------------------------------------------------------------------------------------
int RCCE_acquire_lock(int ID) {
#ifdef __hermit__
  // HermitCore: one global ticket lock, ID is not used
  islelock_lock();
  return(RCCE_SUCCESS);
#elif defined(SCC)
  // Semantics of the test&set register: a read returns zero if another core
  // has read it before and no reset has happened since; otherwise the read
  // returns one. We keep reading until the read yields one.
#if (LOCKDEBUG)
  // debug build: report how many failed attempts were needed
  int failed_reads;

  for (failed_reads = 0; !Test_and_Set(ID); ++failed_reads)
    ;
  return failed_reads;
#else
  while (!Test_and_Set(ID))
    ;
  return(RCCE_SUCCESS);
#endif
#else
  // functional emulation: one OpenMP lock per core
  omp_set_lock(&(RCCE_corelock[ID]));
  return(RCCE_SUCCESS);
#endif
}

//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_release_lock
//--------------------------------------------------------------------------------------
// release lock corresponding to core with rank ID
//--------------------------------------------------------------------------------------
int RCCE_release_lock(int ID) {
#if defined(__hermit__)
  // HermitCore: one global ticket lock, ID is not used
  islelock_unlock();
#elif !defined(SCC)
  // functional emulation: one OpenMP lock per core
  omp_unset_lock(&(RCCE_corelock[ID]));
#else
  // semantics of test&set register: a write by _any_ core causes a reset
  *(virtual_lockaddress[ID]) = 0x0;
#endif
  return (RCCE_SUCCESS);
}

//--------------------------------------------------------------------------------------
// FUNCTION: RC_FREQUENCY
//--------------------------------------------------------------------------------------
// return actual core clock frequency (Hz)
//--------------------------------------------------------------------------------------
long long RC_FREQUENCY() {
  // convert the reference clock from GHz to Hz, truncating to an integer
  return (long long) (1.e9 * RC_REFCLOCKGHZ);
}

//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_init
//--------------------------------------------------------------------------------------
// initialize the library and sanitize parameter list
//--------------------------------------------------------------------------------------
// Initialize the library and strip the RCCE-specific arguments from the
// command line.
//
// Expected argument layout after the executable name:
//   [device number]  (only for SCC builds with SCC_COUPLED_SYSTEMS)
//   number of UEs
//   reference clock in GHz (read but ignored on HermitCore)
//   one physical core ID per UE
// On return, *argv has been advanced past these arguments, (*argv)[0] is the
// executable name again and *argc has been reduced accordingly.
//
// The function then determines the caller's rank, maps test&set registers,
// MPB and flag buffers, initializes the MPB and shared-memory allocators,
// creates RCCE_COMM_WORLD, allocates the predefined flags (non-GORY), works
// out the UE-to-device mapping, sets up the atomic increment registers (AIR)
// and finishes with a barrier on RCCE_COMM_WORLD.
//
// Returns RCCE_SUCCESS, or the value of RCCE_error_return() when the calling
// core is not among the listed core IDs or (with power management) when
// RCCE_init_RPC() fails.
int RCCE_init(
  int *argc,   // pointer to argc, passed in from main program
  char ***argv // pointer to argv, passed in from main program
  ) {
  int ue;
#ifdef SCC
  #ifdef SCC_COUPLED_SYSTEMS
    int board;
  #endif
  #ifndef __hermit__
    int x, y, z;
    unsigned int physical_lockaddress;
  #endif
#endif
#ifdef SHMADD
  int i;
  unsigned int RCCE_SHM_BUFFER_offset ,result, rd_slot_nbr, wr_slot_nbr;
#endif
  void *nothing = NULL;

  int verbose_level = 0;
  const char *verbosity_env;

#ifdef __hermit__
  sys_rcce_init(RCCE_SESSION_ID /* id of the session */);
#elif defined(SCC)
  // Copperridge specific initialization...
  InitAPI(0);fflush(0);
#endif

  // save pointer to executable name for later insertion into the argument list
  char *executable_name = (*argv)[0];

  // optional verbosity setting from the environment (looked up once)
  verbosity_env = getenv("MPID_SCC_VERBOSITY_LEVEL");
  if(verbosity_env != NULL)
  {
    verbose_level = atoi(verbosity_env);
  }

#ifdef __hermit__
  RCCE_DEVICE_NR      = 0;
#elif defined(SCC) && defined(SCC_COUPLED_SYSTEMS)
  RCCE_DEVICE_NR      = atoi(*(++(*argv)));
#else
  RCCE_DEVICE_NR      = 0;
#endif

  RCCE_NP        = atoi(*(++(*argv)));
#ifdef __hermit__
  // HermitCore ignores the third argument and uses
  // its own clock value
  RC_REFCLOCKGHZ = (double) get_cpufreq() / 1000.0;
  ++(*argv);
#else
  RC_REFCLOCKGHZ = atof(*(++(*argv)));
#endif

  // put the participating core ids (unsorted) into an array
  for (ue=0; ue<RCCE_NP; ue++) {
    RC_COREID[ue] = atoi(*(++(*argv)));
  }

#ifndef SCC
  // if using the functional emulator, must make sure to have read all command line
  // parameters up to now before overwriting (shifted) first one with executable
  // name; even though argv is made firstprivate, that applies only the pointer to
  // the arguments, not the actual data
  #pragma omp barrier
#endif
  // make sure executable name is as expected
  (*argv)[0] = executable_name;

  RC_MY_COREID = MYCOREID();

  // seed of the pseudo-random generator used by RCCE_backoff_lock()
  next = RC_MY_COREID;

  // adjust apparent number of command line arguments, so it will appear to main
  // program that number of UEs, clock frequency, and core ID list were not on
  // command line
#ifndef SCC_COUPLED_SYSTEMS
  *argc -= RCCE_NP + 2;
#else
  *argc -= RCCE_NP + 3;
#endif

  if(RCCE_NP == 1) {
    RCCE_IAM = 0;
  }
  else {

    // sort array of participating physical core IDs to determine their ranks
    RCCE_qsort((char *)RC_COREID, RCCE_NP, sizeof(int), id_compare);

    // determine rank of calling core
    for (ue=0; ue<RCCE_NP; ue++) {
      if (RC_COREID[ue] == RC_MY_COREID) RCCE_IAM = ue;
    }
  }

#ifdef SHMADD
//   printf("Using SHMADD\n");
     RCCE_SHM_BUFFER_offset     = 0x00;
//   RCCE_SHM_BUFFER_offset     = 0x3FFFF80;
//   RCCE_SHM_BUFFER_offset   = 0x4000000;
//   RCCE_SHM_BUFFER_offset   = 0x181000;
   // copy 60 LUT entries, each decremented by one, to the slot four positions up
   rd_slot_nbr=0x80;
   for(i=0; i<60; i++) {
     result  = readLUT(rd_slot_nbr);
     result -= 1;
     wr_slot_nbr = rd_slot_nbr + 4;
     writeLUT(wr_slot_nbr,result);
     rd_slot_nbr++;
   }
#endif

  // leave in one reassuring debug print
  if (DEBUG) {
    printf("My rank is %d, physical core ID is %d\n", RCCE_IAM, RC_MY_COREID);
    fflush(0);
  }

  // RCCE_IAM keeps its initial value -1 if the own core ID was not listed
  if (RCCE_IAM<0)
    return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_CORE_NOT_IN_HOSTFILE));

#if defined(SCC)
  // compute and memory map addresses of test&set registers for all participating cores
  for (ue=0; ue<RCCE_NP; ue++) {
#ifdef __hermit__
    virtual_lockaddress[ue] = (t_vcharp) ((size_t)rcce_lock + (ue+1) * RCCE_LINE_SIZE);
#else
    z = Z_PID(RC_COREID[ue]);
    x = X_PID(RC_COREID[ue]);
    y = Y_PID(RC_COREID[ue]);
#ifndef SCC_COUPLED_SYSTEMS
    physical_lockaddress = CRB_ADDR(x,y) + (z==0 ? LOCK0 : LOCK1);
#else
    physical_lockaddress = CRB_ADDR(x, y, RC_COREID[ue] / RCCE_MAXNP_PER_BOARD, RCCE_DEVICE_NR) + (z==0 ? LOCK0 : LOCK1);
#endif
    virtual_lockaddress[ue] = (t_vcharp) MallocConfigReg(physical_lockaddress);
#endif
  }
#endif

  // initialize MPB starting addresses for all participating cores; allow one
  // dummy cache line at front of MPB for fooling write combine buffer in case
  // of single-byte MPB access
#ifndef __hermit__
  RCCE_fool_write_combine_buffer = RC_COMM_BUFFER_START(RCCE_IAM);
#endif

  for (ue=0; ue<RCCE_NP; ue++)
    RCCE_comm_buffer[ue] = RC_COMM_BUFFER_START(ue) + RCCE_LINE_SIZE;

  // gross MPB size is set equal to maximum
  RCCE_BUFF_SIZE = RCCE_BUFF_SIZE_MAX - RCCE_LINE_SIZE;

#ifndef __hermit__
//#ifdef USE_FLAG_EXPERIMENTAL
  for (ue=0; ue<RCCE_NP; ue++) {
    RCCE_flag_buffer[ue] = RC_FLAG_BUFFER_START(ue) + RCCE_LINE_SIZE;
  }
//#endif
#endif

#ifdef RC_POWER_MANAGEMENT
#ifndef SCC
  // always store RPC queue data structure at beginning of MPB, so allocatable
  // storage needs to skip it. Only need to do this for functional emulator
  for (ue=0; ue<RCCE_NP; ue++) {
//#ifdef USE_FLAG_EXPERIMENTAL
    RCCE_flag_buffer[ue] += REGULATOR_LENGTH;
//#endif
    RCCE_comm_buffer[ue] += REGULATOR_LENGTH;
  }
  RCCE_BUFF_SIZE -= REGULATOR_LENGTH;
#endif
#endif

  // initialize RCCE_malloc
  RCCE_malloc_init(RCCE_comm_buffer[RCCE_IAM],RCCE_BUFF_SIZE);

#ifndef __hermit__
#ifdef SHMADD

  RCCE_shmalloc_init(RC_SHM_BUFFER_START()+RCCE_SHM_BUFFER_offset ,RCCE_SHM_SIZE_MAX);
#ifdef SHMDBG
  printf("\n%d:%s:%d: RCCE_SHM_BUFFER_offset, RCCE_SHM_SIZE_MAX: % x %x\n", RCCE_IAM, 
    __FILE__,__LINE__,RCCE_SHM_BUFFER_offset ,RCCE_SHM_SIZE_MAX);
#endif
#else

#ifndef SCC_COUPLED_SYSTEMS
  RCCE_shmalloc_init(RC_SHM_BUFFER_START(),RCCE_SHM_SIZE_MAX);
#else
  for(board=RCCE_MAX_BOARDS-1; board>=0; board--)
    RCCE_shmalloc_init(RC_SHM_BUFFER_START(board),RCCE_SHM_SIZE_MAX/RCCE_MAX_BOARDS);
#endif
#endif
#endif

  // create global communicator (equivalent of MPI_COMM_WORLD); this will also allocate
  // the two synchronization flags associated with the global barrier
  RCCE_comm_split(RCCE_global_color, nothing, &RCCE_COMM_WORLD);

  // if power management is enabled, initialize more stuff; this includes two more
  // communicators (for voltage and frequency domains), plus two synchronization flags
  // associated with the barrier for each communicator
#ifdef RC_POWER_MANAGEMENT
  int error;
  if (error=RCCE_init_RPC(RC_COREID, RCCE_IAM, RCCE_NP)) 
       return(RCCE_error_return(RCCE_debug_RPC,error));
#endif

#ifndef GORY
  // if we use the simplified API, we need to define more flags upfront
  for (ue=0; ue<RCCE_NP; ue++) {
    RCCE_flag_alloc(&RCCE_sent_flag[ue]);
    RCCE_flag_alloc(&RCCE_ready_flag[ue]);
#ifdef USE_PIPELINE_FLAGS
    RCCE_flag_alloc(&RCCE_sent_flag_pipe[ue]);
    RCCE_flag_alloc(&RCCE_ready_flag_pipe[ue]);
#endif
#ifdef USE_PROBE_FLAGS
    RCCE_flag_alloc(&RCCE_probe_flag[ue]);
#endif
    RCCE_flag_alloc(&RCCE_barrier_flag[ue]);
  }
    RCCE_flag_alloc(&RCCE_barrier_release_flag);

  // start with empty request queues
#ifndef USE_REMOTE_PUT_LOCAL_GET
  RCCE_send_queue = NULL;
  for (ue=0; ue<RCCE_NP; ue++) {
    RCCE_recv_queue[ue] = NULL;
  }
#else
  RCCE_recv_queue = NULL;
  for (ue=0; ue<RCCE_NP; ue++) {
    RCCE_send_queue[ue] = NULL;
  }
#endif

#endif

#if defined(SCC) && defined(SCC_COUPLED_SYSTEMS)
  // Coupled systems: determine the number of devices and UEs per device.
  // Every rank sends its device number to its successor; every rank but 0
  // reports to rank 0 either its own rank (its device differs from the
  // predecessor's, i.e. a device boundary) or -1.
  int tmp, dev;
  if(RCCE_NP > 1) {
    if(RCCE_IAM != RCCE_NP-1) {
      RCCE_send((char*)&RCCE_DEVICE_NR, sizeof(int), RCCE_IAM+1);
    }
    if(RCCE_IAM != 0) {
      RCCE_recv((char*)&tmp, sizeof(int), RCCE_IAM-1);
      if(tmp != RCCE_DEVICE_NR) tmp = RCCE_IAM;
      else tmp = -1;
      RCCE_send((char*)&tmp, sizeof(int), 0);
    }
    else
    {
      // rank 0 turns the reported boundaries into per-device UE counts
      RCCE_NUM_DEVICES = 0;
      for(ue=1; ue<RCCE_NP; ue++) {
	RCCE_recv((char*)&tmp, sizeof(int), ue);
	if(tmp != -1) {
	  if(RCCE_NUM_DEVICES == 0)
	    RCCE_NUM_UES_DEVICE[0] = tmp;
	  else
	    RCCE_NUM_UES_DEVICE[RCCE_NUM_DEVICES] = tmp - RCCE_NUM_UES_DEVICE[RCCE_NUM_DEVICES-1];
	  RCCE_NUM_DEVICES++;
	}
      }
      RCCE_NUM_DEVICES++;
      for(dev=0, tmp=0; dev<RCCE_NUM_DEVICES; dev++)
	tmp += RCCE_NUM_UES_DEVICE[dev];
      RCCE_NUM_UES_DEVICE[RCCE_NUM_DEVICES-1] = RCCE_NP - tmp;
    }
    RCCE_bcast((char*)&RCCE_NUM_DEVICES, sizeof(int), 0, RCCE_COMM_WORLD);
    RCCE_bcast((char*)&RCCE_NUM_UES_DEVICE, RCCE_MAX_BOARDS * sizeof(int), 0, RCCE_COMM_WORLD);

    // derive the device of every UE and the caller's device-local rank
    for(ue=0; ue<RCCE_NP; ue++) {
      for(dev=0, tmp=0; dev<RCCE_NUM_DEVICES; dev++)
      {
	if(ue == RCCE_IAM) RCCE_DEVICE_LOCAL_UE = RCCE_IAM - tmp;
	tmp += RCCE_NUM_UES_DEVICE[dev];
	if(ue < tmp){
	  RCCE_UE_TO_DEVICE[ue] = dev;
	  //printf("(%d) RCCE_UE_TO_DEVICE[%d] = %d\n", RCCE_IAM, ue, dev);
	  break;
	}
      }
    }
    //printf("(%d) RCCE_DEVICE_LOCAL_UE = %d\n", RCCE_IAM, RCCE_DEVICE_LOCAL_UE);
  }
  else
#endif
  {
    // single device: all UEs belong to device 0
    RCCE_NUM_DEVICES = 1;
    RCCE_NUM_UES_DEVICE[0] = RCCE_NP;
    RCCE_DEVICE_LOCAL_UE = RCCE_IAM;
    for(ue=0; ue<RCCE_NP; ue++) RCCE_UE_TO_DEVICE[ue] = 0;
  }

#ifdef AIR
  {
    // loop index local to this block, so AIR builds do not depend on the
    // declaration that only exists when SHMADD is defined
    int i;
    int * air_base = (int *) MallocConfigReg(FPGA_BASE + 0xE000);

    // Assign and Initialize First Set of Atomic Increment Registers
    for (i = 0; i < RCCE_MAXNP; i++)
    {
      RCCE_atomic_inc_regs[i].counter = air_base + 2*i;
      RCCE_atomic_inc_regs[i].init = air_base + 2*i + 1;
      if(RCCE_IAM == 0)
	*RCCE_atomic_inc_regs[i].init = 0;
    }
    // Assign and Initialize Second Set of Atomic Increment Registers
    air_base = (int *) MallocConfigReg(FPGA_BASE + 0xF000);
    for (i = 0; i < RCCE_MAXNP; i++)
    {
      RCCE_atomic_inc_regs[RCCE_MAXNP+i].counter = air_base + 2*i;
      RCCE_atomic_inc_regs[RCCE_MAXNP+i].init = air_base + 2*i + 1;
      if(RCCE_IAM == 0)
	*RCCE_atomic_inc_regs[RCCE_MAXNP+i].init = 0;
    }
  }
#endif

#ifndef GORY
  if( (RCCE_IAM == 0) && (verbose_level > 1) )
  {
    printf("### %s: Remaining MPB space for communication: %zd Bytes per core\n", executable_name, RCCE_chunk); fflush(stdout);
  }
#endif

  // make sure every UE has finished initialization before returning
  RCCE_barrier(&RCCE_COMM_WORLD);

  return (RCCE_SUCCESS);
}

//--------------------------------------------------------------------------------------
// FUNCTION:  RCCE_finalize
//--------------------------------------------------------------------------------------
// clean up at end of library usage (memory unmapping) and resetting of memory and
// registers
//--------------------------------------------------------------------------------------
int RCCE_finalize(void){

#ifdef SCC
#ifndef __hermit__
  int ue;  // index used to walk over all test&set register mappings
#endif

  // synchronize all UEs of RCCE_COMM_WORLD before anything is torn down
  RCCE_barrier(&RCCE_COMM_WORLD);

  // NOTE: clearing of the own MPB is intentionally NOT done here. The code that
  // used to zero RCCE_BUFF_SIZE_MAX bytes starting at RCCE_comm_buffer[ue] and
  // then MPBunalloc that buffer was disabled, because that address is the start
  // of the payload area and not the start of the comm buffer.
#ifndef __hermit__
  // each UE resets its own test&set register
  RCCE_release_lock(RCCE_IAM);
  // each core needs to unmap all special memory locations
  for (ue=0; ue<RCCE_NP; ue++) {
    FreeConfigReg((int *)(virtual_lockaddress[ue]));
  }
#else
  // under HermitCore the kernel is asked to close the session instead
  sys_rcce_fini(RCCE_SESSION_ID /* id of the session */);
#endif
  // flush all open output streams so no buffered output is lost
  fflush(NULL);
#endif
  // without SCC support there is nothing to clean up
  return (RCCE_SUCCESS);
}

//--------------------------------------------------------------------------------------
// FUNCTION:  RCCE_wtime
//--------------------------------------------------------------------------------------
// return wall-clock time in seconds (time stamp counter on SCC, OpenMP timer otherwise)
//--------------------------------------------------------------------------------------
double RCCE_wtime(void) {
#ifndef SCC
  // no SCC hardware available: delegate to the OpenMP wall clock
  return (omp_get_wtime());
#else
  // SCC build: scale the time stamp counter by the reference clock,
  // which is given in GHz, to obtain seconds
  return ( ((double)_rdtsc())/(RC_REFCLOCKGHZ*1.e9));
#endif
}

//--------------------------------------------------------------------------------------
// FUNCTION:  RCCE_ue
//--------------------------------------------------------------------------------------
// return rank of calling core
//--------------------------------------------------------------------------------------
int RCCE_ue(void)
{
  // RCCE_IAM is the library-wide variable holding the rank of the caller
  return RCCE_IAM;
}

//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_num_ues
//--------------------------------------------------------------------------------------
// return total number of participating UEs              
//--------------------------------------------------------------------------------------
int RCCE_num_ues(void)
{
  // RCCE_NP is the library-wide count of participating UEs
  return RCCE_NP;
}

#ifdef SCC_COUPLED_SYSTEMS
//--------------------------------------------------------------------------------------
// FUNCTIONS:  RCCE_dev, RCCE_num_dev, RCCE_num_ues_dev, RCCE_ue_to_dev, RCCE_dev_ue
//--------------------------------------------------------------------------------------
// returning ID of own device, total number of devices and number of UEs per device
//--------------------------------------------------------------------------------------
int RCCE_dev(void)
{
  // number of the device the calling UE runs on
  return RCCE_DEVICE_NR;
}
int RCCE_num_dev(void)
{
  // total number of devices as determined during initialization
  return RCCE_NUM_DEVICES;
}
int RCCE_num_ues_dev(int ue)
{
  // NOTE(review): RCCE_NUM_UES_DEVICE is filled per device during
  // initialization, so despite its name the argument appears to be a device
  // index rather than a UE rank -- confirm against callers.
  // No range check is performed on the index.
  return RCCE_NUM_UES_DEVICE[ue];
}
int RCCE_ue_to_dev(int ue)
{
  // look up the device that hosts the given UE; the index is not range-checked
  return RCCE_UE_TO_DEVICE[ue];
}
int RCCE_dev_ue(void)
{
  // rank of the calling UE relative to the first UE of its own device
  return RCCE_DEVICE_LOCAL_UE;
}
#endif

#ifdef SHMADD
//--------------------------------------------------------------------------------------
// FUNCTION: writeLUT
//--------------------------------------------------------------------------------------
// Writes 'value' into the lookup table (LUT) configuration register that
// belongs to slot 'lutSlot' of the calling core.
//   lutSlot: index of the LUT entry; entries are spaced 8 bytes apart
//   value:   word to store in that entry
// The register page is mapped through /dev/rckncm for the duration of the
// call only. The process is terminated via exit(-1) if the device cannot be
// opened or mapped.
void writeLUT(unsigned int lutSlot, unsigned int value) {

  int page_size, NCMDeviceFD;
  // NCMDeviceFD is the file descriptor for non-cacheable memory (e.g. config regs).

  t_vcharp     MappedAddr;
  unsigned int myCoreID, alignedAddr, pageOffset, ConfigAddr;

  // core ID 1 selects the LUT1 register bank, every other ID selects LUT0
  myCoreID = getCOREID();
  if (myCoreID == 1)
    ConfigAddr = CRB_OWN+LUT1 + (lutSlot*0x08);
  else
    ConfigAddr = CRB_OWN+LUT0 + (lutSlot*0x08);

  page_size = getpagesize();

  if ((NCMDeviceFD = open("/dev/rckncm", O_RDWR|O_SYNC)) < 0) {
    perror("open"); exit(-1);
  }

  // mmap requires a page-aligned offset: map the page that contains the
  // register and remember the register's position inside that page
  alignedAddr = ConfigAddr & (~(page_size-1));
  pageOffset  = ConfigAddr - alignedAddr;

  MappedAddr = (t_vcharp) mmap(NULL, page_size, PROT_WRITE|PROT_READ,
                               MAP_SHARED, NCMDeviceFD, alignedAddr);

  if (MappedAddr == MAP_FAILED) {
    perror("mmap"); exit(-1);
  }

  // the mapping stays valid after the descriptor is closed; closing it here
  // avoids leaking one descriptor per call
  close(NCMDeviceFD);

  *(int*)(MappedAddr+pageOffset) = value;
  munmap((void*)MappedAddr, page_size);

}

//--------------------------------------------------------------------------------------
// FUNCTION: readLUT
//--------------------------------------------------------------------------------------
// Reads the lookup table (LUT) configuration register that belongs to slot
// 'lutSlot' of the calling core and returns its content.
//   lutSlot: index of the LUT entry; entries are spaced 8 bytes apart
// The register page is mapped through /dev/rckncm for the duration of the
// call only. The process is terminated via exit(-1) if the device cannot be
// opened or mapped.
unsigned int readLUT(unsigned int lutSlot) {

  int page_size, NCMDeviceFD;
  // NCMDeviceFD is the file descriptor for non-cacheable memory (e.g. config regs).

  unsigned int result;
  t_vcharp     MappedAddr;
  unsigned int myCoreID, alignedAddr, pageOffset, ConfigAddr;

  // core ID 1 selects the LUT1 register bank, every other ID selects LUT0
  myCoreID = getCOREID();
  if (myCoreID == 1)
    ConfigAddr = CRB_OWN+LUT1 + (lutSlot*0x08);
  else
    ConfigAddr = CRB_OWN+LUT0 + (lutSlot*0x08);

  page_size = getpagesize();

  if ((NCMDeviceFD = open("/dev/rckncm", O_RDWR|O_SYNC)) < 0) {
    perror("open"); exit(-1);
  }

  // mmap requires a page-aligned offset: map the page that contains the
  // register and remember the register's position inside that page
  alignedAddr = ConfigAddr & (~(page_size-1));
  pageOffset  = ConfigAddr - alignedAddr;

  MappedAddr = (t_vcharp) mmap(NULL, page_size, PROT_WRITE|PROT_READ,
                               MAP_SHARED, NCMDeviceFD, alignedAddr);

  if (MappedAddr == MAP_FAILED) {
    perror("mmap"); exit(-1);
  }

  // the mapping stays valid after the descriptor is closed; closing it here
  // avoids leaking one descriptor per call
  close(NCMDeviceFD);

  result = *(unsigned int*)(MappedAddr+pageOffset);
  munmap((void*)MappedAddr, page_size);

  return result;
}


//--------------------------------------------------------------------------------------
// FUNCTION: getCOREID
//--------------------------------------------------------------------------------------
// Returns the ID of the calling core, taken from the lowest three bits of the
// MYTILEID configuration register.
// The register page is mapped through /dev/rckncm for the duration of the
// call only. The process is terminated via exit(-1) if the device cannot be
// opened or mapped.
unsigned int getCOREID() {

  int page_size, NCMDeviceFD;
  // NCMDeviceFD is the file descriptor for non-cacheable memory (e.g. config regs).

  t_vcharp     MappedAddr;
  unsigned int coreID, result, alignedAddr, pageOffset, ConfigAddr;
  unsigned int coreID_mask = 0x00000007;  // keeps only the core ID bits

  ConfigAddr = CRB_OWN+MYTILEID;
  page_size  = getpagesize();

  if ((NCMDeviceFD = open("/dev/rckncm", O_RDWR|O_SYNC)) < 0) {
    perror("open"); exit(-1);
  }

  // mmap requires a page-aligned offset: map the page that contains the
  // register and remember the register's position inside that page
  alignedAddr = ConfigAddr & (~(page_size-1));
  pageOffset  = ConfigAddr - alignedAddr;

  MappedAddr = (t_vcharp) mmap(NULL, page_size, PROT_WRITE|PROT_READ,
                               MAP_SHARED, NCMDeviceFD, alignedAddr);

  if (MappedAddr == MAP_FAILED) {
    perror("mmap"); exit(-1);
  }

  // the mapping stays valid after the descriptor is closed; closing it here
  // avoids leaking one descriptor per call
  close(NCMDeviceFD);

  result = *(unsigned int*)(MappedAddr+pageOffset);
  munmap((void*)MappedAddr, page_size);

  coreID = result & coreID_mask;
  return coreID;
}
#endif