//*************************************************************************************** // Administrative routines. //*************************************************************************************** // // Author: Rob F. Van der Wijngaart // Intel Corporation // Date: 008/30/2010 // //*************************************************************************************** // // // Copyright 2010 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #include "RCCE_lib.h" #ifdef RC_POWER_MANAGEMENT #include "RCCE_lib_pwr.h" #endif #ifdef COPPERRIDGE #ifndef SCC #define SCC #endif #endif #ifdef SCC #include #include #include #include #ifndef __hermit__ #include #include "SCC_API.h" #else #define RCCE_SESSION_ID 42 #include "syscall.h" extern unsigned int get_cpufreq(); #endif #endif #include #include #include // En-/ or disable debug prints... 
#define DEBUG 1
#define LOCKDEBUG 1
#undef  SHMDBG

#ifdef __hermit__
// Atomically exchange 1 into *lock and return the previous byte value.
// A non-zero return means the lock was already held by someone else.
static inline int tas(t_vcharp lock)
{
  register unsigned char _res = 1;
  // The memory operand is both read and written by xchgb, so it must be
  // declared read-write ("+m"); the previous write-only ("=m") constraint
  // mis-described the instruction and allowed the compiler to treat the
  // old contents as dead.  The "lock" prefix is implicit for xchg with a
  // memory operand, but is kept for clarity.
  asm volatile(
       "lock; xchgb %0,%1"
       : "+q"(_res), "+m"(*lock));
  return (int) _res;
}
#define Test_and_Set(a) tas(virtual_lockaddress[a])
#elif defined(SCC)
// Test and Set method: reading the SCC T&S register returns its old LSB
// and sets it as a side effect of the read.
#define Test_and_Set(a) ((*(virtual_lockaddress[a])) & 0x01)
#endif

// Bounds (in RC_wait ticks) for the randomized exponential backoff used
// by the lock and barrier implementations below.
#define BACKOFF_MIN 8
#define BACKOFF_MAX 256

#ifdef __hermit__
// Ticket lock shared between the cores of an isle.
typedef struct islelock {
  // Internal queue (next ticket to hand out)
  int32_t queue;
  // Internal dequeue (ticket currently being served)
  int32_t dequeue;
} islelock_t;

extern islelock_t* rcce_lock;

/*
 * Use our own implementation of "atomic_add_return" to guarantee
 * that the lock prefix is used.
 */
inline static int _hermit_atomic_add(int32_t *d, int i)
{
  int res = i;
  // xaddl exchanges %0 with *d and adds: both operands are read-write.
  // The previous version declared *d as a plain input ("m"), which
  // mis-described the instruction's effect on memory.
  asm volatile("lock; xaddl %0, %1"
               : "+r"(i), "+m"(*d)
               :
               : "memory", "cc");
  // i now holds the old value of *d; return old + increment.
  return res+i;
}

// Acquire the isle-wide ticket lock: draw a ticket, then spin until the
// dequeue counter reaches it.
static inline int islelock_lock(void)
{
  int ticket;

  ticket = _hermit_atomic_add(&rcce_lock->queue, 1);
  while(rcce_lock->dequeue != ticket) {
    asm volatile ("pause");  // be polite to the other hyper-thread while spinning
  }

  return 0;
}

// Release the ticket lock by admitting the next waiter.
static inline int islelock_unlock(void)
{
  _hermit_atomic_add(&rcce_lock->dequeue, 1);
  return 0;
}
#endif

//......................................................................................
// GLOBAL VARIABLES USED BY THE LIBRARY
//......................................................................................
unsigned int next; int RCCE_NP; // number of participating cores int RCCE_DEVICE_NR; // device number of the scc board int RCCE_NUM_DEVICES; // total number of scc boards involved int RCCE_NUM_UES_DEVICE[RCCE_MAX_BOARDS]; // number of participating cores per board int RCCE_UE_TO_DEVICE[RCCE_MAXNP]; // device id of each core int RCCE_DEVICE_LOCAL_UE; // device-local core id double RC_REFCLOCKGHZ; // baseline CPU frequency (GHz) int RC_MY_COREID; // physical ID of calling core int RC_COREID[RCCE_MAXNP]; // array of physical core IDs for all participating // cores, sorted by rank int RCCE_IAM=-1; // rank of calling core (invalid by default) RCCE_COMM RCCE_COMM_WORLD; // predefined global communicator int RCCE_BUFF_SIZE; // available MPB size t_vcharp RCCE_comm_buffer[RCCE_MAXNP]; // starts of MPB, sorted by rank #ifndef __hermit__ //#ifdef USE_FLAG_EXPERIMENTAL t_vcharp RCCE_flag_buffer[RCCE_MAXNP]; //#endif #endif #ifndef GORY // ......................... non-GORY communication mode ............................. 
// synchronization flags are predefined and maintained by the library RCCE_FLAG RCCE_sent_flag[RCCE_MAXNP], RCCE_ready_flag[RCCE_MAXNP]; #ifdef USE_PIPELINE_FLAGS RCCE_FLAG RCCE_sent_flag_pipe[RCCE_MAXNP], RCCE_ready_flag_pipe[RCCE_MAXNP]; #endif #ifdef USE_PROBE_FLAGS RCCE_FLAG RCCE_probe_flag[RCCE_MAXNP]; #endif RCCE_FLAG RCCE_barrier_flag[RCCE_MAXNP]; RCCE_FLAG RCCE_barrier_release_flag; // payload part of the MPBs starts at a specific address, not malloced space t_vcharp RCCE_buff_ptr; // maximum chunk size of message payload is also specified size_t RCCE_chunk; // synchronization flags will be allocated at this address t_vcharp RCCE_flags_start; #ifndef USE_REMOTE_PUT_LOCAL_GET // send request queue RCCE_SEND_REQUEST* RCCE_send_queue; // recv request queue RCCE_RECV_REQUEST* RCCE_recv_queue[RCCE_MAXNP]; #else // send request queue RCCE_SEND_REQUEST* RCCE_send_queue[RCCE_MAXNP]; // recv request queue RCCE_RECV_REQUEST* RCCE_recv_queue; #endif #endif // !GORY #ifndef __hermit__ t_vcharp RCCE_fool_write_combine_buffer; #endif // int air_counter = 0; #ifdef SCC // virtual addresses of test&set registers t_vcharp virtual_lockaddress[RCCE_MAXNP]; #endif //...................................................................................... // END GLOBAL VARIABLES USED BY THE LIBRARY //...................................................................................... 
#ifdef SCC #ifdef __hermit__ inline volatile uint64_t _rdtsc() { register uint64_t lo, hi; asm volatile ("rdtsc" : "=a"(lo), "=d"(hi) ); return ((uint64_t)hi << 32ULL | (uint64_t)lo); } #elif defined(__INTEL_COMPILER) inline volatile long long _rdtsc() { register long long TSC __asm__("eax"); __asm__ volatile (".byte 15, 49" : : : "eax", "edx"); return TSC; } #endif #endif //-------------------------------------------------------------------------------------- // FUNCTION: RC_cache_invalidate //-------------------------------------------------------------------------------------- // invalidate (not flush!) lines in L1 that map to MPB lines //-------------------------------------------------------------------------------------- #ifndef __hermit__ void RC_cache_invalidate() { #ifdef SCC __asm__ volatile ( ".byte 0x0f; .byte 0x0a;\n" ); // CL1FLUSHMB #endif return; } #endif static inline void RC_wait(int wait) { #ifdef __hermit__ asm volatile( "movq %%rax, %%rcx\n\t" "L1: nop\n\t" "loop L1" : /* no output registers */ : "a" (wait) : "%rcx" ); #else asm volatile( "movl %%eax,%%ecx\n\t" "L1: nop\n\t" "loop L1" : /* no output registers */ : "a" (wait) : "%ecx" ); return; #endif } //-------------------------------------------------------------------------------------- // FUNCTION: RC_COMM_BUFFER_SIZE //-------------------------------------------------------------------------------------- // return total available MPB size on chip //-------------------------------------------------------------------------------------- int RC_COMM_BUFFER_SIZE() { return RCCE_BUFF_SIZE_MAX*RCCE_MAXNP; } //-------------------------------------------------------------------------------------- // FUNCTION: RC_COMM_BUFFER_START //-------------------------------------------------------------------------------------- // return (virtual) start address of MPB for UE with rank ue //-------------------------------------------------------------------------------------- t_vcharp RC_COMM_BUFFER_START(int 
ue){ #ifdef __hermit__ t_vcharp retval; retval = (t_vcharp) sys_rcce_malloc(RCCE_SESSION_ID, RC_COREID[ue]); if (!retval) { fprintf(stderr, "rcce_malloc failed\n"); RCCE_finalize(); exit(1); } return retval; #elif defined(SCC) // "Allocate" MPB, using memory mapping of physical addresses t_vcharp retval; #ifndef SCC_COUPLED_SYSTEMS MPBalloc(&retval, X_PID(RC_COREID[ue]), Y_PID(RC_COREID[ue]), Z_PID(RC_COREID[ue]), (X_PID(RC_COREID[ue]) == X_PID(RC_COREID[RCCE_IAM])) && (Y_PID(RC_COREID[ue]) == Y_PID(RC_COREID[RCCE_IAM])) ); #else MPBalloc(&retval, X_PID(RC_COREID[ue]), Y_PID(RC_COREID[ue]), Z_PID(RC_COREID[ue]), RC_COREID[ue] / RCCE_MAXNP_PER_BOARD, RCCE_DEVICE_NR, (X_PID(RC_COREID[ue]) == X_PID(RC_COREID[RCCE_IAM])) && (Y_PID(RC_COREID[ue]) == Y_PID(RC_COREID[RCCE_IAM])) ); #endif return retval; #else // even in functional emulation mode we leave gaps in the global MPB return RC_comm_buffer + RC_COREID[ue]*RC_COMM_BUFFER_SIZE()/RCCE_MAXNP; #endif } #ifndef __hermit__ //#ifdef USE_FLAG_EXPERIMENTAL t_vcharp RC_FLAG_BUFFER_START(int ue){ // "Allocate" MPB, using memory mapping of physical addresses t_vcharp retval; #if SCC_COUPLED_SYSTEMS FLAGalloc(&retval, X_PID(RC_COREID[ue]), Y_PID(RC_COREID[ue]), Z_PID(RC_COREID[ue]), RC_COREID[ue] / RCCE_MAXNP_PER_BOARD, RCCE_DEVICE_NR, (X_PID(RC_COREID[ue]) == X_PID(RC_COREID[RCCE_IAM])) && (Y_PID(RC_COREID[ue]) == Y_PID(RC_COREID[RCCE_IAM])) ); #else FLAGalloc(&retval, X_PID(RC_COREID[ue]), Y_PID(RC_COREID[ue]), Z_PID(RC_COREID[ue]),(X_PID(RC_COREID[ue]) == X_PID(RC_COREID[RCCE_IAM])) && (Y_PID(RC_COREID[ue]) == Y_PID(RC_COREID[RCCE_IAM])) ); #endif return retval; } //#endif #endif //-------------------------------------------------------------------------------------- // FUNCTION: RC_SHM_BUFFER_START //-------------------------------------------------------------------------------------- // return (virtual) start address of off-chip shared memory 
//-------------------------------------------------------------------------------------- #ifndef __hermit__ #ifndef SCC_COUPLED_SYSTEMS t_vcharp RC_SHM_BUFFER_START(){ #ifdef SCC t_vcharp retval; SHMalloc(&retval); //SHMalloc() is in SCC_API.c return retval; #else return RC_shm_buffer; #endif } #else t_vcharp RC_SHM_BUFFER_START(int device){ t_vcharp retval; if (device == RCCE_DEVICE_NR) SHMalloc(&retval); else RMalloc(&retval, device); return retval; } #endif #endif extern int isle_id(void); //-------------------------------------------------------------------------------------- // FUNCTION: MYCOREID //-------------------------------------------------------------------------------------- // return physical core ID of calling core //-------------------------------------------------------------------------------------- int MYCOREID() { #ifdef __hermit__ return isle_id(); #elif defined(SCC) int tmp, x, y, z; tmp=ReadConfigReg(CRB_OWN+MYTILEID); x=(tmp>>3) & 0x0f; // bits 06:03 y=(tmp>>7) & 0x0f; // bits 10:07 z=(tmp ) & 0x07; // bits 02:00 #ifndef SCC_COUPLED_SYSTEMS return ( ( x + ( 6 * y ) ) * 2 ) + z; // True Processor ID! #else return ( ( x + ( 6 * y ) ) * 2 ) + z + RCCE_MAXNP_PER_BOARD * RCCE_DEVICE_NR; // True Processor ID! #endif #else // the COREIDs are read into the main program in potentially random order. // Each core can access its own Core ID. 
We simulate that by selecting // the value in the list of coreids that corresponds to the sequence // number of the OpenMP thread number return RC_COREID[omp_get_thread_num()]; #endif // SCC } #if defined(SCC) //-------------------------------------------------------------------------------------- // FUNCTIONS: Locksuite for test-purpose //-------------------------------------------------------------------------------------- // acquire lock corresponding to core with rank ID //-------------------------------------------------------------------------------------- int RCCE_try_lock(int ID) { if (Test_and_Set(ID)) return(RCCE_SUCCESS); return(RCCE_PENDING); } int RCCE_TNS_barrier(RCCE_COMM* comm) { // two roundtrips to realize a barrier using a T&S Register for each core. // 1. search first free T&S Register to spin // 2. last waiter wakes up first waiter and continues local wait // 3. first waiter wakes up second waiter by releasing its lock ... // At least every used T&S Register is 0 and no UE can overtake a barrier. int num = comm->size; int step = 0; //fprintf(stderr,"%d:\t enter barrier \n",id); while( !Test_and_Set(step) ) ++step; // only one UE runs until T&S # num-1 //fprintf(stderr,"%d:\t step %d\n",id,step); if(step == num-1) { //fprintf(stderr,"%d:\t I am the last one\n",id); *(virtual_lockaddress[0]) = 0x0; while(!Test_and_Set(step)); *(virtual_lockaddress[step]) = 0x0; } else { while(!Test_and_Set(step)); *(virtual_lockaddress[step]) = 0x0; *(virtual_lockaddress[step+1]) = 0x0; } //fprintf(stderr,"released barrier! step: %d\n", step); return RCCE_SUCCESS; } int RCCE_nb_TNS_barrier(RCCE_COMM* comm) { // two roundtrips to realize a barrier using a T&S Register for each core. // 1. search first free T&S Register to spin // 2. last waiter wakes up first waiter and continues local wait // 3. first waiter wakes up second waiter by releasing its lock ... // At least every used T&S Register is 0 and no UE can overtake a barrier. 
int num = comm->size; int step = 0; //fprintf(stderr,"%d:\t enter barrier \n",id); if(comm->label == 1) goto label1; if(comm->label == 2) goto label2; while( !Test_and_Set(step) ) ++step; // only one UE runs until T&S # num-1 //fprintf(stderr,"%d:\t step %d\n",id,step); if(step == num-1) { //fprintf(stderr,"%d:\t I am the last one\n",id); *(virtual_lockaddress[0]) = 0x0; comm->step = step; label1: step = comm->step; if(!Test_and_Set(step)) { comm->label = 1; return RCCE_PENDING; } *(virtual_lockaddress[step]) = 0x0; } else { comm->step = step; label2: step = comm->step; if(!Test_and_Set(step)) { comm->label = 2; return RCCE_PENDING; } *(virtual_lockaddress[step]) = 0x0; *(virtual_lockaddress[step+1]) = 0x0; } //fprintf(stderr,"released barrier! step: %d\n", step); comm->label = 0; return RCCE_SUCCESS; } #ifdef AIR RCCE_AIR RCCE_atomic_inc_regs[2*RCCE_MAXNP]; int RCCE_AIR_barrier2(RCCE_COMM *comm) { static int idx = 0; unsigned long long time, time1, time2; float ran = 0; int id, val = 0, val2 = 0; int window = comm->size; int ue = RCCE_ue(); int x = X_PID(ue), y = Y_PID(ue); int win = 1000000; // ++air_counter; if (comm == &RCCE_COMM_WORLD) { time = RCCE_wtime(); if ((id = *RCCE_atomic_inc_regs[idx].counter) < (comm->size-1)) { if(window > 16) { val = id; val2 = val; time1 = RCCE_wtime();; if(window > 26) { ran = ((y+x)%8)*window*window/24000000.0; window = (RCCE_wtime() - time)*win;//(RCCE_wtime() - time)*1000000.0; } else window = 1; ran = ran+(rand()%(window))/(win*100.0); do { time = RCCE_wtime() - time; time2 = RCCE_wtime()-time1-time/2; time1 = RCCE_wtime(); while(RCCE_wtime()-time1 < (((0.424+ran)*(comm->size-val)*(time2)/(val-val2+1)-time/2))) { if(RCCE_wtime()-time1>0.0050) break; } val2 = val; time = RCCE_wtime(); // ++air_counter; } while ((val = *RCCE_atomic_inc_regs[idx].init) > 0 && (val < comm->size)); } else { do { // ++air_counter; } while ((val = *RCCE_atomic_inc_regs[idx].init) > 0 && (val < comm->size)); } } else { 
*RCCE_atomic_inc_regs[idx].init = 0; } idx = !idx; return(RCCE_SUCCESS); } else { return RCCE_barrier(comm); } } #ifndef GORY int RCCE_dissemination_barrier(RCCE_COMM *comm) { int k, max_rounds; int ue, num_ues, ue_signal; ue = RCCE_ue(); num_ues = RCCE_num_ues(); max_rounds = num_ues*(1+(num_ues%2)?1:0); for(k = 1; k < max_rounds; k = k*2 ) { /* signalize process */ ue_signal = (ue+k)%num_ues; RCCE_flag_write(&RCCE_barrier_flag[RCCE_IAM], RCCE_FLAG_SET, ue_signal); /* wait for process */ ue_signal = (ue-k+num_ues+num_ues)%num_ues; RCCE_wait_until(RCCE_barrier_flag[ue_signal], RCCE_FLAG_SET); RCCE_flag_write(&RCCE_barrier_flag[ue_signal], RCCE_FLAG_UNSET, RCCE_IAM); } return(RCCE_SUCCESS); } #endif int RCCE_tree_init(RCCE_COMM *comm, tree_t *tree, int num_children) { int ue, num_ues; int i, j, k; tree_t nodes[RCCE_MAXNP]; if(comm != &RCCE_COMM_WORLD) return(!RCCE_SUCCESS); ue = RCCE_ue(); num_ues = RCCE_num_ues(); nodes[0].parent = -1; k = 1; for(i = 0; i < num_ues; ++i) { nodes[i].num_children = 0; for(j = 0; j < num_children && k < num_ues; ++j, ++k) { nodes[i].child[j] = k; nodes[k].parent = i; ++(nodes[i].num_children); } } memcpy(tree, &nodes[RCCE_IAM], sizeof(tree_t)); // printf("%d: child0:%d child1:%d parent:%d\n", ue, tree->child[0], tree->child[1], tree->parent);fflush(0); return(RCCE_SUCCESS); } #ifndef GORY int RCCE_tree_barrier(RCCE_COMM *comm, tree_t *tree) { int i; /* Gather */ for(i = 0; i < tree->num_children; ++i) { RCCE_wait_until(RCCE_barrier_flag[tree->child[i]], RCCE_FLAG_SET); RCCE_flag_write(&RCCE_barrier_flag[tree->child[i]], RCCE_FLAG_UNSET, RCCE_IAM); } if(tree->parent != -1) { RCCE_flag_write(&RCCE_barrier_flag[RCCE_IAM], RCCE_FLAG_SET, tree->parent); /* Release */ RCCE_wait_until(RCCE_barrier_release_flag, RCCE_FLAG_SET); RCCE_flag_write(&RCCE_barrier_release_flag, RCCE_FLAG_UNSET, RCCE_IAM); } /* Release */ for(i = 0; i < tree->num_children; ++i) { RCCE_flag_write(&RCCE_barrier_release_flag, RCCE_FLAG_SET, tree->child[i]); } 
return(RCCE_SUCCESS); } #endif int RCCE_tournament_barrier(RCCE_COMM *comm) { return(RCCE_SUCCESS); } int RCCE_tournament_fixed_barrier(RCCE_COMM *comm) { return(RCCE_SUCCESS); } int RCCE_AIR_barrier(RCCE_COMM *comm) { static int idx = 0; static unsigned int rand = 0; int backoff = BACKOFF_MIN, wait, i = 0; if (comm == &RCCE_COMM_WORLD) { if (*RCCE_atomic_inc_regs[idx].counter < (comm->size-1)) { while (*RCCE_atomic_inc_regs[idx].init > 0) { rand = rand * 1103515245u + 12345u; wait = BACKOFF_MIN + (rand % (backoff << i)); RC_wait(wait); if (wait < BACKOFF_MAX) i++; } } else { *RCCE_atomic_inc_regs[idx].init = 0; } idx = !idx; return(RCCE_SUCCESS); } else { return RCCE_barrier(comm); } } int RCCE_nb_AIR_barrier(RCCE_COMM *comm) { static int idx = 0; static unsigned int rand = 0; int backoff = BACKOFF_MIN, wait, i = 0; if(comm->label == 1) goto label1; if (comm == &RCCE_COMM_WORLD) { if (*RCCE_atomic_inc_regs[idx].counter < (comm->size-1)) { #if 0 // NO BACKOFF in Non-Blocking case ??? while (*RCCE_atomic_inc_regs[idx].init > 0) { rand = rand * 1103515245u + 12345u; wait = BACKOFF_MIN + (rand % (backoff << i)); RC_wait(wait); if (wait < BACKOFF_MAX) i++; } #else label1: if(*RCCE_atomic_inc_regs[idx].init > 0) { comm->label = 1; return RCCE_PENDING; } #endif } else { *RCCE_atomic_inc_regs[idx].init = 0; } idx = !idx; comm->label = 0; return(RCCE_SUCCESS); } else { return RCCE_barrier(comm); } } #endif int RCCE_acquire_treelock(RCCE_COMM* comm) { int i = 1; // concurrency factor int step; int group = (1 << i); int me = comm->my_rank; //fprintf(stdout,"%d\tstart treelock:\n", me); while (1){ //group <<= 1; //if(group > num) break; // first rank within group + mid of group (leftmost) step = ( me - ( me % group) ) + ( ( group - 1 ) >> 1 ) ; //fprintf(stdout,"%d\t%d\n", me, step); //fflush(stdout); while(!Test_and_Set(comm->member[step])); if(group >= comm->size) break; group <<= i; }// while ( group <= comm->size); // group is next 2^x //fprintf(stdout,"\n"); 
//fflush(stderr); return(RCCE_SUCCESS); } int RCCE_release_treelock(RCCE_COMM* comm) {//int myID, int num) { int step; int group; int v = comm->size; int me = comm->my_rank; // round up to the next highest power of 2 v--; v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; v++; // group = v; //printf(stderr,"%d\trelease treelock: [%d] ",myID,group); while(1) { step = ( me - ( me % group) ) + ( ( group - 1 ) >> 1 ); //fprintf(stderr," %d",step); *(virtual_lockaddress[(comm->member[step])]) = 0x0; group >>= 1; if(group < 2) break; } //fprintf(stderr,"\n"); //fflush(stderr); return(RCCE_SUCCESS); } int RCCE_backoff_lock(int ID) { //static int next = RC_MY_COREID; // try lock with backoff int i = 0; int backoff = BACKOFF_MIN, wait = 0, tmp = 0; unsigned int overflow = 0; while (1) { if (Test_and_Set(ID)) break; // Kongruenzgenerator next = ( next * 1103515245 + 12345 ) % ( INT_MAX ); wait = BACKOFF_MIN + ( next % ( backoff << i ) ); overflow += wait; if( overflow > INT_MAX ) overflow = INT_MAX; RC_wait(wait); if ( (backoff<=0; board--) RCCE_shmalloc_init(RC_SHM_BUFFER_START(board),RCCE_SHM_SIZE_MAX/RCCE_MAX_BOARDS); #endif #endif #endif // create global communicator (equivalent of MPI_COMM_WORLD); this will also allocate // the two synchronization flags associated with the global barrier RCCE_comm_split(RCCE_global_color, nothing, &RCCE_COMM_WORLD); // if power management is enabled, initialize more stuff; this includes two more // communicators (for voltage and frequency domains), plus two synchronization flags // associated with the barrier for each communicator #ifdef RC_POWER_MANAGEMENT int error; if (error=RCCE_init_RPC(RC_COREID, RCCE_IAM, RCCE_NP)) return(RCCE_error_return(RCCE_debug_RPC,error)); #endif #ifndef GORY // if we use the simplified API, we need to define more flags upfront for (ue=0; ue 1) { if(RCCE_IAM != RCCE_NP-1) { RCCE_send((char*)&RCCE_DEVICE_NR, sizeof(int), RCCE_IAM+1); } if(RCCE_IAM != 0) { RCCE_recv((char*)&tmp, sizeof(int), 
RCCE_IAM-1); if(tmp != RCCE_DEVICE_NR) tmp = RCCE_IAM; else tmp = -1; RCCE_send((char*)&tmp, sizeof(int), 0); } else { RCCE_NUM_DEVICES = 0; for(ue=1; ue 1) ) { printf("### %s: Remaining MPB space for communication: %zd Bytes per core\n", executable_name, RCCE_chunk); fflush(stdout); } #endif RCCE_barrier(&RCCE_COMM_WORLD); return (RCCE_SUCCESS); } //-------------------------------------------------------------------------------------- // FUNCTION: RCCE_finalize //-------------------------------------------------------------------------------------- // clean up at end of library usage (memory unmapping) and resetting of memory and // registers //-------------------------------------------------------------------------------------- int RCCE_finalize(void){ #ifdef SCC #ifndef __hermit__ int ue, iword; #endif RCCE_barrier(&RCCE_COMM_WORLD); // each UE clears its own MPB and test&set register //ERROR: THIS IS NOT THE START OF THE COMM BUFFER, BUT OF THE PAYLOAD AREA!! // for (iword=0; iword<(RCCE_BUFF_SIZE_MAX)/sizeof(int); iword++) // ((int *)(RCCE_comm_buffer[ue]))[iword] = 0; // MPBunalloc(&(RCCE_comm_buffer[ue])); #ifndef __hermit__ RCCE_release_lock(RCCE_IAM); // each core needs to unmap all special memory locations for (ue=0; ue