//*************************************************************************************** // Administrative routines. //*************************************************************************************** // // Author: Rob F. Van der Wijngaart // Intel Corporation // Date: 12/22/2010 // // //*************************************************************************************** // // // Copyright 2010 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #include #include #ifdef CONFIG_ROCKCREEK #include #ifdef RC_POWER_MANAGEMENT #include #endif #ifdef SCC //#include //#include #include #include #endif // #include // #include // #include // En-/ or disable debug prints... #define DEBUG 0 #define Test_and_Set(a) ((*(virtual_lockaddress[a])) & 0x01) //...................................................................................... // GLOBAL VARIABLES USED BY THE LIBRARY //...................................................................................... int RCCE_NP; // number of participating cores int RC_REFCLOCKMHZ; // baseline CPU frequency (MHz) int RC_MY_COREID; // physical ID of calling core int RC_COREID[RCCE_MAXNP]; // array of physical core IDs for all participating // cores, sorted by rank int RC_RCCEID[RCCE_MAXNP] = {[0 ... RCCE_MAXNP-1] = -1}; // array of RCCE IDs for all cores int RCCE_IAM=-1; // rank of calling core (invalid by default) RCCE_COMM RCCE_COMM_WORLD; // predefined global communicator int RCCE_BUFF_SIZE; // available MPB size t_vcharp RCCE_comm_buffer[RCCE_MAXNP]; // starts of MPB, sorted by rank #ifndef GORY // ......................... non-GORY communication mode ............................. // synchronization flags are predefined and maintained by the library RCCE_FLAG RCCE_sent_flag[RCCE_MAXNP], RCCE_ready_flag[RCCE_MAXNP]; // payload part of the MPBs starts at a specific address, not malloced space t_vcharp RCCE_buff_ptr; // maximum chunk size of message payload is also specified size_t RCCE_chunk; // synchronization flags will be allocated at this address t_vcharp RCCE_flags_start; #endif t_vcharp RCCE_fool_write_combine_buffer; #ifdef SCC // virtual addresses of test&set registers t_vcharp virtual_lockaddress[RCCE_MAXNP]; #endif //...................................................................................... // END GLOBAL VARIABLES USED BY THE LIBRARY //...................................................................................... #ifdef SCC #ifndef __INTEL_COMPILER inline volatile long long _rdtsc() { register long long TSC __asm__("eax"); __asm__ volatile (".byte 15, 49" : : : "eax", "edx"); return TSC; } #endif #endif //-------------------------------------------------------------------------------------- // FUNCTION: RC_cache_invalidate //-------------------------------------------------------------------------------------- // invalidate (not flush!) lines in L1 that map to MPB lines //-------------------------------------------------------------------------------------- void RC_cache_invalidate() { #ifdef SCC __asm__ volatile ( ".byte 0x0f; .byte 0x0a;\n" ); // CL1FLUSHMB #endif return; } int RCCE_TNS_barrier(RCCE_COMM* comm) { // two roundtrips to realize a barrier using a T&S Register for each core. // 1. search first free T&S Register to spin // 2. last waiter wakes up first waiter and continues local wait // 3. first waiter wakes up second waiter by releasing its lock ... // At least every used T&S Register is 0 and no UE can overtake a barrier. int num = comm->size; int step = 0; //fprintf(stderr,"%d:\t enter barrier \n",id); while( !Test_and_Set(step) ) ++step; // only one UE runs until T&S # num-1 //fprintf(stderr,"%d:\t step %d\n",id,step); if(step == num-1) { //fprintf(stderr,"%d:\t I am the last one\n",id); *(virtual_lockaddress[0]) = 0x0; while(!Test_and_Set(step)) ; *(virtual_lockaddress[step]) = 0x0; } else { while(!Test_and_Set(step)) ; *(virtual_lockaddress[step]) = 0x0; *(virtual_lockaddress[step+1]) = 0x0; } //fprintf(stderr,"released barrier! step: %d\n", step); return RCCE_SUCCESS; } //-------------------------------------------------------------------------------------- // FUNCTION: RC_COMM_BUFFER_SIZE //-------------------------------------------------------------------------------------- // return total available MPB size on chip //-------------------------------------------------------------------------------------- int RC_COMM_BUFFER_SIZE() { return RCCE_BUFF_SIZE_MAX*RCCE_MAXNP; } //-------------------------------------------------------------------------------------- // FUNCTION: RC_COMM_BUFFER_START //-------------------------------------------------------------------------------------- // return (virtual) start address of MPB for UE with rank ue //-------------------------------------------------------------------------------------- t_vcharp RC_COMM_BUFFER_START(int ue){ #ifdef SCC // "Allocate" MPB, using memory mapping of physical addresses t_vcharp retval; MPBalloc(&retval, X_PID(RC_COREID[ue]), Y_PID(RC_COREID[ue]), Z_PID(RC_COREID[ue]), (X_PID(RC_COREID[ue]) == X_PID(RC_COREID[RCCE_IAM])) && (Y_PID(RC_COREID[ue]) == Y_PID(RC_COREID[RCCE_IAM])) ); return retval; #else // even in functional emulation mode we leave gaps in the global MPB return RC_comm_buffer + RC_COREID[ue]*RC_COMM_BUFFER_SIZE()/RCCE_MAXNP; #endif } //-------------------------------------------------------------------------------------- // FUNCTION: RC_SHM_BUFFER_START //-------------------------------------------------------------------------------------- // return (virtual) start address of off-chip shared memory //-------------------------------------------------------------------------------------- t_vcharp RC_SHM_BUFFER_START(){ #ifdef SCC t_vcharp retval; SHMalloc(&retval); //SHMalloc() is in SCC_API.c return retval; #else return RC_shm_buffer; #endif } //-------------------------------------------------------------------------------------- // FUNCTION: MYCOREID //-------------------------------------------------------------------------------------- // return physical core ID of calling core //-------------------------------------------------------------------------------------- int MYCOREID() { #ifdef SCC int tmp, x, y, z; tmp=ReadConfigReg(CRB_OWN+MYTILEID); x=(tmp>>3) & 0x0f; // bits 06:03 y=(tmp>>7) & 0x0f; // bits 10:07 z=(tmp ) & 0x07; // bits 02:00 return ( ( x + ( 6 * y ) ) * 2 ) + z; // True Processor ID! #else // the COREIDs are read into the main program in potentially random order. // Each core can access its own Core ID. We simulate that by selecting // the value in the list of coreids that corresponds to the sequence // number of the OpenMP thread number return RC_COREID[omp_get_thread_num()]; #endif } //-------------------------------------------------------------------------------------- // FUNCTION: RCCE_acquire_lock //-------------------------------------------------------------------------------------- // acquire lock corresponding to core with rank ID //-------------------------------------------------------------------------------------- int RCCE_acquire_lock(int ID) { #ifdef SCC // semantics of test&set register: a read returns zero if another core has // previously read it and no reset has occurred since then. Otherwise, the read // returns one. Comparing (hex) one with the contents of the register forces a // read. As long as the comparison fails, we keep reading. while (!((*(virtual_lockaddress[ID])) & 0x01)); #else omp_set_lock(&(RCCE_corelock[ID])); #endif return(RCCE_SUCCESS); } //-------------------------------------------------------------------------------------- // FUNCTION: RCCE_release_lock //-------------------------------------------------------------------------------------- // release lock corresponding to core with rank ID //-------------------------------------------------------------------------------------- int RCCE_release_lock(int ID) { #ifdef SCC // semantics of test&set register: a write by _any_ core causes a reset *(virtual_lockaddress[ID]) = 0x0; #else omp_unset_lock(&(RCCE_corelock[ID])); #endif return RCCE_SUCCESS; } //-------------------------------------------------------------------------------------- // FUNCTION: RC_FREQUENCY //-------------------------------------------------------------------------------------- // return actual core clock frequency (Hz) //-------------------------------------------------------------------------------------- long long RC_FREQUENCY() { return (long long)(RC_REFCLOCKMHZ*1.e6); } //-------------------------------------------------------------------------------------- // FUNCTION: RCCE_init //-------------------------------------------------------------------------------------- // initialize the library and sanitize parameter list //-------------------------------------------------------------------------------------- int RCCE_init( int *argc, // pointer to argc, passed in from main program char ***argv // pointer to argv, passed in from main program ) { int i, ue, dummy_offset, loc, error; #ifdef SCC int x, y, z; unsigned int physical_lockaddress; #endif #ifdef SHMADD unsigned int RCCE_SHM_BUFFER_offset ,result, rd_slot_nbr, wr_slot_nbr; #endif void *nothing = NULL; #ifdef SCC // Copperridge specific initialization... InitAPI(0); //fflush(0) #endif // save pointer to executable name for later insertion into the argument list char *executable_name = (*argv)[0]; RCCE_NP = atoi(*(++(*argv))); RC_REFCLOCKMHZ = atoi(*(++(*argv))); // put the participating core ids (unsorted) into an array for (ue=0; ue