// 
// Copyright 2010 Intel Corporation
// 
//    Licensed under the Apache License, Version 2.0 (the "License");
//    you may not use this file except in compliance with the License.
//    You may obtain a copy of the License at
// 
//        http://www.apache.org/licenses/LICENSE-2.0
// 
//    Unless required by applicable law or agreed to in writing, software
//    distributed under the License is distributed on an "AS IS" BASIS,
//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//    See the License for the specific language governing permissions and
//    limitations under the License.
// 
//    [2010-10-25] added support for non-blocking send/recv operations
//                 - RCCE_isend(), ..._test(), ..._wait(), ..._push()
//                 - RCCE_irecv(), ..._test(), ..._wait(), ..._push()
//                 by Carsten Clauss, Chair for Operating Systems,
//                                    RWTH Aachen University
//
//    [2012-09-10] added support for "tagged" flags
//                 - RCCE_send_tagged(), RCCE_recv_tagged(), RCCE_recv_probe_tagged()
//                 by Carsten Clauss, Chair for Operating Systems,
//                                    RWTH Aachen University
//
//    [2015-10-18] port (i)RCCE to "HermitCore"
//                 by Stefan Lankes, Institute for Automation of Complex Power Systems
//                                   RWTH Aachen University

#ifndef RCCE_H
#define RCCE_H

#include <stdlib.h>
#include <stdio.h>

// HermitCore build configuration: when __hermit__ is defined, the SCC,
// COPPERRIDGE, USE_REMOTE_PUT_LOCAL_GET and USE_PROBE_FLAGS switches are
// turned on and SHMADD is explicitly turned off (SHMADD would otherwise
// select the larger RCCE_SHM_SIZE_MAX further down in this header).
#ifdef __hermit__
#define SCC
#define COPPERRIDGE
#define USE_REMOTE_PUT_LOCAL_GET
#define USE_PROBE_FLAGS
#undef SHMADD
#endif

// Version string of this RCCE release.
// NOTE(review): identifiers beginning with an underscore followed by an
// uppercase letter are reserved for the implementation in C; the name is
// kept because it is part of the public header.
#define _RCCE "1.0.13 release"
// Optional flag representations, disabled by default. They are tested by the
// RCCE_FLAG / RCCE_FLAG_STATUS typedefs further down in this header.
// #define USE_BYTE_FLAGS
// #define USE_FLAG_EXPERIMENTAL
// Little trick (see the RCCE_APP definition below): the application entry
// point is called "RCCE_APP" under OpenMP, and becomes "main" otherwise.

// Absolute value of x. Every use of the argument is parenthesized so that
// compound expressions such as ABS(a - b) expand correctly.
// Note: x is evaluated twice, so avoid arguments with side effects.
#define ABS(x) (((x) > 0) ? (x) : -(x))

// Without OpenMP, or on HermitCore regardless of OpenMP, RCCE_APP is an alias
// for main. In an OpenMP build that is not HermitCore the macro stays
// undefined, so RCCE_APP remains an ordinary identifier.
#if !defined(_OPENMP) || defined(__hermit__)
  #define RCCE_APP main
#endif

// Stream macro for error output; currently mapped to stdout.
// Modify the next line for Intel BareMetal, which supports stdout, but not stderr.
#define STDERR                             stdout

// Line size is expressed as a power of two: 2^6 = 64 bytes on HermitCore,
// 2^5 = 32 bytes otherwise.
#ifdef __hermit__
#define LOG2_LINE_SIZE                     6
#else
#define LOG2_LINE_SIZE                     5
#endif
#define RCCE_LINE_SIZE                     (1<<LOG2_LINE_SIZE)
// RCCE_BUFF_SIZE_MAX is space per UE, which is half of the space per tile
// (64 KiB on HermitCore, 8 KiB otherwise).
#ifdef __hermit__
#define RCCE_BUFF_SIZE_MAX                 (64*1024)
#else
#define RCCE_BUFF_SIZE_MAX                 (1<<13)
#endif

// Upper bound for the shared-memory region. With SHMADD one of the candidate
// sizes below is selected by (un)commenting; 512MB is the active choice.
// Without SHMADD the limit is 64MB, or 4MB when SCC_COUPLED_SYSTEMS is defined.
#ifdef SHMADD
//64MB
//#define RCCE_SHM_SIZE_MAX                0x4000000 
// 128MB
//#define RCCE_SHM_SIZE_MAX                0x8000000 
// 256MB
//#define RCCE_SHM_SIZE_MAX                0x10000000 
// 512MB
#define RCCE_SHM_SIZE_MAX                  0x20000000 
// 960MB
//#define RCCE_SHM_SIZE_MAX                0x3C000000 
#else
  #ifndef SCC_COUPLED_SYSTEMS
  // 64MB
  #define RCCE_SHM_SIZE_MAX                  (1<<26)
  #else
  // In Coupled Mode only 4MB
  #define RCCE_SHM_SIZE_MAX                  (1<<22)
  #endif
#endif

// Maximum number of UEs: boards times UEs per board. HermitCore uses a single
// board with 8 UEs; otherwise up to 2 boards with 48 UEs each (96 in total).
// RCCE_MAXNP sizes the member/child arrays in RCCE_COMM and tree_t below.
#ifdef __hermit__
#define RCCE_MAX_BOARDS			   1
#define RCCE_MAXNP_PER_BOARD               8
#else
#define RCCE_MAX_BOARDS                    2 /* allow up to 2 SCC boards for now */
#define RCCE_MAXNP_PER_BOARD               48
#endif
#define RCCE_MAXNP                         (RCCE_MAX_BOARDS * RCCE_MAXNP_PER_BOARD)
// Generic status values. The negative ones are parenthesized so that each
// macro always expands to a single operand, whatever tokens surround it.
#define RCCE_SUCCESS                       0
#define RCCE_PENDING                       (-1)
#define RCCE_RESERVED                      (-2)
#define RCCE_REJECTED                      (-3)
// Error codes: consecutive values RCCE_ERROR_BASE+1 .. RCCE_ERROR_BASE+28.
// When adding a code, append it with the next free offset.
#define RCCE_ERROR_BASE                    1234321
#define RCCE_ERROR_TARGET                  (RCCE_ERROR_BASE +  1)
#define RCCE_ERROR_SOURCE                  (RCCE_ERROR_BASE +  2)
#define RCCE_ERROR_ID                      (RCCE_ERROR_BASE +  3)
#define RCCE_ERROR_MESSAGE_LENGTH          (RCCE_ERROR_BASE +  4)
#define RCCE_ERROR_FLAG_UNDEFINED          (RCCE_ERROR_BASE +  5)
#define RCCE_ERROR_NUM_UES                 (RCCE_ERROR_BASE +  6)
#define RCCE_ERROR_DATA_OVERLAP            (RCCE_ERROR_BASE +  7)
#define RCCE_ERROR_ALIGNMENT               (RCCE_ERROR_BASE +  8)
#define RCCE_ERROR_DEBUG_FLAG              (RCCE_ERROR_BASE +  9)
#define RCCE_ERROR_FLAG_NOT_IN_COMM_BUFFER (RCCE_ERROR_BASE + 10)
#define RCCE_ERROR_FLAG_STATUS_UNDEFINED   (RCCE_ERROR_BASE + 11)
#define RCCE_ERROR_FLAG_NOT_ALLOCATED      (RCCE_ERROR_BASE + 12)
#define RCCE_ERROR_VAL_UNDEFINED           (RCCE_ERROR_BASE + 13)
#define RCCE_ERROR_INVALID_ERROR_CODE      (RCCE_ERROR_BASE + 14)
#define RCCE_ERROR_RPC_NOT_ALLOCATED       (RCCE_ERROR_BASE + 15)
#define RCCE_ERROR_RPC_INTERNAL            (RCCE_ERROR_BASE + 16)
#define RCCE_ERROR_MULTIPLE_RPC_REQUESTS   (RCCE_ERROR_BASE + 17)
#define RCCE_ERROR_FDIVIDER                (RCCE_ERROR_BASE + 18)
#define RCCE_ERROR_FREQUENCY_EXCEEDED      (RCCE_ERROR_BASE + 19)
#define RCCE_ERROR_NO_ACTIVE_RPC_REQUEST   (RCCE_ERROR_BASE + 20)
#define RCCE_ERROR_STALE_RPC_REQUEST       (RCCE_ERROR_BASE + 21)
#define RCCE_ERROR_COMM_UNDEFINED          (RCCE_ERROR_BASE + 22)
#define RCCE_ERROR_ILLEGAL_OP              (RCCE_ERROR_BASE + 23)
#define RCCE_ERROR_ILLEGAL_TYPE            (RCCE_ERROR_BASE + 24)
#define RCCE_ERROR_MALLOC                  (RCCE_ERROR_BASE + 25)
#define RCCE_ERROR_COMM_INITIALIZED        (RCCE_ERROR_BASE + 26)
#define RCCE_ERROR_CORE_NOT_IN_HOSTFILE    (RCCE_ERROR_BASE + 27)
#define RCCE_ERROR_NO_MULTICAST_SUPPORT    (RCCE_ERROR_BASE + 28)
// NOTE(review): presumably the buffer length expected by RCCE_error_string();
// confirm against its implementation.
#define RCCE_MAX_ERROR_STRING              45

// Debug category selectors (presumably the values accepted by
// RCCE_debug_set() / RCCE_debug_unset() — confirm).
#define RCCE_DEBUG_ALL                     111111
#define RCCE_DEBUG_SYNCH                   111444
#define RCCE_DEBUG_COMM                    111555
#define RCCE_DEBUG_RPC                     111666
#define RCCE_DEBUG_DEBUG                   111888

// The two states of a synchronization flag.
#define RCCE_FLAG_SET                      1
#define RCCE_FLAG_UNSET                    0

// Reduction operators: RCCE_NUM_OPS consecutive values starting at RCCE_OP_BASE.
#define RCCE_NUM_OPS                       4
#define RCCE_OP_BASE                       23232323
#define RCCE_SUM                           (RCCE_OP_BASE)
#define RCCE_MIN                           (RCCE_OP_BASE+1)
#define RCCE_MAX                           (RCCE_OP_BASE+2)
#define RCCE_PROD                          (RCCE_OP_BASE+3)

// Element type identifiers: consecutive values starting at RCCE_TYPE_BASE.
#define RCCE_TYPE_BASE                     63636363
#define RCCE_INT                           (RCCE_TYPE_BASE)
#define RCCE_LONG                          (RCCE_TYPE_BASE+1)
#define RCCE_FLOAT                         (RCCE_TYPE_BASE+2)
#define RCCE_DOUBLE                        (RCCE_TYPE_BASE+3)

// MPB pointer type: a byte pointer whose target is volatile, so every access
// through it is actually performed rather than cached in a register.
typedef volatile unsigned char* t_vcharp;

// RCCE_FLAG has one of three representations, chosen at compile time:
//  - SINGLEBITFLAGS or USE_BYTE_FLAGS (without USE_FLAG_EXPERIMENTAL):
//    a descriptor struct locating the flag inside a line;
//  - USE_FLAG_EXPERIMENTAL: a pointer to a volatile byte;
//  - otherwise: a pointer to a volatile ssize_t.
// NOTE(review): ssize_t is a POSIX type from <sys/types.h>; this header only
// includes <stdlib.h> and <stdio.h> — confirm it is available on all targets.
#if (defined(SINGLEBITFLAGS) || defined(USE_BYTE_FLAGS)) && !defined(USE_FLAG_EXPERIMENTAL)
typedef struct {
   int  location;      /* location of bit within line (0-255)  */
   t_vcharp flag_addr; /* address of byte containing flag inside cache line */
   t_vcharp line_address; /* start of cache line containing flag  */
}  RCCE_FLAG;
#else
#ifdef USE_FLAG_EXPERIMENTAL
typedef volatile unsigned char *RCCE_FLAG;
#else
typedef volatile ssize_t *RCCE_FLAG;
#endif
#endif

// Value type of a flag; its width follows the representation chosen above
// (one byte with USE_FLAG_EXPERIMENTAL, ssize_t otherwise).
#ifdef USE_FLAG_EXPERIMENTAL
typedef unsigned char RCCE_FLAG_STATUS;
#else
typedef ssize_t RCCE_FLAG_STATUS;
#endif

// Communicator: a group of UEs plus the flags and counters used for barriers.
// Several API functions take this struct by value, so its layout is part of
// the ABI.
typedef struct {
  int size;                  // presumably the number of UEs in the communicator — confirm
  int my_rank;               // presumably the calling UE's rank within it — confirm
  int initialized;
  int member[RCCE_MAXNP];    // one slot per possible UE
#ifdef USE_FAT_BARRIER 
  RCCE_FLAG gather[RCCE_MAXNP]; // fat barrier: one gather flag per possible UE
#else
  RCCE_FLAG gather;             // single gather flag
#endif
  RCCE_FLAG release;  
  volatile int cycle;
  volatile int count;
  int step;
  int label;
} RCCE_COMM;

// Bookkeeping for one non-blocking send; a pointer to it is passed to
// RCCE_isend(), RCCE_isend_test() and RCCE_isend_wait().
typedef struct _RCCE_SEND_REQUEST {
  char *privbuf;    // source buffer in local private memory (send buffer)
  t_vcharp combuf;  // intermediate buffer in MPB
  size_t chunk;     // size of MPB available for this message (bytes)
  RCCE_FLAG *ready; // flag indicating whether receiver is ready
  RCCE_FLAG *sent;  // flag indicating whether message has been sent by source
  size_t size;      // size of message (bytes)
  int dest;         // UE that will receive the message

  int copy;         // set to 0 for synchronization only (no copying/sending)
  void* tag;        // optional tag data accompanying the message
  int len;          // length of additional tag
  RCCE_FLAG *probe; // flag for probing for incoming messages

  size_t wsize;     // offset within send buffer when putting in "chunk" bytes
  size_t remainder; // bytes remaining to be sent
  size_t nbytes;    // number of bytes to be sent in single RCCE_put call
  char *bufptr;     // running pointer inside privbuf for current location

  int label;        // jump/goto label for re-entering the respective poll function
  int finished;     // flag that indicates whether the request has already been finished

  struct _RCCE_SEND_REQUEST *next; // next request; presumably links a list of pending sends — confirm
} RCCE_SEND_REQUEST;

// Bookkeeping for one non-blocking receive; a pointer to it is passed to
// RCCE_irecv(), RCCE_irecv_test() and RCCE_irecv_wait().
// NOTE(review): several field comments were copied verbatim from
// RCCE_SEND_REQUEST and still speak of "send"/"put"; for a receive they
// presumably describe the mirrored receive/get side — confirm.
typedef struct _RCCE_RECV_REQUEST {
  char *privbuf;    // buffer in local private memory (presumably the receive/destination buffer)
  t_vcharp combuf;  // intermediate buffer in MPB
  size_t chunk;     // size of MPB available for this message (bytes)
  RCCE_FLAG *ready; // flag indicating whether receiver is ready
  RCCE_FLAG *sent;  // flag indicating whether message has been sent by source
  size_t size;      // size of message (bytes)
  int source;       // UE that will send the message

  int copy;         // set to 0 for cancel function
  void* tag;        // optional tag data accompanying the message
  int len;          // length of additional tag
  RCCE_FLAG *probe;  // flag for probing for incoming messages

  size_t wsize;     // offset within the private buffer when moving "chunk" bytes
  size_t remainder; // bytes remaining to be transferred
  size_t nbytes;    // number of bytes moved in a single transfer call (presumably RCCE_get)
  char *bufptr;     // running pointer inside privbuf for current location

  int label;        // jump/goto label for re-entering the respective poll function
  int finished;     // flag that indicates whether the request has already been finished

  struct _RCCE_RECV_REQUEST *next; // next request; presumably links a list of pending receives — confirm
} RCCE_RECV_REQUEST;

// One UE's view of a tree topology; taken by RCCE_tree_init() and
// RCCE_tree_barrier().
typedef struct tree_s {
  int parent; // UE of parent
  int num_children; // presumably the number of valid entries in child[] — confirm
  int child[RCCE_MAXNP]; // UEs of children
} tree_t;

// Power management API, only available when RC_POWER_MANAGEMENT is defined.
#ifdef RC_POWER_MANAGEMENT
// Record of a power change request: the old and new voltage level and
// frequency divider, plus a start cycle. Passed to RCCE_iset_power() and
// RCCE_wait_power().
typedef struct{
    int release;
    int old_voltage_level;
    int new_voltage_level;
    int old_frequency_divider;
    int new_frequency_divider;
    long long start_cycle;
  } RCCE_REQUEST;
int RCCE_power_domain(void);
int RCCE_iset_power(int, RCCE_REQUEST *, int *, int *);
int RCCE_wait_power(RCCE_REQUEST *);
int RCCE_set_frequency_divider(int, int *);
int RCCE_power_domain_master(void);
int RCCE_power_domain_size(void);
#endif  

// Library start-up/shutdown, timing and UE identification.
// RCCE_init() takes pointers to argc and argv (presumably so the library can
// consume its own command-line arguments — confirm).
int    RCCE_init(int *, char***);
int    RCCE_finalize(void);
double RCCE_wtime(void);
int    RCCE_ue(void);
int    RCCE_num_ues(void);
// Device (board) queries, only declared for coupled SCC systems.
#ifdef SCC_COUPLED_SYSTEMS
int RCCE_dev(void);
int RCCE_dev_ue(void);
int RCCE_num_dev(void);
int RCCE_num_ues_dev(int);
int RCCE_ue_to_dev(int);
#endif
// Two mutually exclusive flavors of the communication API.
// With GORY defined, send/recv take the MPB buffer, chunk size and
// synchronization flags explicitly, and the MPB allocation, put/get and flag
// primitives are exposed. Without GORY, send/recv take only a private buffer,
// a size and a UE id, and the pipelined, multicast, tagged, collective and
// non-blocking variants are declared instead.
#ifdef GORY
t_vcharp RCCE_malloc(size_t);
t_vcharp RCCE_malloc_request(size_t, size_t *);
t_vcharp RCCE_palloc(size_t,int);
void   RCCE_free(t_vcharp);
int    RCCE_put(t_vcharp, t_vcharp, int, int);
int    RCCE_get(t_vcharp, t_vcharp, int, int);
int    RCCE_wait_until(RCCE_FLAG, RCCE_FLAG_STATUS);
int    RCCE_test_flag(RCCE_FLAG, RCCE_FLAG_STATUS, int *);
int    RCCE_flag_alloc(RCCE_FLAG *);
int    RCCE_flag_free(RCCE_FLAG *);
int    RCCE_flag_write(RCCE_FLAG *, RCCE_FLAG_STATUS, int);
int    RCCE_flag_read(RCCE_FLAG, RCCE_FLAG_STATUS *, int);
int    RCCE_flag_write_tagged(RCCE_FLAG *, RCCE_FLAG_STATUS, int, char*, int); 
int    RCCE_flag_read_tagged(RCCE_FLAG, RCCE_FLAG_STATUS *, int, char*, int);
int    RCCE_send(char *, t_vcharp, size_t, RCCE_FLAG *, RCCE_FLAG *, size_t, int);
int    RCCE_recv(char *, t_vcharp, size_t, RCCE_FLAG *, RCCE_FLAG *, size_t, int, RCCE_FLAG *);
int    RCCE_recv_test(char *, t_vcharp, size_t, RCCE_FLAG *, RCCE_FLAG *, size_t, int, int *, RCCE_FLAG *);
// Flag-specific put/get, only with the experimental flag representation.
#ifdef USE_FLAG_EXPERIMENTAL
int    RCCE_put_flag(t_vcharp, t_vcharp, int, int);
int    RCCE_get_flag(t_vcharp, t_vcharp, int, int);
#endif
#else
// standard non-gory functions:

t_vcharp RCCE_malloc(size_t);

int    RCCE_flag_write(RCCE_FLAG *, RCCE_FLAG_STATUS, int);
int    RCCE_flag_read(RCCE_FLAG, RCCE_FLAG_STATUS *, int);

// Blocking point-to-point transfers (buffer, size, UE), followed by the
// pipelined, multicast and tagged variants.
int    RCCE_send(char *, size_t, int);
int    RCCE_recv(char *, size_t, int);
int    RCCE_recv_test(char *, size_t, int, int *);
int    RCCE_send_pipe(char *, size_t, int);
int    RCCE_recv_pipe(char *, size_t, int);
int    RCCE_send_mcast(char *, size_t);
int    RCCE_recv_mcast(char *, size_t, int);
int    RCCE_send_tagged(char *, size_t, int, void *, int);
int    RCCE_recv_tagged(char *, size_t, int, void *, int);
int    RCCE_recv_probe_tagged(int, int *, t_vcharp *, void *, int);
// Collectives; note that the communicator is passed by value.
int    RCCE_allreduce(char *, char *, int, int, int, RCCE_COMM);
int    RCCE_reduce(char *, char *, int, int, int, int, RCCE_COMM);
int    RCCE_bcast(char *, size_t, int, RCCE_COMM);
int    RCCE_recv_probe(int, int *, t_vcharp *);
int    RCCE_recv_cancel(size_t, int);
// Non-blocking send/recv: start a request, then test, wait or push it.
int    RCCE_isend(char *, size_t, int, RCCE_SEND_REQUEST *);
int    RCCE_isend_test(RCCE_SEND_REQUEST *, int *);
int    RCCE_isend_wait(RCCE_SEND_REQUEST *);
int    RCCE_isend_push(int);
int    RCCE_irecv(char *, size_t, int, RCCE_RECV_REQUEST *);
int    RCCE_irecv_test(RCCE_RECV_REQUEST *, int *);
int    RCCE_irecv_wait(RCCE_RECV_REQUEST *);
int    RCCE_irecv_push(int);

#endif
// Shared-memory allocation interface (allocate, free, flush, reallocate);
// declared in both the GORY and non-GORY configurations.
t_vcharp RCCE_shmalloc(size_t);
void     RCCE_shfree(t_vcharp);
void     RCCE_shflush(void);
t_vcharp RCCE_shrealloc(t_vcharp, size_t);

// LfBS-customized functions:
// memcpy-style copy routines (destination, source, byte count). The generic
// RCCE_memcpy() name maps to the "put" variant.
void*  RCCE_memcpy_get(void *, const void *, size_t);
void*  RCCE_memcpy_put(void *, const void *, size_t);
#define RCCE_memcpy(a,b,c) RCCE_memcpy_put(a,b,c)

// Communicator management. RCCE_comm_split() takes a callback of type
// int (*)(int, void *) together with an opaque pointer (presumably forwarded
// to the callback — confirm); size and rank are returned through the int *
// argument.
int    RCCE_comm_split(int (*)(int, void *), void *, RCCE_COMM *);
int    RCCE_comm_free(RCCE_COMM *);
int    RCCE_comm_size(RCCE_COMM, int *);
int    RCCE_comm_rank(RCCE_COMM, int *);
// Memory fence and the family of barrier implementations; all barriers
// operate on a communicator passed by pointer.
void   RCCE_fence(void);
int    RCCE_barrier(RCCE_COMM *);
int    RCCE_tree_init(RCCE_COMM *, tree_t *, int);
int    RCCE_tree_barrier(RCCE_COMM *, tree_t *);
int    RCCE_tournament_barrier(RCCE_COMM *);
int    RCCE_tournament_fixed_barrier(RCCE_COMM *);
int    RCCE_dissemination_barrier(RCCE_COMM *);
int    RCCE_TNS_barrier(RCCE_COMM *);
int    RCCE_AIR_barrier(RCCE_COMM *);
int    RCCE_AIR_barrier2(RCCE_COMM *);
int    RCCE_nb_barrier(RCCE_COMM *);
int    RCCE_nb_TNS_barrier(RCCE_COMM *);
int    RCCE_nb_AIR_barrier(RCCE_COMM *);
// Error reporting and debug switches.
int    RCCE_error_string(int, char *, int *);
int    RCCE_debug_set(int);
int    RCCE_debug_unset(int);

// Predefined communicators; declared here, defined in the library.
extern RCCE_COMM    RCCE_COMM_WORLD;
#ifdef RC_POWER_MANAGEMENT
extern RCCE_COMM    RCCE_P_COMM;
// Parenthesized so the negative literal always expands as a single operand.
#define RCCE_POWER_DEFAULT (-99999)
#endif

// In OpenMP builds (HermitCore excluded) each thread gets its own copy of the
// predefined communicators.
#if defined(_OPENMP) && !defined(__hermit__)
#pragma omp threadprivate (RCCE_COMM_WORLD)
#ifdef RC_POWER_MANAGEMENT
#pragma omp threadprivate (RCCE_P_COMM)
#endif
#endif

#endif