tvheadend/ffdecsa/FFdecsa.c

/* FFdecsa -- fast decsa algorithm
 *
 * Copyright (C) 2003-2004  fatih89r
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


#include <sys/types.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

#include "FFdecsa.h"

// Fallback definition for builds where none of the included headers
// provided NULL.
#ifndef NULL
#define NULL 0
#endif

// Debug tracing switch. Uncomment the next line to enable it.
// DBG(a) expands to its argument when DEBUG is defined and to nothing
// otherwise, so statements wrapped in DBG(...) disappear from normal builds.
//#define DEBUG
#ifdef DEBUG
#define DBG(a) a
#else
#define DBG(a)
#endif

//// parallelization stuff, large speed differences are possible
// possible choices
// Each constant is an identifier for one implementation header; the values
// are only compared for equality by the #if/#elif chain that follows.
#define PARALLEL_32_4CHAR     320
#define PARALLEL_32_4CHARA    321
#define PARALLEL_32_INT       322
#define PARALLEL_64_8CHAR     640
#define PARALLEL_64_8CHARA    641
#define PARALLEL_64_2INT      642
#define PARALLEL_64_LONG      643
#define PARALLEL_64_MMX       644
#define PARALLEL_128_16CHAR  1280
#define PARALLEL_128_16CHARA 1281
#define PARALLEL_128_4INT    1282
#define PARALLEL_128_2LONG   1283
#define PARALLEL_128_2MMX    1284
#define PARALLEL_128_SSE     1285
#define PARALLEL_128_SSE2    1286

//////// our choice //////////////// our choice //////////////// our choice //////////////// our choice ////////
// Default selection. Define PARALLEL_MODE before this point (for example on
// the compiler command line) to pick a different implementation.
#ifndef PARALLEL_MODE
#define PARALLEL_MODE PARALLEL_64_MMX
#endif
//////// our choice //////////////// our choice //////////////// our choice //////////////// our choice ////////
//////// our choice //////////////// our choice //////////////// our choice //////////////// our choice ////////

#include "parallel_generic.h"
//// conditionals
#if PARALLEL_MODE==PARALLEL_32_4CHAR
#include "parallel_032_4char.h"
#elif PARALLEL_MODE==PARALLEL_32_4CHARA
#include "parallel_032_4charA.h"
#elif PARALLEL_MODE==PARALLEL_32_INT
#include "parallel_032_int.h"
#elif PARALLEL_MODE==PARALLEL_64_8CHAR
#include "parallel_064_8char.h"
#elif PARALLEL_MODE==PARALLEL_64_8CHARA
#include "parallel_064_8charA.h"
#elif PARALLEL_MODE==PARALLEL_64_2INT
#include "parallel_064_2int.h"
#elif PARALLEL_MODE==PARALLEL_64_LONG
#include "parallel_064_long.h"
#elif PARALLEL_MODE==PARALLEL_64_MMX
#include "parallel_064_mmx.h"
#elif PARALLEL_MODE==PARALLEL_128_16CHAR
#include "parallel_128_16char.h"
#elif PARALLEL_MODE==PARALLEL_128_16CHARA
#include "parallel_128_16charA.h"
#elif PARALLEL_MODE==PARALLEL_128_4INT
#include "parallel_128_4int.h"
#elif PARALLEL_MODE==PARALLEL_128_2LONG
#include "parallel_128_2long.h"
#elif PARALLEL_MODE==PARALLEL_128_2MMX
#include "parallel_128_2mmx.h"
#elif PARALLEL_MODE==PARALLEL_128_SSE
#include "parallel_128_sse.h"
#elif PARALLEL_MODE==PARALLEL_128_SSE2
#include "parallel_128_sse2.h"
#else
#error "unknown/undefined parallel mode"
#endif

// stuff depending on conditionals

// Sizes derived from GROUP_PARALLELISM, which is not defined in this file
// (presumably it comes from the parallel_*.h header selected above).
// BYPG and BIPG are short aliases for the byte and bit counts.
#define BYTES_PER_GROUP (GROUP_PARALLELISM/8)
#define BYPG BYTES_PER_GROUP
#define BITS_PER_GROUP GROUP_PARALLELISM
#define BIPG BITS_PER_GROUP

// Overridable hooks: each one is defined here only if nothing defined it
// earlier. MALLOC/FREE default to the C library allocator; MEMALIGN defaults
// to an empty expansion, i.e. no alignment qualifier on the declarations
// that use it.
#ifndef MALLOC
#define MALLOC(X) malloc(X)
#endif
#ifndef FREE
#define FREE(X) free(X)
#endif
#ifndef MEMALIGN
#define MEMALIGN
#endif

//// debug tool

#if 0
// Hex-dumps len bytes starting at p to stderr, linelen bytes per output line.
// Every line starts with the caller-supplied tag and the offset of its first
// byte; extra blanks are inserted before offsets that are multiples of 4
// and of 8. A final newline is written only when the last line is complete
// (or when nothing was printed at all).
static void dump_mem(const char *string, const unsigned char *p, int len, int linelen){
  int pos=0;
  while(pos<len){
    if(pos%linelen!=0){
      // inside a line: visual grouping
      if(pos%8==0) fprintf(stderr," ");
      if(pos%4==0) fprintf(stderr," ");
    }
    else{
      // start of a line: close the previous one (if any) and print the prefix
      if(pos!=0) fprintf(stderr,"\n");
      fprintf(stderr,"%s %08x:",string,pos);
    }
    fprintf(stderr," %02x",p[pos]);
    pos++;
  }
  if(pos%linelen==0) fprintf(stderr,"\n");
}
#endif

//////////////////////////////////////////////////////////////////////////////////

// One control word together with everything schedule_key() precomputes
// from it for the stream and block stages.
struct csa_key_t{
// raw 8-byte control word, copied in by schedule_key()
	unsigned char ck[8];
// used by stream
// 4-bit values extracted from ck by key_schedule_stream()
        int iA[8];  // iA[0] is for A1, iA[7] is for A8
        int iB[8];  // iB[0] is for B1, iB[7] is for B8
// used by stream (group)
// one `group` per key bit, set by schedule_key() to FF1() when the bit is
// set and to FF0() when it is clear
        MEMALIGN group ck_g[8][8]; // [byte][bit:0=LSB,7=MSB]
        MEMALIGN group iA_g[8][4]; // [0 for A1][0 for LSB]
        MEMALIGN group iB_g[8][4]; // [0 for B1][0 for LSB]
// used by block
// 56-byte schedule produced by key_schedule_block()
	unsigned char kk[56];
// used by block (group)
// schedule_key() fills the first BYTES_PER_BATCH bytes of kkmulti[i]
// with copies of kk[i]
	MEMALIGN batch kkmulti[56]; // many times the same byte in every batch
};

// Key pair handed out (as void *) by get_key_struct().
// decrypt_packets() uses `even` for packets whose pkt[3]&0xc0 is 0x80 and
// `odd` for packets where it is 0xc0.
struct csa_keys_t{
  struct csa_key_t even;
  struct csa_key_t odd;
};

//-----stream cypher

//-----key schedule for stream decypher
// Splits the 8-byte key into sixteen 4-bit values for the stream stage.
// Bytes 0..3 feed iA and bytes 4..7 feed iB; within each byte the high
// nibble comes first (even index) and the low nibble second (odd index).
static void key_schedule_stream(
  unsigned char *ck,    // [In]  ck[0]-ck[7]   8 bytes   | Key.
  int *iA,              // [Out] iA[0]-iA[7]   8 nibbles | Key schedule.
  int *iB)              // [Out] iB[0]-iB[7]   8 nibbles | Key schedule.
{
    int idx;

    // first half of the key
    for(idx=0;idx<8;idx++){
        const int shift=(idx&1)?0:4;   // even index: high nibble, odd index: low nibble
        iA[idx]=(ck[idx>>1]>>shift)&0xf;
    }
    // second half of the key
    for(idx=0;idx<8;idx++){
        const int shift=(idx&1)?0:4;
        iB[idx]=(ck[4+(idx>>1)]>>shift)&0xf;
    }
}

//----- stream main function

#define STREAM_INIT
#include "stream.c"
#undef STREAM_INIT

#define STREAM_NORMAL
#include "stream.c"
#undef STREAM_NORMAL


//-----block decypher

//-----key schedule for block decypher

// Expands the 8-byte key into the 56-byte schedule used by the block stage.
//
// Seven 8-byte stages are built: stage 6 is the key itself and every lower
// stage is the 64-bit permutation (key_perm) of the stage above it. Bits are
// numbered MSB first inside each byte. Finally each byte of stage s is
// xored with s and stored at kk[8*s .. 8*s+7].
static void key_schedule_block(
  unsigned char *ck,    // [In]  ck[0]-ck[7]   8 bytes | Key.
  unsigned char *kk)    // [Out] kk[0]-kk[55] 56 bytes | Key schedule.
{
  // destination position (1-based) for every source bit position
  static const unsigned char key_perm[0x40] = {
    0x12,0x24,0x09,0x07,0x2A,0x31,0x1D,0x15, 0x1C,0x36,0x3E,0x32,0x13,0x21,0x3B,0x40,
    0x18,0x14,0x25,0x27,0x02,0x35,0x1B,0x01, 0x22,0x04,0x0D,0x0E,0x39,0x28,0x1A,0x29,
    0x33,0x23,0x34,0x0C,0x16,0x30,0x1E,0x3A, 0x2D,0x1F,0x08,0x19,0x17,0x2F,0x3D,0x11,
    0x3C,0x05,0x38,0x2B,0x0B,0x06,0x0A,0x2C, 0x20,0x3F,0x2E,0x0F,0x03,0x26,0x10,0x37,
  };

  int stage[7][8];
  int permuted[64];
  int level,byte_no,pos;

  // top stage is the key itself
  for(byte_no=0;byte_no<8;byte_no++){
    stage[6][byte_no]=ck[byte_no];
  }

  // derive stages 5..0, each from the one above
  for(level=5;level>=0;level--){
    // scatter the 64 source bits to their permuted positions
    for(pos=0;pos<64;pos++){
      const int src_bit=(stage[level+1][pos>>3]>>(7-(pos&7)))&1;
      permuted[key_perm[pos]-1]=src_bit;
    }
    // repack the permuted bits into bytes, MSB first
    for(byte_no=0;byte_no<8;byte_no++){
      int acc=0;
      for(pos=0;pos<8;pos++){
        acc|=permuted[byte_no*8+pos]<<(7-pos);
      }
      stage[level][byte_no]=acc;
    }
  }

  // xor every byte with its stage number to give kk
  for(pos=0;pos<56;pos++){
    kk[pos]=stage[pos>>3][pos&7]^(pos>>3);
  }
}

//-----block utils

// Loads `count` 8-byte items into the working area and rearranges them so
// that the block rounds can operate on rows of GROUP_PARALLELISM bytes.
//
// NOTE: despite the parameter names, `in` is the area that is WRITTEN (and
// then shuffled in place) while `out` is only READ.
// The area is treated as 8 rows of INTS_PER_ROW ints. trasp_8_N() undoes the
// rearrangement by running the same exchange passes in the opposite order.
static inline __attribute__((always_inline)) void trasp_N_8 (unsigned char *in,unsigned char* out,int count){
  int *rows=(int *)in;
  int *src=(int *)out;
  int top,col,n;
  // gather: first int of every item goes to the start of the area,
  // second int goes GROUP_PARALLELISM ints further on
  for(n=0;n<count;n++){
    rows[n]=src[2*n];
    rows[GROUP_PARALLELISM+n]=src[2*n+1];
  }
#define INTS_PER_ROW (GROUP_PARALLELISM/8*2)
  // pass 1: swap 16-bit halves between row pairs (0,2) (1,3) (4,6) (5,7)
  for(top=0;top<8;top++){
    int *ra,*rb;
    if(top&2) continue;   // rows 2,3,6,7 only take part as partners
    ra=rows+INTS_PER_ROW*top;
    rb=rows+INTS_PER_ROW*(top+2);
    for(col=0;col<INTS_PER_ROW;col++){
      const unsigned int t=ra[col];
      const unsigned int b=rb[col];
      ra[col]=(t&0x0000ffff)|(b<<16);
      rb[col]=(t>>16)|(b&0xffff0000);
    }
  }
  // pass 2: swap alternate bytes between row pairs (0,1) (2,3) (4,5) (6,7)
  for(top=0;top<8;top+=2){
    int *ra=rows+INTS_PER_ROW*top;
    int *rb=rows+INTS_PER_ROW*(top+1);
    for(col=0;col<INTS_PER_ROW;col++){
      const unsigned int t=ra[col];
      const unsigned int b=rb[col];
      ra[col]=(t&0x00ff00ff)|((b&0x00ff00ff)<<8);
      rb[col]=((t&0xff00ff00)>>8)|(b&0xff00ff00);
    }
  }
}

// Reverse of trasp_N_8(): shuffles the 8-row working area at `in` back into
// item order (in place, so `in` is clobbered) and then writes `count`
// 8-byte items to `out`.
// The two exchange passes are the ones used by trasp_N_8(), run in the
// opposite order; each pass undoes itself when applied twice.
static inline __attribute__((always_inline)) void trasp_8_N (unsigned char *in,unsigned char* out,int count){
  int *rows=(int *)in;
  int *dst=(int *)out;
  int top,col,n;
#define INTS_PER_ROW (GROUP_PARALLELISM/8*2)
  // pass 1: swap alternate bytes between row pairs (0,1) (2,3) (4,5) (6,7)
  for(top=0;top<8;top+=2){
    int *ra=rows+INTS_PER_ROW*top;
    int *rb=rows+INTS_PER_ROW*(top+1);
    for(col=0;col<INTS_PER_ROW;col++){
      const unsigned int t=ra[col];
      const unsigned int b=rb[col];
      ra[col]=(t&0x00ff00ff)|((b&0x00ff00ff)<<8);
      rb[col]=((t&0xff00ff00)>>8)|(b&0xff00ff00);
    }
  }
  // pass 2: swap 16-bit halves between row pairs (0,2) (1,3) (4,6) (5,7)
  for(top=0;top<8;top++){
    int *ra,*rb;
    if(top&2) continue;   // rows 2,3,6,7 only take part as partners
    ra=rows+INTS_PER_ROW*top;
    rb=rows+INTS_PER_ROW*(top+2);
    for(col=0;col<INTS_PER_ROW;col++){
      const unsigned int t=ra[col];
      const unsigned int b=rb[col];
      ra[col]=(t&0x0000ffff)|(b<<16);
      rb[col]=(t>>16)|(b&0xffff0000);
    }
  }
  // scatter: rebuild every item from its two ints
  for(n=0;n<count;n++){
    dst[2*n]=rows[n];
    dst[2*n+1]=rows[GROUP_PARALLELISM+n];
  }
}

//-----block main function

// block group
// Runs the 56 block-decipher rounds on a whole group of 8-byte blocks.
//
// The blocks are first rearranged (trasp_N_8) into a working buffer `r`
// that holds 8 "register" rows of GROUP_PARALLELISM bytes starting at
// r+roff. Every round moves roff back by one row instead of shifting the
// registers, which is why `r` is 56 rows larger than the 8 live rows.
// After the last round the rows are rearranged back (trasp_8_N) into `bd`.
//
// `count` is the number of blocks actually present in ib/bd. It is only used
// by the two rearrangement steps; the rounds themselves always process all
// GROUP_PARALLELISM lanes (count_all), whatever the unused lanes contain.
static void block_decypher_group(
  batch *kkmulti,       // [In]  kkmulti[0]-kkmulti[55] 56 batches | Key schedule (each batch has repeated equal bytes).
  unsigned char *ib,    // [In]  (ib0,ib1,...ib7)...x32 32*8 bytes | Initialization vector.
  unsigned char *bd,    // [Out] (bd0,bd1,...bd7)...x32 32*8 bytes | Block decipher.
  int count)
{
  // int is faster than unsigned char. apparently not
  // byte substitution table used once per lane in every round
  static const unsigned char block_sbox[0x100] = {
    0x3A,0xEA,0x68,0xFE,0x33,0xE9,0x88,0x1A, 0x83,0xCF,0xE1,0x7F,0xBA,0xE2,0x38,0x12,
    0xE8,0x27,0x61,0x95,0x0C,0x36,0xE5,0x70, 0xA2,0x06,0x82,0x7C,0x17,0xA3,0x26,0x49,
    0xBE,0x7A,0x6D,0x47,0xC1,0x51,0x8F,0xF3, 0xCC,0x5B,0x67,0xBD,0xCD,0x18,0x08,0xC9,
    0xFF,0x69,0xEF,0x03,0x4E,0x48,0x4A,0x84, 0x3F,0xB4,0x10,0x04,0xDC,0xF5,0x5C,0xC6,
    0x16,0xAB,0xAC,0x4C,0xF1,0x6A,0x2F,0x3C, 0x3B,0xD4,0xD5,0x94,0xD0,0xC4,0x63,0x62,
    0x71,0xA1,0xF9,0x4F,0x2E,0xAA,0xC5,0x56, 0xE3,0x39,0x93,0xCE,0x65,0x64,0xE4,0x58,
    0x6C,0x19,0x42,0x79,0xDD,0xEE,0x96,0xF6, 0x8A,0xEC,0x1E,0x85,0x53,0x45,0xDE,0xBB,
    0x7E,0x0A,0x9A,0x13,0x2A,0x9D,0xC2,0x5E, 0x5A,0x1F,0x32,0x35,0x9C,0xA8,0x73,0x30,

    0x29,0x3D,0xE7,0x92,0x87,0x1B,0x2B,0x4B, 0xA5,0x57,0x97,0x40,0x15,0xE6,0xBC,0x0E,
    0xEB,0xC3,0x34,0x2D,0xB8,0x44,0x25,0xA4, 0x1C,0xC7,0x23,0xED,0x90,0x6E,0x50,0x00,
    0x99,0x9E,0x4D,0xD9,0xDA,0x8D,0x6F,0x5F, 0x3E,0xD7,0x21,0x74,0x86,0xDF,0x6B,0x05,
    0x8E,0x5D,0x37,0x11,0xD2,0x28,0x75,0xD6, 0xA7,0x77,0x24,0xBF,0xF0,0xB0,0x02,0xB7,
    0xF8,0xFC,0x81,0x09,0xB1,0x01,0x76,0x91, 0x7D,0x0F,0xC8,0xA0,0xF2,0xCB,0x78,0x60,
    0xD1,0xF7,0xE0,0xB5,0x98,0x22,0xB3,0x20, 0x1D,0xA6,0xDB,0x7B,0x59,0x9F,0xAE,0x31,
    0xFB,0xD3,0xB6,0xCA,0x43,0x72,0x07,0xF4, 0xD8,0x41,0x14,0x55,0x0D,0x54,0x8B,0xB9,
    0xAD,0x46,0x0B,0xAF,0x80,0x52,0x2C,0xFA, 0x8C,0x89,0x66,0xFD,0xB2,0xA9,0x9B,0xC0,
  };
  MEMALIGN unsigned char r[GROUP_PARALLELISM*(8+56)];  /* 56 because we will move back in memory while looping */
  // per-round scratch rows, one byte per lane
  MEMALIGN unsigned char sbox_in[GROUP_PARALLELISM],sbox_out[GROUP_PARALLELISM],perm_out[GROUP_PARALLELISM];
  int roff;
  int i,g,count_all=GROUP_PARALLELISM;

  // the 8 live rows start at the far end of r and walk towards its start
  roff=GROUP_PARALLELISM*56;

  // FASTTRASP1 selects the int-based rearrangement; the byte-wise loop in
  // the #ifndef branch is the plain alternative (row j, lane g = byte j of
  // block g). NOTE(review): the fast path appears to place lanes in a
  // different order than the plain loop, so the two branches should not be
  // mixed with the opposite choice for FASTTRASP2 below - confirm.
#define FASTTRASP1
#ifndef FASTTRASP1
  for(g=0;g<count;g++){
    // Init registers
    int j;
    for(j=0;j<8;j++){
      r[roff+GROUP_PARALLELISM*j+g]=ib[8*g+j];
    }
  }
#else
  trasp_N_8((unsigned char *)&r[roff],(unsigned char *)ib,count);
#endif
//dump_mem("OLD r[roff]",&r[roff],GROUP_PARALLELISM*8,GROUP_PARALLELISM);

  // loop over kk[55]..kk[0]
  for(i=55;i>=0;i--){
    // S-box input: register row 6 combined with this round's key byte
    // (B_FFXOR is defined in the parallel_* headers; a batch XOR, judging
    // by its name)
    {
      MEMALIGN batch tkkmulti=kkmulti[i];
      batch *si=(batch *)sbox_in;
      batch *r6_N=(batch *)(r+roff+GROUP_PARALLELISM*6);
      for(g=0;g<count_all/BYTES_PER_BATCH;g++){
        si[g]=B_FFXOR(tkkmulti,r6_N[g]);              //FIXME: introduce FASTBATCH?
      }
    }

    // table lookup, this works on only one byte at a time
    // most difficult part of all
    // - can't be parallelized
    // - can't be synthesized through boolean terms (8 input bits are too many)
    for(g=0;g<count_all;g++){
      sbox_out[g]=block_sbox[sbox_in[g]];
    }

    // bit permutation
    // Built one batch at a time from six masked and shifted copies of the
    // S-box output. The mask constants and the per-byte shift macros
    // (B_FFN_ALL_xx, B_FFSH8L, B_FFSH8R) come from the parallel_* headers.
    {
      unsigned char *po=(unsigned char *)perm_out;
      unsigned char *so=(unsigned char *)sbox_out;
//dump_mem("pre perm ",(unsigned char *)so,GROUP_PARALLELISM,GROUP_PARALLELISM);
      for(g=0;g<count_all;g+=BYTES_PER_BATCH){
        MEMALIGN batch in,out;
        in=*(batch *)&so[g];

        out=B_FFOR(
	    B_FFOR(
	    B_FFOR(
	    B_FFOR(
	    B_FFOR(
	           B_FFSH8L(B_FFAND(in,B_FFN_ALL_29()),1),
	           B_FFSH8L(B_FFAND(in,B_FFN_ALL_02()),6)),
	           B_FFSH8L(B_FFAND(in,B_FFN_ALL_04()),3)),
	           B_FFSH8R(B_FFAND(in,B_FFN_ALL_10()),2)),
	           B_FFSH8R(B_FFAND(in,B_FFN_ALL_40()),6)),
	           B_FFSH8R(B_FFAND(in,B_FFN_ALL_80()),4));

        *(batch *)&po[g]=out;
      }
//dump_mem("post perm",(unsigned char *)po,GROUP_PARALLELISM,GROUP_PARALLELISM);
    }

    roff-=GROUP_PARALLELISM; /* virtual shift of registers */

    // Register update. The disabled branch spells the update out byte by
    // byte: new row 0 = old row 7 (now at row 8) ^ sbox_out, row 6 ^= perm_out,
    // rows 4, 3 and 2 ^= new row 0. The active branch does the same row
    // operations BEST_SPAN bytes at a time through the XOR*_BEST_BY macros
    // (presumably equivalent; they are defined in the parallel_* headers).
#if 0
/* one by one */
    for(g=0;g<count_all;g++){
      r[roff+GROUP_PARALLELISM*0+g]=r[roff+GROUP_PARALLELISM*8+g]^sbox_out[g];
      r[roff+GROUP_PARALLELISM*6+g]^=perm_out[g];
      r[roff+GROUP_PARALLELISM*4+g]^=r[roff+GROUP_PARALLELISM*0+g];
      r[roff+GROUP_PARALLELISM*3+g]^=r[roff+GROUP_PARALLELISM*0+g];
      r[roff+GROUP_PARALLELISM*2+g]^=r[roff+GROUP_PARALLELISM*0+g];
    }
#else
    for(g=0;g<count_all;g+=BEST_SPAN){
      XOR_BEST_BY(&r[roff+GROUP_PARALLELISM*0+g],&r[roff+GROUP_PARALLELISM*8+g],&sbox_out[g]);
      XOREQ_BEST_BY(&r[roff+GROUP_PARALLELISM*6+g],&perm_out[g]);
      XOREQ_BEST_BY(&r[roff+GROUP_PARALLELISM*4+g],&r[roff+GROUP_PARALLELISM*0+g]);
      XOREQ_BEST_BY(&r[roff+GROUP_PARALLELISM*3+g],&r[roff+GROUP_PARALLELISM*0+g]);
      XOREQ_BEST_BY(&r[roff+GROUP_PARALLELISM*2+g],&r[roff+GROUP_PARALLELISM*0+g]);
    }
#endif
  }

  // after 56 rounds roff is 0; move the rows back into block order
#define FASTTRASP2
#ifndef FASTTRASP2
  for(g=0;g<count;g++){
    // Copy results
    int j;
    for(j=0;j<8;j++){
      bd[8*g+j]=r[roff+GROUP_PARALLELISM*j+g];
    }
  }
#else
  trasp_8_N((unsigned char *)&r[roff],(unsigned char *)bd,count);
#endif
}

//-----------------------------------EXTERNAL INTERFACE

//-----get internal parallelism

// Reports how many packets one decrypt_packets() call can process together.
int get_internal_parallelism(void){
  const int lanes=GROUP_PARALLELISM;
  return lanes;
}

//-----get suggested cluster size

// Suggests how many packets to hand to decrypt_packets() per call: the
// internal parallelism plus one tenth of it as slack, with the slack never
// smaller than 5 packets.
int get_suggested_cluster_size(void){
  const int lanes=GROUP_PARALLELISM;
  int slack=lanes/10;
  if(slack<5){
    slack=5;
  }
  return lanes+slack;
}

//-----key structure

// Allocates a key pair (struct csa_keys_t) with MALLOC and schedules both
// control words as all-zero keys.
// Returns the new object as an opaque pointer, or the null pointer returned
// by MALLOC if the allocation failed. Release it with free_key_struct().
void *get_key_struct(void){
  static const unsigned char zero_cw[8] = { 0,0,0,0,0,0,0,0 };
  struct csa_keys_t *keys=(struct csa_keys_t *)MALLOC(sizeof(struct csa_keys_t));
  if(!keys){
    // allocation failed: hand the null pointer back unchanged
    return keys;
  }
  set_control_words(keys,zero_cw,zero_cw);
  return keys;
}

// Releases a key pair obtained from get_key_struct().
// The pointer is passed straight to FREE; with the default FREE (free) a
// null pointer is accepted and ignored.
void free_key_struct(void *keys){
  // Plain statement rather than `return FREE(keys);`: a return statement with
  // an expression is not allowed in a void function in C, and it would also
  // break if FREE were overridden with a statement-like macro.
  FREE(keys);
}

//-----set control words

// Stores the 8-byte control word pk in *key and recomputes everything the
// stream and block stages derive from it.
// Not optimised: it only runs when a control word changes.
static void schedule_key(struct csa_key_t *key, const unsigned char *pk){
  int slot,bit_no,rep;

  // keep a private copy of the control word
  memcpy(key->ck,pk,8);

  // stream stage: nibble schedule ...
  key_schedule_stream(key->ck,key->iA,key->iB);

  // ... then one group per key bit, all-ones or all-zeros
  for(slot=0;slot<8;slot++){
    for(bit_no=0;bit_no<8;bit_no++){
      if(key->ck[slot]&(1<<bit_no)){
        key->ck_g[slot][bit_no]=FF1();
      }
      else{
        key->ck_g[slot][bit_no]=FF0();
      }
    }
  }

  // ... and one group per nibble bit of iA and iB
  for(slot=0;slot<8;slot++){
    for(bit_no=0;bit_no<4;bit_no++){
      if(key->iA[slot]&(1<<bit_no)){
        key->iA_g[slot][bit_no]=FF1();
      }
      else{
        key->iA_g[slot][bit_no]=FF0();
      }
      if(key->iB[slot]&(1<<bit_no)){
        key->iB_g[slot][bit_no]=FF1();
      }
      else{
        key->iB_g[slot][bit_no]=FF0();
      }
    }
  }

  // block stage: 56-byte schedule, then every schedule byte replicated
  // across the bytes of its batch
  key_schedule_block(key->ck,key->kk);
  for(slot=0;slot<56;slot++){
    unsigned char *dst=(unsigned char *)&key->kkmulti[slot];
    for(rep=0;rep<BYTES_PER_BATCH;rep++){
      dst[rep]=key->kk[slot];
    }
  }
}

// Installs both control words at once: ev becomes the even key and od the
// odd key of the pair behind `keys` (a pointer from get_key_struct()).
// Each control word is 8 bytes.
void set_control_words(void *keys, const unsigned char *ev, const unsigned char *od){
  struct csa_keys_t *pair=(struct csa_keys_t *)keys;
  schedule_key(&pair->even,ev);
  schedule_key(&pair->odd,od);
}

// Replaces only the even key of the pair behind `keys` with the 8-byte
// control word pk; the odd key is left untouched.
void set_even_control_word(void *keys, const unsigned char *pk){
  struct csa_keys_t *pair=(struct csa_keys_t *)keys;
  schedule_key(&pair->even,pk);
}

// Replaces only the odd key of the pair behind `keys` with the 8-byte
// control word pk; the even key is left untouched.
void set_odd_control_word(void *keys, const unsigned char *pk){
  struct csa_keys_t *pair=(struct csa_keys_t *)keys;
  schedule_key(&pair->odd,pk);
}

//-----get control words
#if 0
// Copies the two stored 8-byte control words out of the pair behind `keys`
// into the caller's buffers `even` and `odd`.
void get_control_words(void *keys, unsigned char *even, unsigned char *odd){
  struct csa_keys_t *pair=(struct csa_keys_t *)keys;
  memcpy(even,pair->even.ck,8);
  memcpy(odd,pair->odd.ck,8);
}
#endif

//----- decrypt

// Decrypts, in place, up to GROUP_PARALLELISM scrambled 188-byte packets
// taken from the ranges listed in `cluster`.
//
// keys    : key pair from get_key_struct() (struct csa_keys_t).
// cluster : NULL-terminated array of pointer pairs; every pair delimits one
//           range [start,end) of consecutive 188-byte packets. The array is
//           modified: range starts are moved past packets that need no
//           further attention, empty ranges are removed and the list is
//           compacted.
// returns : the number of packets the range starts were advanced past.
//
// Outline:
//  1. Scan the ranges and collect packets into a group. All packets of a
//     group use the same key (even or odd, decided by the first scrambled
//     packet found). pkt[3]&0xc0 selects: 0x00 clear, 0x40 reserved (both
//     skipped), 0x80 even key, 0xc0 odd key. The scrambling bits of grouped
//     packets are cleared immediately. Once a packet with the other key is
//     met, range starts are no longer advanced, but later packets with the
//     right key are still added to the group.
//  2. Sort the group by number of 8-byte blocks (n), longest first, and
//     build alive[i] = number of packets with more than i blocks.
//  3. Run the stream and block stages iteration by iteration, each
//     iteration working on the packets that are still alive.
int decrypt_packets(void *keys, unsigned char **cluster){
  // statistics, currently unused
  int stat_no_scramble=0;
  int stat_reserved=0;
  int stat_decrypted[2]={0,0};
  int stat_decrypted_mini=0;
  unsigned char **clst;
  unsigned char **clst2;
  int grouped;
  int group_ev_od;
  int advanced;
  int can_advance;
  // per-packet bookkeeping for the current group
  unsigned char *g_pkt[GROUP_PARALLELISM];
  int g_len[GROUP_PARALLELISM];
  int g_offset[GROUP_PARALLELISM];
  int g_n[GROUP_PARALLELISM];
  int g_residue[GROUP_PARALLELISM];
  unsigned char *pkt;
  int xc0,ev_od,len,offset,n,residue;
  struct csa_key_t* k;
  int i,j,iter,g;
  int t23,tsmall;
  int alive[24];
//icc craziness  int pad1=0; //////////align! FIXME
  unsigned char *encp[GROUP_PARALLELISM];
  unsigned char stream_in[GROUP_PARALLELISM*8];
  unsigned char stream_out[GROUP_PARALLELISM*8];
  MEMALIGN unsigned char ib[GROUP_PARALLELISM*8];
  MEMALIGN unsigned char block_out[GROUP_PARALLELISM*8];
  struct stream_regs regs;

//icc craziness  i=(int)&pad1;//////////align!!! FIXME

  // build a list of packets to be processed
  clst=cluster;
  grouped=0;
  advanced=0;
  can_advance=1;
  group_ev_od=-1; // silence incorrect compiler warning
  pkt=*clst;
  do{ // find a new packet
    if(grouped==GROUP_PARALLELISM){
      // full
      break;
    }
    if(pkt==NULL){
      // no more ranges
      break;
    }
    if(pkt>=*(clst+1)){
      // out of this range, try next
      clst++;clst++;
      pkt=*clst;
      continue;
    }

    do{ // handle this packet
      xc0=pkt[3]&0xc0;
      DBG(fprintf(stderr,"   exam pkt=%p, xc0=%02x, can_adv=%i\n",pkt,xc0,can_advance));
      if(xc0==0x00){
        DBG(fprintf(stderr,"skip clear pkt %p (can_advance is %i)\n",pkt,can_advance));
        advanced+=can_advance;
        stat_no_scramble++;
        break;
      }
      if(xc0==0x40){
        DBG(fprintf(stderr,"skip reserved pkt %p (can_advance is %i)\n",pkt,can_advance));
        advanced+=can_advance;
        stat_reserved++;
        break;
      }
      if(xc0==0x80||xc0==0xc0){ // encrypted
        ev_od=(xc0&0x40)>>6; // 0 even, 1 odd
        if(grouped==0) group_ev_od=ev_od; // this group will be all even (or odd)
        if(group_ev_od==ev_od){ // could be added to group
          pkt[3]&=0x3f;  // consider it decrypted now
          if(pkt[3]&0x20){ // incomplete packet
            // payload starts after the 4-byte header, the length byte
            // pkt[4] and the pkt[4] bytes that follow it
            offset=4+pkt[4]+1;
            len=188-offset;
            n=len>>3;
            residue=len-(n<<3);
            // Fewer than 8 payload bytes means there is no complete block
            // to decrypt. This test also rejects a negative len, which
            // happens when pkt[4] is larger than 183 (corrupt packet): the
            // former `n==0` test let such packets through and the negative
            // n was later used to index alive[] out of bounds.
            if(len<8){ // decrypted==encrypted!
              DBG(fprintf(stderr,"DECRYPTED MINI! (can_advance is %i)\n",can_advance));
              advanced+=can_advance;
              stat_decrypted_mini++;
              break; // this doesn't need more processing
            }
          }else{
            // full payload: 184 bytes = 23 blocks, no residue
            len=184;
            offset=4;
            n=23;
            residue=0;
          }
          g_pkt[grouped]=pkt;
          g_len[grouped]=len;
          g_offset[grouped]=offset;
          g_n[grouped]=n;
          g_residue[grouped]=residue;
          DBG(fprintf(stderr,"%2i: eo=%i pkt=%p len=%03i n=%2i residue=%i\n",grouped,ev_od,pkt,len,n,residue));
          grouped++;
          advanced+=can_advance;
          stat_decrypted[ev_od]++;
        }
        else{
          // other key: leave it for a later call and stop advancing the
          // range start so that it is seen again
          can_advance=0;
          DBG(fprintf(stderr,"skip pkt %p and can_advance set to 0\n",pkt));
          break; // skip and go on
        }
      }
    } while(0);

    if(can_advance){
      // move range start forward
      *clst+=188;
    }
    // next packet, if there is one
    pkt+=188;
  } while(1);
  DBG(fprintf(stderr,"-- result: grouped %i pkts, advanced %i pkts\n",grouped,advanced));

  // delete empty ranges and compact list
  clst2=cluster;
  for(clst=cluster;*clst!=NULL;clst+=2){
    // if not empty
    if(*clst<*(clst+1)){
      // it will remain
      *clst2=*clst;
      *(clst2+1)=*(clst+1);
      clst2+=2;
    }
  }
  *clst2=NULL;

  if(grouped==0){
    // no processing needed
    return advanced;
  }

  //  sort them, longest payload first
  //  we expect many n=23 packets and a few n<23
  DBG(fprintf(stderr,"PRESORTING\n"));
  for(i=0;i<grouped;i++){
    DBG(fprintf(stderr,"%2i of %2i: pkt=%p len=%03i n=%2i residue=%i\n",i,grouped,g_pkt[i],g_len[i],g_n[i],g_residue[i]));
    }
  // grouped is always <= GROUP_PARALLELISM

  // swaps all bookkeeping entries of group members a and b
  // (uses pkt, len, offset, n and residue as temporaries)
#define g_swap(a,b) \
    pkt=g_pkt[a]; \
    g_pkt[a]=g_pkt[b]; \
    g_pkt[b]=pkt; \
\
    len=g_len[a]; \
    g_len[a]=g_len[b]; \
    g_len[b]=len; \
\
    offset=g_offset[a]; \
    g_offset[a]=g_offset[b]; \
    g_offset[b]=offset; \
\
    n=g_n[a]; \
    g_n[a]=g_n[b]; \
    g_n[b]=n; \
\
    residue=g_residue[a]; \
    g_residue[a]=g_residue[b]; \
    g_residue[b]=residue;

  // step 1: move n=23 packets before small packets
  // t23 walks up to the first small packet, tsmall walks down to the last
  // n=23 packet; they are swapped until the two indexes meet
  t23=0;
  tsmall=grouped-1;
  for(;;){
    for(;t23<grouped;t23++){
      if(g_n[t23]!=23) break;
    }
DBG(fprintf(stderr,"t23 after for =%i\n",t23));

    for(;tsmall>=0;tsmall--){
      if(g_n[tsmall]==23) break;
    }
DBG(fprintf(stderr,"tsmall after for =%i\n",tsmall));

    if(tsmall-t23<1) break;

DBG(fprintf(stderr,"swap t23=%i,tsmall=%i\n",t23,tsmall));

    g_swap(t23,tsmall);

    t23++;
    tsmall--;
DBG(fprintf(stderr,"new t23=%i,tsmall=%i\n\n",t23,tsmall));
  }
  DBG(fprintf(stderr,"packets with n=23, t23=%i   grouped=%i\n",t23,grouped));
  DBG(fprintf(stderr,"MIDSORTING\n"));
  for(i=0;i<grouped;i++){
    DBG(fprintf(stderr,"%2i of %2i: pkt=%p len=%03i n=%2i residue=%i\n",i,grouped,g_pkt[i],g_len[i],g_n[i],g_residue[i]));
    }

  // step 2: sort small packets in decreasing order of n (bubble sort is enough)
  for(i=t23;i<grouped;i++){
    for(j=i+1;j<grouped;j++){
      if(g_n[j]>g_n[i]){
        g_swap(i,j);
      }
    }
  }
  DBG(fprintf(stderr,"POSTSORTING\n"));
  for(i=0;i<grouped;i++){
    DBG(fprintf(stderr,"%2i of %2i: pkt=%p len=%03i n=%2i residue=%i\n",i,grouped,g_pkt[i],g_len[i],g_n[i],g_residue[i]));
    }

  // we need to know how many packets need 23 iterations, how many 22...
  // after the "integrate" step alive[i] is the number of packets with more
  // than i blocks; since the group is sorted these are packets 0..alive[i]-1
  for(i=0;i<=23;i++){
    alive[i]=0;
  }
  // count
  alive[23-1]=t23;
  for(i=t23;i<grouped;i++){
    alive[g_n[i]-1]++;   // g_n is 1..22 here: packets with no full block were never grouped
  }
  // integrate
  for(i=22;i>=0;i--){
    alive[i]+=alive[i+1];
  }
  DBG(fprintf(stderr,"ALIVE\n"));
  for(i=0;i<=23;i++){
    DBG(fprintf(stderr,"alive%2i=%i\n",i,alive[i]));
    }

  // choose key
  if(group_ev_od==0){
    k=&((struct csa_keys_t *)keys)->even;
  }
  else{
    k=&((struct csa_keys_t *)keys)->odd;
  }

  //INIT
//#define INITIALIZE_UNUSED_INPUT
#ifdef INITIALIZE_UNUSED_INPUT
// unnecessary zeroing.
// without this, we operate on uninitialized memory
// when grouped<GROUP_PARALLELISM, but it's not a problem,
// as final results will be discarded.
// random data makes debugging sessions difficult.
  for(j=0;j<GROUP_PARALLELISM*8;j++) stream_in[j]=0;
DBG(fprintf(stderr,"--- WARNING: you could gain speed by not initializing unused memory ---\n"));
#else
DBG(fprintf(stderr,"--- WARNING: DEBUGGING IS MORE DIFFICULT WHEN PROCESSING RANDOM DATA CHANGING AT EVERY RUN! ---\n"));
#endif

  // encp[g] tracks the current position inside packet g; the first 8
  // payload bytes of every packet feed the stream initialisation
  for(g=0;g<grouped;g++){
    encp[g]=g_pkt[g];
    DBG(fprintf(stderr,"header[%i]=%p (%02x)\n",g,encp[g],*(encp[g])));
    encp[g]+=g_offset[g]; // skip header
    FFTABLEIN(stream_in,g,encp[g]);
  }
//dump_mem("stream_in",stream_in,GROUP_PARALLELISM*8,BYPG);


  // ITER 0
DBG(fprintf(stderr,">>>>>ITER 0\n"));
  iter=0;
  stream_cypher_group_init(&regs,k->iA_g,k->iB_g,stream_in);
  // fill first ib
  for(g=0;g<alive[iter];g++){
    COPY_8_BY(ib+8*g,encp[g]);
  }
DBG(dump_mem("IB ",ib,8*alive[iter],8));
  // ITER 1..N-1
  for (iter=1;iter<23&&alive[iter-1]>0;iter++){
DBG(fprintf(stderr,">>>>>ITER %i\n",iter));
    // alive and just dead packets: calc block
    block_decypher_group(k->kkmulti,ib,block_out,alive[iter-1]);
DBG(dump_mem("BLO_ib ",block_out,8*alive[iter-1],8));
    // all packets (dead too): calc stream
    stream_cypher_group_normal(&regs,stream_out);
//dump_mem("stream_out",stream_out,GROUP_PARALLELISM*8,BYPG);

    // alive packets: calc ib
    // (next block input = stream output xor next 8 encrypted bytes)
    for(g=0;g<alive[iter];g++){
      FFTABLEOUT(ib+8*g,stream_out,g);
DBG(dump_mem("stream_out_ib ",ib+8*g,8,8));
// XOREQ8BY gcc bug? 2x4 ok, 8 ko    UPDATE: result ok but speed 1-2% slower (!!!???)
#if 1
      XOREQ_4_BY(ib+8*g,encp[g]+8);
      XOREQ_4_BY(ib+8*g+4,encp[g]+8+4);
#else
      XOREQ_8_BY(ib+8*g,encp[g]+8);
#endif
DBG(dump_mem("after_stream_xor_ib ",ib+8*g,8,8));
    }
    // alive packets: decrypt data
    for(g=0;g<alive[iter];g++){
DBG(dump_mem("before_ib_decrypt_data ",encp[g],8,8));
      XOR_8_BY(encp[g],ib+8*g,block_out+8*g);
DBG(dump_mem("after_ib_decrypt_data ",encp[g],8,8));
    }
    // just dead packets: write decrypted data
    for(g=alive[iter];g<alive[iter-1];g++){
DBG(dump_mem("jd_before_ib_decrypt_data ",encp[g],8,8));
      COPY_8_BY(encp[g],block_out+8*g);
DBG(dump_mem("jd_after_ib_decrypt_data ",encp[g],8,8));
    }
    // just dead packets: decrypt residue
    for(g=alive[iter];g<alive[iter-1];g++){
DBG(dump_mem("jd_before_decrypt_residue ",encp[g]+8,g_residue[g],g_residue[g]));
      FFTABLEOUTXORNBY(g_residue[g],encp[g]+8,stream_out,g);
DBG(dump_mem("jd_after_decrypt_residue ",encp[g]+8,g_residue[g],g_residue[g]));
    }
    // alive packets: pointers++
    for(g=0;g<alive[iter];g++) encp[g]+=8;
  };
  // ITER N
DBG(fprintf(stderr,">>>>>ITER 23\n"));
  iter=23;
  // calc block
  block_decypher_group(k->kkmulti,ib,block_out,alive[iter-1]);
DBG(dump_mem("23BLO_ib ",block_out,8*alive[iter-1],8));
  // just dead packets: write decrypted data
  for(g=alive[iter];g<alive[iter-1];g++){
DBG(dump_mem("23jd_before_ib_decrypt_data ",encp[g],8,8));
    COPY_8_BY(encp[g],block_out+8*g);
DBG(dump_mem("23jd_after_ib_decrypt_data ",encp[g],8,8));
  }
  // no residue possible
  // so do nothing

  DBG(fprintf(stderr,"returning advanced=%i\n",advanced));

  M_EMPTY(); // restore CPU multimedia state

  return advanced;
}