Imported Upstream version 4.51~dfsg.1

parent 107a0849
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
#define kCrcPoly 0xEDB88320 #define kCrcPoly 0xEDB88320
UInt32 g_CrcTable[256]; UInt32 g_CrcTable[256];
void MY_FAST_CALL CrcGenerateTable() void MY_FAST_CALL CrcGenerateTable(void)
{ {
UInt32 i; UInt32 i;
for (i = 0; i < 256; i++) for (i = 0; i < 256; i++)
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
extern UInt32 g_CrcTable[]; extern UInt32 g_CrcTable[];
void MY_FAST_CALL CrcGenerateTable(); void MY_FAST_CALL CrcGenerateTable(void);
#define CRC_INIT_VAL 0xFFFFFFFF #define CRC_INIT_VAL 0xFFFFFFFF
#define CRC_GET_DIGEST(crc) ((crc) ^ 0xFFFFFFFF) #define CRC_GET_DIGEST(crc) ((crc) ^ 0xFFFFFFFF)
......
...@@ -27,7 +27,7 @@ typedef unsigned int UInt32; ...@@ -27,7 +27,7 @@ typedef unsigned int UInt32;
#ifdef _SZ_NO_INT_64 #ifdef _SZ_NO_INT_64
typedef unsigned long UInt64; typedef unsigned long UInt64;
#else #else
#ifdef _MSC_VER #if defined(_MSC_VER) || defined(__BORLANDC__)
typedef unsigned __int64 UInt64; typedef unsigned __int64 UInt64;
#else #else
typedef unsigned long long int UInt64; typedef unsigned long long int UInt64;
...@@ -35,4 +35,17 @@ typedef unsigned long long int UInt64; ...@@ -35,4 +35,17 @@ typedef unsigned long long int UInt64;
#endif #endif
#endif #endif
/* #define _LZMA_NO_SYSTEM_SIZE_T */
/* You can use it, if you don't want <stddef.h> */
#ifndef _7ZIP_SIZET_DEFINED
#define _7ZIP_SIZET_DEFINED
#ifdef _LZMA_NO_SYSTEM_SIZE_T
typedef UInt32 SizeT;
#else
#include <stddef.h>
typedef size_t SizeT;
#endif
#endif
#endif #endif
...@@ -2,100 +2,83 @@ ...@@ -2,100 +2,83 @@
#include "BranchX86.h" #include "BranchX86.h"
/*
static int inline Test86MSByte(Byte b)
{
return (b == 0 || b == 0xFF);
}
*/
#define Test86MSByte(b) ((b) == 0 || (b) == 0xFF) #define Test86MSByte(b) ((b) == 0 || (b) == 0xFF)
const int kMaskToAllowedStatus[8] = {1, 1, 1, 0, 1, 0, 0, 0}; const Byte kMaskToAllowedStatus[8] = {1, 1, 1, 0, 1, 0, 0, 0};
const Byte kMaskToBitNumber[8] = {0, 1, 2, 2, 3, 3, 3, 3}; const Byte kMaskToBitNumber[8] = {0, 1, 2, 2, 3, 3, 3, 3};
/* SizeT x86_Convert(Byte *buffer, SizeT endPos, UInt32 nowPos, UInt32 *prevMaskMix, int encoding)
void x86_Convert_Init(UInt32 *prevMask, UInt32 *prevPos)
{
*prevMask = 0;
*prevPos = (UInt32)(-5);
}
*/
UInt32 x86_Convert(Byte *buffer, UInt32 endPos, UInt32 nowPos,
UInt32 *prevMask, UInt32 *prevPos, int encoding)
{ {
UInt32 bufferPos = 0; SizeT bufferPos = 0, prevPosT;
UInt32 limit; UInt32 prevMask = *prevMaskMix & 0x7;
if (endPos < 5) if (endPos < 5)
return 0; return 0;
nowPos += 5;
if (nowPos - *prevPos > 5) prevPosT = (SizeT)0 - 1;
*prevPos = nowPos - 5;
for(;;)
limit = endPos - 5;
while(bufferPos <= limit)
{ {
Byte b = buffer[bufferPos]; Byte *p = buffer + bufferPos;
UInt32 offset; Byte *limit = buffer + endPos - 4;
if (b != 0xE8 && b != 0xE9) for (; p < limit; p++)
{ if ((*p & 0xFE) == 0xE8)
bufferPos++; break;
continue; bufferPos = (SizeT)(p - buffer);
} if (p >= limit)
offset = (nowPos + bufferPos - *prevPos); break;
*prevPos = (nowPos + bufferPos); prevPosT = bufferPos - prevPosT;
if (offset > 5) if (prevPosT > 3)
*prevMask = 0; prevMask = 0;
else else
{ {
UInt32 i; prevMask = (prevMask << ((int)prevPosT - 1)) & 0x7;
for (i = 0; i < offset; i++) if (prevMask != 0)
{ {
*prevMask &= 0x77; Byte b = p[4 - kMaskToBitNumber[prevMask]];
*prevMask <<= 1; if (!kMaskToAllowedStatus[prevMask] || Test86MSByte(b))
{
prevPosT = bufferPos;
prevMask = ((prevMask << 1) & 0x7) | 1;
bufferPos++;
continue;
}
} }
} }
b = buffer[bufferPos + 4]; prevPosT = bufferPos;
if (Test86MSByte(b) && kMaskToAllowedStatus[(*prevMask >> 1) & 0x7] &&
(*prevMask >> 1) < 0x10) if (Test86MSByte(p[4]))
{ {
UInt32 src = UInt32 src = ((UInt32)p[4] << 24) | ((UInt32)p[3] << 16) | ((UInt32)p[2] << 8) | ((UInt32)p[1]);
((UInt32)(b) << 24) |
((UInt32)(buffer[bufferPos + 3]) << 16) |
((UInt32)(buffer[bufferPos + 2]) << 8) |
(buffer[bufferPos + 1]);
UInt32 dest; UInt32 dest;
for (;;) for (;;)
{ {
UInt32 index; Byte b;
int index;
if (encoding) if (encoding)
dest = (nowPos + bufferPos + 5) + src; dest = (nowPos + (UInt32)bufferPos) + src;
else else
dest = src - (nowPos + bufferPos + 5); dest = src - (nowPos + (UInt32)bufferPos);
if (*prevMask == 0) if (prevMask == 0)
break; break;
index = kMaskToBitNumber[*prevMask >> 1]; index = kMaskToBitNumber[prevMask] * 8;
b = (Byte)(dest >> (24 - index * 8)); b = (Byte)(dest >> (24 - index));
if (!Test86MSByte(b)) if (!Test86MSByte(b))
break; break;
src = dest ^ ((1 << (32 - index * 8)) - 1); src = dest ^ ((1 << (32 - index)) - 1);
} }
buffer[bufferPos + 4] = (Byte)(~(((dest >> 24) & 1) - 1)); p[4] = (Byte)(~(((dest >> 24) & 1) - 1));
buffer[bufferPos + 3] = (Byte)(dest >> 16); p[3] = (Byte)(dest >> 16);
buffer[bufferPos + 2] = (Byte)(dest >> 8); p[2] = (Byte)(dest >> 8);
buffer[bufferPos + 1] = (Byte)dest; p[1] = (Byte)dest;
bufferPos += 5; bufferPos += 5;
*prevMask = 0;
} }
else else
{ {
prevMask = ((prevMask << 1) & 0x7) | 1;
bufferPos++; bufferPos++;
*prevMask |= 1;
if (Test86MSByte(b))
*prevMask |= 0x10;
} }
} }
prevPosT = bufferPos - prevPosT;
*prevMaskMix = ((prevPosT > 3) ? 0 : ((prevMask << ((int)prevPosT - 1)) & 0x7));
return bufferPos; return bufferPos;
} }
...@@ -5,9 +5,8 @@ ...@@ -5,9 +5,8 @@
#include "BranchTypes.h" #include "BranchTypes.h"
#define x86_Convert_Init(prevMask, prevPos) { prevMask = 0; prevPos = (UInt32)(-5); } #define x86_Convert_Init(state) { state = 0; }
UInt32 x86_Convert(Byte *buffer, UInt32 endPos, UInt32 nowPos, SizeT x86_Convert(Byte *buffer, SizeT endPos, UInt32 nowPos, UInt32 *state, int encoding);
UInt32 *prevMask, UInt32 *prevPos, int encoding);
#endif #endif
This diff is collapsed.
...@@ -45,8 +45,9 @@ typedef struct _CMatchFinder ...@@ -45,8 +45,9 @@ typedef struct _CMatchFinder
HRes result; HRes result;
} CMatchFinder; } CMatchFinder;
#define Inline_MatchFinder_GetPointerToCurrentPos(p) ((p)->buffer + (p)->pos) #define Inline_MatchFinder_GetPointerToCurrentPos(p) ((p)->buffer)
#define Inline_MatchFinder_GetIndexByte(p, index) ((p)->buffer[(size_t)(p)->pos + (Int32)(index)]) #define Inline_MatchFinder_GetIndexByte(p, index) ((p)->buffer[(Int32)(index)])
#define Inline_MatchFinder_GetNumAvailableBytes(p) ((p)->streamPos - (p)->pos) #define Inline_MatchFinder_GetNumAvailableBytes(p) ((p)->streamPos - (p)->pos)
int MatchFinder_NeedMove(CMatchFinder *p); int MatchFinder_NeedMove(CMatchFinder *p);
......
...@@ -150,10 +150,10 @@ sometimes they use signed extending: (size_t)pos was compiled to "movsxd r10, ed ...@@ -150,10 +150,10 @@ sometimes they use signed extending: (size_t)pos was compiled to "movsxd r10, ed
*/ */
#define DEF_GetHeads(name, v) \ #define DEF_GetHeads(name, v) \
static void GetHeads ## name(const Byte *buffer, size_t pos, \ static void GetHeads ## name(const Byte *p, size_t pos, \
UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads) { \ UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads) { \
for (; numHeads != 0; numHeads--) { const Byte *p = buffer + (size_t)pos; \ for (; numHeads != 0; numHeads--) { \
const UInt32 value = (v); *heads++ = (UInt32)pos - hash[value]; hash[value] = (UInt32)(pos++); } } const UInt32 value = (v); p++; *heads++ = (UInt32)pos - hash[value]; hash[value] = (UInt32)(pos++); } }
DEF_GetHeads(2, (p[0] | ((UInt32)p[1] << 8)) & hashMask) DEF_GetHeads(2, (p[0] | ((UInt32)p[1] << 8)) & hashMask)
DEF_GetHeads(3, (g_CrcTable[p[0]] ^ p[1] ^ ((UInt32)p[2] << 8)) & hashMask) DEF_GetHeads(3, (g_CrcTable[p[0]] ^ p[1] ^ ((UInt32)p[2] << 8)) & hashMask)
...@@ -222,6 +222,7 @@ void HashThreadFunc(CMatchFinderMt *mt) ...@@ -222,6 +222,7 @@ void HashThreadFunc(CMatchFinderMt *mt)
heads[0] += num; heads[0] += num;
} }
mf->pos += num; mf->pos += num;
mf->buffer += num;
} }
} }
...@@ -252,7 +253,7 @@ void MatchFinderMt_GetNextBlock_Hash(CMatchFinderMt *p) ...@@ -252,7 +253,7 @@ void MatchFinderMt_GetNextBlock_Hash(CMatchFinderMt *p)
#endif #endif
#endif #endif
Int32 NO_INLINE GetMatchesSpecN(UInt32 lenLimit, UInt32 pos, const Byte *buffer, CLzRef *son, Int32 NO_INLINE GetMatchesSpecN(UInt32 lenLimit, UInt32 pos, const Byte *cur, CLzRef *son,
UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 _cutValue, UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 _cutValue,
UInt32 *_distances, UInt32 _maxLen, const UInt32 *hash, Int32 limit, UInt32 size, UInt32 *posRes) UInt32 *_distances, UInt32 _maxLen, const UInt32 *hash, Int32 limit, UInt32 size, UInt32 *posRes)
{ {
...@@ -276,14 +277,14 @@ Int32 NO_INLINE GetMatchesSpecN(UInt32 lenLimit, UInt32 pos, const Byte *buffer, ...@@ -276,14 +277,14 @@ Int32 NO_INLINE GetMatchesSpecN(UInt32 lenLimit, UInt32 pos, const Byte *buffer,
} }
{ {
CLzRef *pair = son + ((_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); CLzRef *pair = son + ((_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
const Byte *pb = buffer + curMatch; const Byte *pb = cur - delta;
const Byte *cur = buffer + pos;
UInt32 len = (len0 < len1 ? len0 : len1); UInt32 len = (len0 < len1 ? len0 : len1);
if (pb[len] == cur[len]) if (pb[len] == cur[len])
{ {
while(++len != lenLimit) if (++len != lenLimit && pb[len] == cur[len])
if (pb[len] != cur[len]) while(++len != lenLimit)
break; if (pb[len] != cur[len])
break;
if (maxLen < len) if (maxLen < len)
{ {
*distances++ = maxLen = len; *distances++ = maxLen = len;
...@@ -314,6 +315,7 @@ Int32 NO_INLINE GetMatchesSpecN(UInt32 lenLimit, UInt32 pos, const Byte *buffer, ...@@ -314,6 +315,7 @@ Int32 NO_INLINE GetMatchesSpecN(UInt32 lenLimit, UInt32 pos, const Byte *buffer,
} }
pos++; pos++;
_cyclicBufferPos++; _cyclicBufferPos++;
cur++;
{ {
UInt32 num = (UInt32)(distances - _distances); UInt32 num = (UInt32)(distances - _distances);
*_distances = num - 1; *_distances = num - 1;
...@@ -372,6 +374,7 @@ void BtGetMatches(CMatchFinderMt *p, UInt32 *distances) ...@@ -372,6 +374,7 @@ void BtGetMatches(CMatchFinderMt *p, UInt32 *distances)
curPos += num; curPos += num;
cyclicBufferPos++; cyclicBufferPos++;
pos++; pos++;
p->buffer++;
} }
#else #else
{ {
...@@ -380,6 +383,7 @@ void BtGetMatches(CMatchFinderMt *p, UInt32 *distances) ...@@ -380,6 +383,7 @@ void BtGetMatches(CMatchFinderMt *p, UInt32 *distances)
distances + curPos, p->numHashBytes - 1, p->hashBuf + p->hashBufPos, (Int32)(limit - curPos) , size, &posRes); distances + curPos, p->numHashBytes - 1, p->hashBuf + p->hashBufPos, (Int32)(limit - curPos) , size, &posRes);
p->hashBufPos += posRes - pos; p->hashBufPos += posRes - pos;
cyclicBufferPos += posRes - pos; cyclicBufferPos += posRes - pos;
p->buffer += posRes - pos;
pos = posRes; pos = posRes;
} }
#endif #endif
...@@ -411,7 +415,6 @@ void BtFillBlock(CMatchFinderMt *p, UInt32 globalBlockIndex) ...@@ -411,7 +415,6 @@ void BtFillBlock(CMatchFinderMt *p, UInt32 globalBlockIndex)
UInt32 subValue = p->pos - p->cyclicBufferSize; UInt32 subValue = p->pos - p->cyclicBufferSize;
MatchFinder_Normalize3(subValue, p->son, p->cyclicBufferSize * 2); MatchFinder_Normalize3(subValue, p->son, p->cyclicBufferSize * 2);
p->pos -= subValue; p->pos -= subValue;
p->buffer += subValue;
} }
if (!sync->needStart) if (!sync->needStart)
......
...@@ -71,7 +71,7 @@ typedef struct _CMatchFinderMt ...@@ -71,7 +71,7 @@ typedef struct _CMatchFinderMt
UInt32 matchMaxLen; UInt32 matchMaxLen;
UInt32 numHashBytes; UInt32 numHashBytes;
UInt32 pos; UInt32 pos;
Byte *buffer; /* Pointer to virtual Buffer begin */ Byte *buffer;
UInt32 cyclicBufferPos; UInt32 cyclicBufferPos;
UInt32 cyclicBufferSize; /* it must be historySize + 1 */ UInt32 cyclicBufferSize; /* it must be historySize + 1 */
UInt32 cutValue; UInt32 cutValue;
......
...@@ -29,16 +29,16 @@ typedef unsigned int UInt32; ...@@ -29,16 +29,16 @@ typedef unsigned int UInt32;
#endif #endif
#endif #endif
/* #define _LZMA_SYSTEM_SIZE_T */ /* #define _LZMA_NO_SYSTEM_SIZE_T */
/* Use system's size_t. You can use it to enable 64-bit sizes supporting */ /* You can use it, if you don't want <stddef.h> */
#ifndef _7ZIP_SIZET_DEFINED #ifndef _7ZIP_SIZET_DEFINED
#define _7ZIP_SIZET_DEFINED #define _7ZIP_SIZET_DEFINED
#ifdef _LZMA_SYSTEM_SIZE_T #ifdef _LZMA_NO_SYSTEM_SIZE_T
typedef UInt32 SizeT;
#else
#include <stddef.h> #include <stddef.h>
typedef size_t SizeT; typedef size_t SizeT;
#else
typedef UInt32 SizeT;
#endif #endif
#endif #endif
......
/* CpuArch.h -- compile-time CPU capability detection */
#ifndef __CPUARCH_H
#define __CPUARCH_H
/*
LITTLE_ENDIAN_UNALIGN means:
1) CPU is LITTLE_ENDIAN
2) it's allowed to make unaligned memory accesses
if LITTLE_ENDIAN_UNALIGN is not defined, it means that we don't know
about these properties of platform.
*/
/* x86 / x86-64 (both MSVC and GCC predefined macros) qualify. */
#if defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) || defined(__i386__) || defined(__x86_64__)
#define LITTLE_ENDIAN_UNALIGN
#endif
#endif /* __CPUARCH_H */
/* Aes.c -- AES lookup tables, S-boxes and helper macros */
#include "Aes.h"
#include "../CpuArch.h"

/* Forward round tables: 4 byte-rotated copies of the combined
   SubBytes+MixColumns lookup, 256 entries each; filled by AesGenTables(). */
UInt32 T[256 * 4];

/* The AES forward S-box (FIPS-197, Fig. 7). */
Byte Sbox[256] = {
  0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
  0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
  0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
  0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
  0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
  0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
  0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
  0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
  0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
  0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
  0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
  0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
  0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
  0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
  0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
  0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};

/* Inverse round tables (InvSubBytes+InvMixColumns), filled by AesGenTables(). */
UInt32 D[256 * 4];
/* Inverse S-box, derived from Sbox by AesGenTables(). */
Byte InvS[256];

/* Key-expansion round constants; Rcon[0] is never used (index starts at 1). */
Byte Rcon[11] = { 0x00, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 };

/* Multiply by x (i.e. by 2) in GF(2^8) modulo the AES polynomial 0x11B. */
#define xtime(x) ((((x) << 1) ^ (((x) & 0x80) != 0 ? 0x1B : 0)) & 0xFF)

/* Pack four bytes into a UInt32; a0 becomes the least-significant byte. */
#define Ui32(a0, a1, a2, a3) ((UInt32)(a0) | ((UInt32)(a1) << 8) | ((UInt32)(a2) << 16) | ((UInt32)(a3) << 24))

/* Extract byte 0..3 (LSB-first) of a UInt32. */
#define gb0(x) ( (x) & 0xFF)
#define gb1(x) (((x) >> ( 8)) & 0xFF)
#define gb2(x) (((x) >> (16)) & 0xFF)
#define gb3(x) (((x) >> (24)) & 0xFF)
/* Fills the global lookup tables from the forward S-box: the inverse
   S-box InvS, the forward round tables T (SubBytes+MixColumns) and the
   inverse round tables D (InvSubBytes+InvMixColumns).  Must be called
   once before any AES operation that uses these tables. */
void MY_FAST_CALL AesGenTables(void)
{
  unsigned j;

  /* InvS is the inverse permutation of Sbox. */
  for (j = 0; j < 256; j++)
    InvS[Sbox[j]] = (Byte)j;

  for (j = 0; j < 256; j++)
  {
    /* Forward tables: the four byte-rotations of MixColumns
       applied to the substituted value. */
    {
      UInt32 s1 = Sbox[j];
      UInt32 s2 = xtime(s1);   /* s * 2 in GF(2^8) */
      UInt32 s3 = s2 ^ s1;     /* s * 3 in GF(2^8) */
      T[        j] = Ui32(s2, s1, s1, s3);
      T[0x100 + j] = Ui32(s3, s2, s1, s1);
      T[0x200 + j] = Ui32(s1, s3, s2, s1);
      T[0x300 + j] = Ui32(s1, s1, s3, s2);
    }
    /* Inverse tables: InvMixColumns coefficients 9, B, D, E built
       from repeated doubling of the inverse-substituted value. */
    {
      UInt32 m1 = InvS[j];
      UInt32 m2 = xtime(m1);
      UInt32 m4 = xtime(m2);
      UInt32 m8 = xtime(m4);
      UInt32 m9 = m8 ^ m1;
      UInt32 mB = m8 ^ m2 ^ m1;
      UInt32 mD = m8 ^ m4 ^ m1;
      UInt32 mE = m8 ^ m4 ^ m2;
      D[        j] = Ui32(mE, m9, mD, mB);
      D[0x100 + j] = Ui32(mB, mE, m9, mD);
      D[0x200 + j] = Ui32(mD, mB, mE, m9);
      D[0x300 + j] = Ui32(m9, mD, mB, mE);
    }
  }
}
/* HT: one table lookup of a full encryption round -- byte x of state
   word s[(i + x) & 3] (ShiftRows) through the x-th rotated T table. */
#define HT(i, x, s) (T + (x << 8))[gb ## x(s[(i + x) & 3])]

/* HT4: output word i of an encryption round into m, XORed with
   round-key word w[p + i]. */
#define HT4(m, i, s, p) m[i] = \
HT(i, 0, s) ^ \
HT(i, 1, s) ^ \
HT(i, 2, s) ^ \
HT(i, 3, s) ^ w[p + i]

/* such order (2031) in HT16 is for VC6/K8 speed optimization) */
#define HT16(m, s, p) \
HT4(m, 2, s, p); \
HT4(m, 0, s, p); \
HT4(m, 3, s, p); \
HT4(m, 1, s, p); \

/* FT/FT4: final encryption round -- SubBytes + ShiftRows only
   (no MixColumns), XORed with the last round key w. */
#define FT(i, x) Sbox[gb ## x(m[(i + x) & 3])]
#define FT4(i) dest[i] = Ui32(FT(i, 0), FT(i, 1), FT(i, 2), FT(i, 3)) ^ w[i];

/* HD/HD4/HD16: decryption analogues of HT/HT4/HT16 -- inverse tables D
   and the opposite ShiftRows direction ((i - x) & 3). */
#define HD(i, x, s) (D + (x << 8))[gb ## x(s[(i - x) & 3])]
#define HD4(m, i, s, p) m[i] = \
HD(i, 0, s) ^ \
HD(i, 1, s) ^ \
HD(i, 2, s) ^ \
HD(i, 3, s) ^ w[p + i];

/* such order (0231) in HD16 is for VC6/K8 speed optimization) */
#define HD16(m, s, p) \
HD4(m, 0, s, p); \
HD4(m, 2, s, p); \
HD4(m, 3, s, p); \
HD4(m, 1, s, p); \

/* FD/FD4: final decryption round -- InvSubBytes + InvShiftRows. */
#define FD(i, x) InvS[gb ## x(m[(i - x) & 3])]
#define FD4(i) dest[i] = Ui32(FD(i, 0), FD(i, 1), FD(i, 2), FD(i, 3)) ^ w[i];
/* Expands `key` (keySize = 16, 24 or 32 bytes) into the encryption
   round-key schedule p->rkey, per the FIPS-197 key-expansion recurrence.
   Also sets p->numRounds2 (half the number of AES rounds, plus one).
   AesGenTables() must have been called first (uses Sbox and Rcon). */
void MY_FAST_CALL AesSetKeyEncode(CAes *p, const Byte *key, unsigned keySize)
{
  UInt32 *rk = p->rkey;
  unsigned numWords, idx;

  keySize /= 4;                       /* now a 32-bit word count: 4, 6 or 8 */
  p->numRounds2 = keySize / 2 + 3;
  numWords = (p->numRounds2 * 2 + 1) * 4;

  /* The first keySize schedule words are the raw key, packed LSB-first. */
  for (idx = 0; idx < keySize; idx++, key += 4)
    rk[idx] = Ui32(key[0], key[1], key[2], key[3]);

  /* Remaining words: w[i] = w[i - Nk] ^ f(w[i - 1]). */
  for (idx = keySize; idx < numWords; idx++)
  {
    UInt32 prev = rk[idx - 1];
    unsigned rem = idx % keySize;
    if (rem == 0)
      /* RotWord + SubWord + Rcon at every Nk-th word. */
      prev = Ui32(Sbox[gb1(prev)] ^ Rcon[idx / keySize], Sbox[gb2(prev)], Sbox[gb3(prev)], Sbox[gb0(prev)]);
    else if (keySize > 6 && rem == 4)
      /* Extra SubWord step only for 256-bit keys (Nk == 8). */
      prev = Ui32(Sbox[gb0(prev)], Sbox[gb1(prev)], Sbox[gb2(prev)], Sbox[gb3(prev)]);
    rk[idx] = rk[idx - keySize] ^ prev;
  }
}
void MY_FAST_CALL AesSetKeyDecode(CAes *p, const Byte *key, unsigned keySize)
{
unsigned i, num;
UInt32 *w;
AesSetKeyEncode(p, key, keySize);
num = p->numRounds2 * 8 - 4;
w = p->rkey + 4;
for (i = 0; i < num; i++)
{
UInt32 r = w[i];
w[i] =
D[ Sbox[gb0(r)]] ^
D[0x100 + Sbox[gb1(r)]] ^
D[0x200 + Sbox[gb2(r)]] ^
D[0x300 + Sbox[gb3(r)]];
}