diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1ce4443 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.* +*.o +*.exe +*.sln +*.vcxproj* +x64/ diff --git a/Makefile b/Makefile index 7f59570..0ffe5bd 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # only tested in mingw PNAME = bfcl -OBJS = $(PNAME).o ocl_util.o utils.o sha1_16.o aes128.o -CFLAGS += -std=c11 -Wall -O2 -I$(INTELOCLSDKROOT)/include +OBJS = $(PNAME).o ocl_util.o utils.o sha1_16.o aes_128.o +CFLAGS += -std=c11 -Wall -O2 -mrdrnd -I$(INTELOCLSDKROOT)/include LDFLAGS += -L$(INTELOCLSDKROOT)/lib/x64 all : $(PNAME) diff --git a/aes128.c b/aes_128.c similarity index 92% rename from aes128.c rename to aes_128.c index b08629f..8753714 100644 --- a/aes128.c +++ b/aes_128.c @@ -19,21 +19,19 @@ #define PUT_UINT32_LE(n, b, i) \ *(uint32_t*)(b + i) = (n) -// packed into a struct to be easier to pass to OpenCL // it's interesting they mix unsigned char with uint32_t -AES_Tables AES_tables; -// and aliases -static unsigned char *FSb = AES_tables.FSb; -static uint32_t *FT0 = AES_tables.FT0; -static uint32_t *FT1 = AES_tables.FT1; -static uint32_t *FT2 = AES_tables.FT2; -static uint32_t *FT3 = AES_tables.FT3; -static unsigned char *RSb = AES_tables.RSb; -static uint32_t *RT0 = AES_tables.RT0; -static uint32_t *RT1 = AES_tables.RT1; -static uint32_t *RT2 = AES_tables.RT2; -static uint32_t *RT3 = AES_tables.RT3; -static uint32_t *RCON = AES_tables.RCON; +static unsigned char FSb[256]; +static uint32_t FT0[256]; +static uint32_t FT1[256]; +static uint32_t FT2[256]; +static uint32_t FT3[256]; +static unsigned char RSb[256]; +static uint32_t RT0[256]; +static uint32_t RT1[256]; +static uint32_t RT2[256]; +static uint32_t RT3[256]; + +static uint32_t RCON[256]; /* * Tables generation code diff --git a/bfcl.c b/bfcl.c index f59e2e9..2fb5d83 100644 --- a/bfcl.c +++ b/bfcl.c @@ -2,39 +2,72 @@ #include #include #include +#include #include "ocl.h" #include "crypto.h" #include "utils.h" -extern AES_Tables AES_tables; +#ifdef __GNUC__ +#include +#elif _MSC_VER +#include +#endif + +int cpu_has_rdrand() { +#if __GNUC__ + unsigned a = 0, b = 0, c = 0, d = 0; + __get_cpuid(1, &a, &b, &c, &d); + return c & bit_RDRND; +#elif _MSC_VER + int regs[4]; + __cpuid(regs, 1); + return regs[2] & (1<<30); +#else + // ICL only? + return _may_i_use_cpu_feature(_FEATURE_RDRND); +#endif +} // CAUTION: caller is responsible to free the buf -char * read_source(const char *file_name) { +char * read_file(const char *file_name, size_t *p_size) { FILE * f = fopen(file_name, "rb"); if (f == NULL) { printf("can't read file: %s", file_name); exit(-1); } fseek(f, 0, SEEK_END); - long size = ftell(f); - char * buf = malloc(size + 1); + *p_size = ftell(f); + char * buf = malloc(*p_size); fseek(f, 0, SEEK_SET); - fread(buf, size, 1, f); + fread(buf, *p_size, 1, f); fclose(f); - buf[size] = '\0'; return buf; } -#define TEST_SHA1_16 1 -#define TEST_AES_128_ECB 2 +void read_files(unsigned num_files, const char *file_names[], char *ptrs[], size_t sizes[]) { + for (unsigned i = 0; i < num_files; ++i) { + ptrs[i] = read_file(file_names[i], &sizes[i]); + } +} -void ocl_test(int test_case) { - cl_platform_id platform_id; - cl_device_id device_id; - ocl_get_device(&platform_id, &device_id); - if (platform_id == NULL || device_id == NULL) { +void dump_to_file(const char *file_name, const void *buf, size_t len) { + FILE *f = fopen(file_name, "wb"); + if (f == NULL) { + printf("can't open file to write: %s\n", file_name); return; } + fwrite(buf, len, 1, f); + fclose(f); +} + +#define TEST_SHA1_16 1 +#define TEST_AES_128_ECB 2 + +#define BLOCK_SIZE 0x10 +#define NUM_BLOCKS (1 << 23) +#define BLOCKS_PER_ITEM 1 + +void ocl_test(cl_device_id device_id, const cl_uchar *buf_in, int test_case) { cl_int err; cl_context context = OCL_ASSERT2(clCreateContext(0, 1, &device_id, NULL, NULL, &err)); cl_command_queue command_queue = OCL_ASSERT2(clCreateCommandQueue(context, device_id, 0, &err)); @@ -42,23 +75,27 @@ void ocl_test(int test_case) { HP_Time t0, t1; long long td; -#define BLOCK_SIZE 0x10 -#define NUM_BLOCKS (1 << 24) -#define BLOCKS_PER_ITEM 1 - const size_t num_items = NUM_BLOCKS / BLOCKS_PER_ITEM; const size_t io_buf_len = NUM_BLOCKS * BLOCK_SIZE; - char *source = read_source("kernel.c"); + const char *source_names[] = { "cl/sha1_16.cl", "cl/aes_tables.cl", "cl/aes_128.cl", "cl/kernels.cl" }; + const unsigned num_sources = sizeof(source_names) / sizeof(char *); + char *sources[sizeof(source_names) / sizeof(char *)]; + size_t source_sizes[sizeof(source_names) / sizeof(char *)]; + read_files(num_sources, source_names, sources, source_sizes); + get_hp_time(&t0); - cl_program program = OCL_ASSERT2(clCreateProgramWithSource(context, 1, (const char**)&source, NULL, &err)); + // WTF? GCC complains if I pass char ** in to a function expecting const char **? + cl_program program = OCL_ASSERT2(clCreateProgramWithSource(context, num_sources, (const char **)sources, source_sizes, &err)); char options[0x100]; - sprintf(options, "-DBLOCKS_PER_ITEM=%d", BLOCKS_PER_ITEM); - printf("compiler options: %s\n", options); + sprintf(options, "-w -Werror -DBLOCKS_PER_ITEM=%d", BLOCKS_PER_ITEM); + // printf("compiler options: %s\n", options); err = clBuildProgram(program, 0, NULL, options, NULL, NULL); get_hp_time(&t1); printf("%d microseconds for compile\n", (int)hp_time_diff(&t0, &t1)); - free(source); + for (unsigned i = 0; i < num_sources; ++i) { + free(sources[i]); + } if (err != CL_SUCCESS) { fprintf(stderr, "failed to build program, error: %s, build log:\n", ocl_err_msg(err)); size_t len; @@ -78,50 +115,40 @@ void ocl_test(int test_case) { default: exit(-1); } - printf("%s on %u bytes\n", test_name, (unsigned)io_buf_len); + printf("%s on %u MB\n", test_name, (unsigned)io_buf_len >> 20); cl_kernel kernel = OCL_ASSERT2(clCreateKernel(program, test_name, &err)); - cl_uchar *buf_in = malloc(io_buf_len); cl_uchar key[16]; unsigned int aes_rk[RK_LEN]; - get_hp_time(&t0); - srand(2501); - for (unsigned i = 0; i < io_buf_len; ++i) { - buf_in[i] = ((unsigned)rand() << 8) / RAND_MAX; - } if (test_case == TEST_AES_128_ECB) { aes_gen_tables(); for (unsigned i = 0; i < 16; ++i) { - key[i] = ((unsigned)rand() << 8) / RAND_MAX; + key[i] = rand() & 0xff; } - aes_set_key_enc_128(aes_rk, key); + printf("Key: %s\n", hexdump(key, 16, 0)); } - get_hp_time(&t1); - printf("%d microseconds for preparing test data\n", (int)hp_time_diff(&t0, &t1)); cl_mem mem_in = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_ONLY, io_buf_len, NULL, &err)); cl_mem mem_out = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, io_buf_len, NULL, &err)); - cl_mem mem_tables, mem_key; + cl_mem mem_key; if (test_case == TEST_AES_128_ECB) { - mem_tables = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(AES_Tables), NULL, &err)); mem_key = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_ONLY, 16, NULL, &err)); } get_hp_time(&t0); OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_in, CL_TRUE, 0, io_buf_len, buf_in, 0, NULL, NULL)); if (test_case == TEST_AES_128_ECB) { - OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_tables, CL_TRUE, 0, sizeof(AES_Tables), &AES_tables, 0, NULL, NULL)); OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_key, CL_TRUE, 0, 16, key, 0, NULL, NULL)); } get_hp_time(&t1); - printf("%d microseconds for data upload\n", (int)hp_time_diff(&t0, &t1)); + td = hp_time_diff(&t0, &t1); + printf("%d microseconds for data upload, %.2f MB/s\n", (int)td, io_buf_len * 1.0f / td); OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_in)); OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_out)); if (test_case == TEST_AES_128_ECB) { - OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_tables)); - OCL_ASSERT(clSetKernelArg(kernel, 3, sizeof(cl_mem), &mem_key)); + OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_key)); } size_t local; @@ -129,7 +156,7 @@ void ocl_test(int test_case) { printf("local work size: %u\n", (unsigned)local); get_hp_time(&t0); - // appearantly, settiing local work size to NULL doesn't affect performance, at least in this kind of work + // apparently, setting local work size to NULL doesn't affect performance, at least in this kind of work OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, &local, 0, NULL, NULL)); // OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, NULL, 0, NULL, NULL)); clFinish(command_queue); @@ -141,28 +168,42 @@ void ocl_test(int test_case) { get_hp_time(&t0); OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_out, CL_TRUE, 0, io_buf_len, buf_out, 0, NULL, NULL)); get_hp_time(&t1); - printf("%d microseconds for data download\n", (int)hp_time_diff(&t0, &t1)); + td = hp_time_diff(&t0, &t1); + printf("%d microseconds for data download, %.2f MB/s\n", (int)td, io_buf_len * 1.0f / td); + /* + if(test_case == TEST_AES_128_ECB){ + dump_to_file("r:/test_aes_in.bin", buf_in, io_buf_len); + dump_to_file("r:/test_aes_out.bin", buf_out, io_buf_len); + } + */ + + cl_uchar *buf_verify = malloc(io_buf_len); get_hp_time(&t0); if (test_case == TEST_SHA1_16) { for (unsigned offset = 0; offset < io_buf_len; offset += BLOCK_SIZE) { - sha1_16(buf_in + offset, buf_in + offset); + sha1_16(buf_in + offset, buf_verify + offset); } } else { for (unsigned offset = 0; offset < io_buf_len; offset += BLOCK_SIZE) { - aes_encrypt_128(aes_rk, buf_in + offset, buf_in + offset); + // setting the same key over and over is stupid + // yet we still do it to keep it in line with the OpenCL port + // otherwise we can't test the set key in OpenCL + aes_set_key_enc_128(aes_rk, key); + aes_encrypt_128(aes_rk, buf_in + offset, buf_verify + offset); } } get_hp_time(&t1); td = hp_time_diff(&t0, &t1); - printf("%d microseconds for C(single thread), %.3f MB/s\n", (int)td, io_buf_len * 1.0f / td); + printf("%d microseconds for C(single thread), %.2f MB/s\n", (int)td, io_buf_len * 1.0f / td); - if (memcmp(buf_in, buf_out, io_buf_len)) { + if (memcmp(buf_verify, buf_out, io_buf_len)) { printf("%s: verification failed\n", test_name); for (unsigned offset = 0; offset < io_buf_len; offset += BLOCK_SIZE) { - if (memcmp(buf_in + offset, buf_out + offset, BLOCK_SIZE)) { + if (memcmp(buf_verify + offset, buf_out + offset, BLOCK_SIZE)) { printf("first difference @ 0x%08x/0x%08x:\n", offset, (unsigned)num_items ); printf("\t%s\n", hexdump(buf_in + offset, BLOCK_SIZE, 0)); + printf("\t%s\n", hexdump(buf_verify + offset, BLOCK_SIZE, 0)); printf("\t%s\n", hexdump(buf_out + offset, BLOCK_SIZE, 0)); break; } @@ -171,7 +212,7 @@ void ocl_test(int test_case) { printf("%s: succeed\n", test_name); } - free(buf_in); free(buf_out); + free(buf_out); clReleaseMemObject(mem_in); clReleaseMemObject(mem_out); clReleaseProgram(program); @@ -180,32 +221,65 @@ void ocl_test(int test_case) { clReleaseContext(context); } -void aes128_test() { - // TODO: test against OpenSSL results - unsigned char test_key[16] = { 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 }; - unsigned char test_src[32] = { 8, 7, 6, 5, 4, 3, 2, 1, 1, 2, 3, 4, 5, 6, 7, 8, - 1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1 }; - unsigned char test_out[32]; - - aes_gen_tables(); - - unsigned aes_rk[RK_LEN]; - - aes_set_key_enc_128(aes_rk, test_key); - aes_encrypt_128(aes_rk, test_src, test_out); - aes_encrypt_128(aes_rk, test_src + 16, test_out + 16); - puts(hexdump(test_out, 32, 1)); -} - int main(int argc, const char *argv[]) { if (argc == 2 && !strcmp(argv[1], "info")) { cl_uint num_platforms; ocl_info(&num_platforms, 1); - }else if (argc == 2 && !strcmp(argv[1], "aes128ecb")) { - aes128_test(); } else if (argc == 1){ - ocl_test(TEST_SHA1_16); - ocl_test(TEST_AES_128_ECB); + cl_platform_id platform_id; + cl_device_id device_id; + ocl_get_device(&platform_id, &device_id); + if (platform_id == NULL || device_id == NULL) { + return -1; + } + + cl_uchar *buf_in = malloc(BLOCK_SIZE * NUM_BLOCKS); + srand(2501); + HP_Time t0, t1; long long td; + get_hp_time(&t0); + if(cpu_has_rdrand()){ + // ~190 MB/s @ X230, ~200 without the success check + printf("randomize source buffer using RDRAND\n"); + unsigned long long *p = (unsigned long long *)buf_in; + unsigned long long *p_end = (unsigned long long *)(buf_in + BLOCK_SIZE * NUM_BLOCKS); + int success = 1; + while (p < p_end) { + success &= _rdrand64_step(p++); + } + if (!success) { + printf("RDRND failed\n"); + exit(-1); + } + }else { + printf("randomize source buffer using AES OFB\n"); + // rand() & 0xff is about ~60 MB/s @ X230 + // it's worse than that AES single thread C, so OFB it is + // ~240 MB/s, even faster than RDRAND + srand(2501); + unsigned int aes_rk[RK_LEN]; + unsigned char key_iv[16 * 2]; + for (unsigned i = 0; i < 16 * 2; ++i) { + key_iv[i] = rand() & 0xff; + } + aes_set_key_enc_128(aes_rk, key_iv); + aes_encrypt_128(aes_rk, key_iv + 16, buf_in); + unsigned char *p_in = buf_in, *p_out = buf_in + 16, + *p_end = buf_in + BLOCK_SIZE * NUM_BLOCKS; + while (p_out < p_end) { + aes_encrypt_128(aes_rk, p_in, p_out); + p_in = p_out; + p_out += 16; + } + } + get_hp_time(&t1); + td = hp_time_diff(&t0, &t1); + printf("%d microseconds for preparing test data, %.2f MB/s\n", + (int)td, BLOCK_SIZE * NUM_BLOCKS * 1.0f / td); + + ocl_test(device_id, buf_in, TEST_SHA1_16); + ocl_test(device_id, buf_in, TEST_AES_128_ECB); + + free(buf_in); #ifdef _WIN32 system("pause"); #endif diff --git a/cl/aes_128.cl b/cl/aes_128.cl new file mode 100644 index 0000000..1da8a1e --- /dev/null +++ b/cl/aes_128.cl @@ -0,0 +1,106 @@ + +// OpenCL has these fancy address space qualifiers that can't be cast without +#define GET_UINT32_LE(n, b, i) \ + (n) = *(uint32_t*)(b + i) +#define GET_UINT32_LE_G(n, b, i) \ + (n) = *(__global uint32_t*)(b + i) +#define GET_UINT32_LE_C(n, b, i) \ + (n) = *(__constant uint32_t*)(b + i) +#define PUT_UINT32_LE(n, b, i) \ + *(uint32_t*)(b + i) = (n) +#define PUT_UINT32_LE_G(n, b, i) \ + *(__global uint32_t*)(b + i) = (n) + +#define RK_LEN 44 + +void aes_set_key_enc_128(uint32_t rk[RK_LEN]) +{ + uint32_t *RK = rk; + + for (unsigned i = 0; i < 10; ++i, RK += 4) { + RK[4] = RK[0] ^ RCON[i] ^ + ( (uint32_t) FSb[ ( RK[3] >> 8 ) & 0xFF ] ) ^ + ( (uint32_t) FSb[ ( RK[3] >> 16 ) & 0xFF ] << 8 ) ^ + ( (uint32_t) FSb[ ( RK[3] >> 24 ) & 0xFF ] << 16 ) ^ + ( (uint32_t) FSb[ ( RK[3] ) & 0xFF ] << 24 ); + + RK[5] = RK[1] ^ RK[4]; + RK[6] = RK[2] ^ RK[5]; + RK[7] = RK[3] ^ RK[6]; + } +} + +#define AES_FROUND(X0,X1,X2,X3,Y0,Y1,Y2,Y3) \ +{ \ + X0 = *RK++ ^ FT0[ ( Y0 ) & 0xFF ] ^ \ + FT1[ ( Y1 >> 8 ) & 0xFF ] ^ \ + FT2[ ( Y2 >> 16 ) & 0xFF ] ^ \ + FT3[ ( Y3 >> 24 ) & 0xFF ]; \ + \ + X1 = *RK++ ^ FT0[ ( Y1 ) & 0xFF ] ^ \ + FT1[ ( Y2 >> 8 ) & 0xFF ] ^ \ + FT2[ ( Y3 >> 16 ) & 0xFF ] ^ \ + FT3[ ( Y0 >> 24 ) & 0xFF ]; \ + \ + X2 = *RK++ ^ FT0[ ( Y2 ) & 0xFF ] ^ \ + FT1[ ( Y3 >> 8 ) & 0xFF ] ^ \ + FT2[ ( Y0 >> 16 ) & 0xFF ] ^ \ + FT3[ ( Y1 >> 24 ) & 0xFF ]; \ + \ + X3 = *RK++ ^ FT0[ ( Y3 ) & 0xFF ] ^ \ + FT1[ ( Y0 >> 8 ) & 0xFF ] ^ \ + FT2[ ( Y1 >> 16 ) & 0xFF ] ^ \ + FT3[ ( Y2 >> 24 ) & 0xFF ]; \ +} + +void aes_encrypt_128(const uint32_t rk[RK_LEN], + const uint32_t *in, uint32_t *out) +{ + const uint32_t *RK = rk; + uint32_t X0 = in[0], X1 = in[1], X2 = in[2], X3 = in[3], + Y0, Y1, Y2, Y3; + + X0 ^= *RK++; + X1 ^= *RK++; + X2 ^= *RK++; + X3 ^= *RK++; + + AES_FROUND( Y0, Y1, Y2, Y3, X0, X1, X2, X3 ); + AES_FROUND( X0, X1, X2, X3, Y0, Y1, Y2, Y3 ); + AES_FROUND( Y0, Y1, Y2, Y3, X0, X1, X2, X3 ); + AES_FROUND( X0, X1, X2, X3, Y0, Y1, Y2, Y3 ); + AES_FROUND( Y0, Y1, Y2, Y3, X0, X1, X2, X3 ); + AES_FROUND( X0, X1, X2, X3, Y0, Y1, Y2, Y3 ); + AES_FROUND( Y0, Y1, Y2, Y3, X0, X1, X2, X3 ); + AES_FROUND( X0, X1, X2, X3, Y0, Y1, Y2, Y3 ); + AES_FROUND( Y0, Y1, Y2, Y3, X0, X1, X2, X3 ); + + X0 = *RK++ ^ \ + ( (uint32_t) FSb[ ( Y0 ) & 0xFF ] ) ^ + ( (uint32_t) FSb[ ( Y1 >> 8 ) & 0xFF ] << 8 ) ^ + ( (uint32_t) FSb[ ( Y2 >> 16 ) & 0xFF ] << 16 ) ^ + ( (uint32_t) FSb[ ( Y3 >> 24 ) & 0xFF ] << 24 ); + + X1 = *RK++ ^ \ + ( (uint32_t) FSb[ ( Y1 ) & 0xFF ] ) ^ + ( (uint32_t) FSb[ ( Y2 >> 8 ) & 0xFF ] << 8 ) ^ + ( (uint32_t) FSb[ ( Y3 >> 16 ) & 0xFF ] << 16 ) ^ + ( (uint32_t) FSb[ ( Y0 >> 24 ) & 0xFF ] << 24 ); + + X2 = *RK++ ^ \ + ( (uint32_t) FSb[ ( Y2 ) & 0xFF ] ) ^ + ( (uint32_t) FSb[ ( Y3 >> 8 ) & 0xFF ] << 8 ) ^ + ( (uint32_t) FSb[ ( Y0 >> 16 ) & 0xFF ] << 16 ) ^ + ( (uint32_t) FSb[ ( Y1 >> 24 ) & 0xFF ] << 24 ); + + X3 = *RK ^ \ + ( (uint32_t) FSb[ ( Y3 ) & 0xFF ] ) ^ + ( (uint32_t) FSb[ ( Y0 >> 8 ) & 0xFF ] << 8 ) ^ + ( (uint32_t) FSb[ ( Y1 >> 16 ) & 0xFF ] << 16 ) ^ + ( (uint32_t) FSb[ ( Y2 >> 24 ) & 0xFF ] << 24 ); + + out[0] = X0; + out[1] = X1; + out[2] = X2; + out[3] = X3; +} diff --git a/cl/aes_tables.cl b/cl/aes_tables.cl new file mode 100644 index 0000000..c2e65ae --- /dev/null +++ b/cl/aes_tables.cl @@ -0,0 +1,137 @@ + +/* + * Forward S-box + */ +__constant static const unsigned char FSb[256] = +{ + 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, + 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, + 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, + 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, + 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, + 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, + 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, + 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, + 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, + 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, + 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, + 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, + 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, + 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, + 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, + 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, + 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, + 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, + 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, + 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, + 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, + 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, + 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, + 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, + 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, + 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, + 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16 +}; + +/* + * Forward tables + */ +#define FT \ +\ + V(A5,63,63,C6), V(84,7C,7C,F8), V(99,77,77,EE), V(8D,7B,7B,F6), \ + V(0D,F2,F2,FF), V(BD,6B,6B,D6), V(B1,6F,6F,DE), V(54,C5,C5,91), \ + V(50,30,30,60), V(03,01,01,02), V(A9,67,67,CE), V(7D,2B,2B,56), \ + V(19,FE,FE,E7), V(62,D7,D7,B5), V(E6,AB,AB,4D), V(9A,76,76,EC), \ + V(45,CA,CA,8F), V(9D,82,82,1F), V(40,C9,C9,89), V(87,7D,7D,FA), \ + V(15,FA,FA,EF), V(EB,59,59,B2), V(C9,47,47,8E), V(0B,F0,F0,FB), \ + V(EC,AD,AD,41), V(67,D4,D4,B3), V(FD,A2,A2,5F), V(EA,AF,AF,45), \ + V(BF,9C,9C,23), V(F7,A4,A4,53), V(96,72,72,E4), V(5B,C0,C0,9B), \ + V(C2,B7,B7,75), V(1C,FD,FD,E1), V(AE,93,93,3D), V(6A,26,26,4C), \ + V(5A,36,36,6C), V(41,3F,3F,7E), V(02,F7,F7,F5), V(4F,CC,CC,83), \ + V(5C,34,34,68), V(F4,A5,A5,51), V(34,E5,E5,D1), V(08,F1,F1,F9), \ + V(93,71,71,E2), V(73,D8,D8,AB), V(53,31,31,62), V(3F,15,15,2A), \ + V(0C,04,04,08), V(52,C7,C7,95), V(65,23,23,46), V(5E,C3,C3,9D), \ + V(28,18,18,30), V(A1,96,96,37), V(0F,05,05,0A), V(B5,9A,9A,2F), \ + V(09,07,07,0E), V(36,12,12,24), V(9B,80,80,1B), V(3D,E2,E2,DF), \ + V(26,EB,EB,CD), V(69,27,27,4E), V(CD,B2,B2,7F), V(9F,75,75,EA), \ + V(1B,09,09,12), V(9E,83,83,1D), V(74,2C,2C,58), V(2E,1A,1A,34), \ + V(2D,1B,1B,36), V(B2,6E,6E,DC), V(EE,5A,5A,B4), V(FB,A0,A0,5B), \ + V(F6,52,52,A4), V(4D,3B,3B,76), V(61,D6,D6,B7), V(CE,B3,B3,7D), \ + V(7B,29,29,52), V(3E,E3,E3,DD), V(71,2F,2F,5E), V(97,84,84,13), \ + V(F5,53,53,A6), V(68,D1,D1,B9), V(00,00,00,00), V(2C,ED,ED,C1), \ + V(60,20,20,40), V(1F,FC,FC,E3), V(C8,B1,B1,79), V(ED,5B,5B,B6), \ + V(BE,6A,6A,D4), V(46,CB,CB,8D), V(D9,BE,BE,67), V(4B,39,39,72), \ + V(DE,4A,4A,94), V(D4,4C,4C,98), V(E8,58,58,B0), V(4A,CF,CF,85), \ + V(6B,D0,D0,BB), V(2A,EF,EF,C5), V(E5,AA,AA,4F), V(16,FB,FB,ED), \ + V(C5,43,43,86), V(D7,4D,4D,9A), V(55,33,33,66), V(94,85,85,11), \ + V(CF,45,45,8A), V(10,F9,F9,E9), V(06,02,02,04), V(81,7F,7F,FE), \ + V(F0,50,50,A0), V(44,3C,3C,78), V(BA,9F,9F,25), V(E3,A8,A8,4B), \ + V(F3,51,51,A2), V(FE,A3,A3,5D), V(C0,40,40,80), V(8A,8F,8F,05), \ + V(AD,92,92,3F), V(BC,9D,9D,21), V(48,38,38,70), V(04,F5,F5,F1), \ + V(DF,BC,BC,63), V(C1,B6,B6,77), V(75,DA,DA,AF), V(63,21,21,42), \ + V(30,10,10,20), V(1A,FF,FF,E5), V(0E,F3,F3,FD), V(6D,D2,D2,BF), \ + V(4C,CD,CD,81), V(14,0C,0C,18), V(35,13,13,26), V(2F,EC,EC,C3), \ + V(E1,5F,5F,BE), V(A2,97,97,35), V(CC,44,44,88), V(39,17,17,2E), \ + V(57,C4,C4,93), V(F2,A7,A7,55), V(82,7E,7E,FC), V(47,3D,3D,7A), \ + V(AC,64,64,C8), V(E7,5D,5D,BA), V(2B,19,19,32), V(95,73,73,E6), \ + V(A0,60,60,C0), V(98,81,81,19), V(D1,4F,4F,9E), V(7F,DC,DC,A3), \ + V(66,22,22,44), V(7E,2A,2A,54), V(AB,90,90,3B), V(83,88,88,0B), \ + V(CA,46,46,8C), V(29,EE,EE,C7), V(D3,B8,B8,6B), V(3C,14,14,28), \ + V(79,DE,DE,A7), V(E2,5E,5E,BC), V(1D,0B,0B,16), V(76,DB,DB,AD), \ + V(3B,E0,E0,DB), V(56,32,32,64), V(4E,3A,3A,74), V(1E,0A,0A,14), \ + V(DB,49,49,92), V(0A,06,06,0C), V(6C,24,24,48), V(E4,5C,5C,B8), \ + V(5D,C2,C2,9F), V(6E,D3,D3,BD), V(EF,AC,AC,43), V(A6,62,62,C4), \ + V(A8,91,91,39), V(A4,95,95,31), V(37,E4,E4,D3), V(8B,79,79,F2), \ + V(32,E7,E7,D5), V(43,C8,C8,8B), V(59,37,37,6E), V(B7,6D,6D,DA), \ + V(8C,8D,8D,01), V(64,D5,D5,B1), V(D2,4E,4E,9C), V(E0,A9,A9,49), \ + V(B4,6C,6C,D8), V(FA,56,56,AC), V(07,F4,F4,F3), V(25,EA,EA,CF), \ + V(AF,65,65,CA), V(8E,7A,7A,F4), V(E9,AE,AE,47), V(18,08,08,10), \ + V(D5,BA,BA,6F), V(88,78,78,F0), V(6F,25,25,4A), V(72,2E,2E,5C), \ + V(24,1C,1C,38), V(F1,A6,A6,57), V(C7,B4,B4,73), V(51,C6,C6,97), \ + V(23,E8,E8,CB), V(7C,DD,DD,A1), V(9C,74,74,E8), V(21,1F,1F,3E), \ + V(DD,4B,4B,96), V(DC,BD,BD,61), V(86,8B,8B,0D), V(85,8A,8A,0F), \ + V(90,70,70,E0), V(42,3E,3E,7C), V(C4,B5,B5,71), V(AA,66,66,CC), \ + V(D8,48,48,90), V(05,03,03,06), V(01,F6,F6,F7), V(12,0E,0E,1C), \ + V(A3,61,61,C2), V(5F,35,35,6A), V(F9,57,57,AE), V(D0,B9,B9,69), \ + V(91,86,86,17), V(58,C1,C1,99), V(27,1D,1D,3A), V(B9,9E,9E,27), \ + V(38,E1,E1,D9), V(13,F8,F8,EB), V(B3,98,98,2B), V(33,11,11,22), \ + V(BB,69,69,D2), V(70,D9,D9,A9), V(89,8E,8E,07), V(A7,94,94,33), \ + V(B6,9B,9B,2D), V(22,1E,1E,3C), V(92,87,87,15), V(20,E9,E9,C9), \ + V(49,CE,CE,87), V(FF,55,55,AA), V(78,28,28,50), V(7A,DF,DF,A5), \ + V(8F,8C,8C,03), V(F8,A1,A1,59), V(80,89,89,09), V(17,0D,0D,1A), \ + V(DA,BF,BF,65), V(31,E6,E6,D7), V(C6,42,42,84), V(B8,68,68,D0), \ + V(C3,41,41,82), V(B0,99,99,29), V(77,2D,2D,5A), V(11,0F,0F,1E), \ + V(CB,B0,B0,7B), V(FC,54,54,A8), V(D6,BB,BB,6D), V(3A,16,16,2C) + +#define V(a,b,c,d) 0x##a##b##c##d +__constant static const uint32_t FT0[256] = { FT }; +#undef V + +#define V(a,b,c,d) 0x##b##c##d##a +__constant static const uint32_t FT1[256] = { FT }; +#undef V + +#define V(a,b,c,d) 0x##c##d##a##b +__constant static const uint32_t FT2[256] = { FT }; +#undef V + +#define V(a,b,c,d) 0x##d##a##b##c +__constant static const uint32_t FT3[256] = { FT }; +#undef V + +#undef FT + +/* + * Round constants + */ +__constant static const uint32_t RCON[10] = +{ + 0x00000001, 0x00000002, 0x00000004, 0x00000008, + 0x00000010, 0x00000020, 0x00000040, 0x00000080, + 0x0000001B, 0x00000036 +}; diff --git a/cl/kernels.cl b/cl/kernels.cl new file mode 100644 index 0000000..dbe8f9d --- /dev/null +++ b/cl/kernels.cl @@ -0,0 +1,49 @@ + +__kernel void sha1_16_test( + __global const unsigned char *in, + __global unsigned char *out) +{ + unsigned offset = get_global_id(0) * BLOCKS_PER_ITEM * 16; +#if BLOCKS_PER_ITEM != 1 + for(unsigned i = 0; i < BLOCKS_PER_ITEM; ++i){ +#endif + uint32_t local_buf[4]; + GET_UINT32_BE(local_buf[0], in, offset); + GET_UINT32_BE(local_buf[1], in, offset + 4); + GET_UINT32_BE(local_buf[2], in, offset + 8); + GET_UINT32_BE(local_buf[3], in, offset + 12); + sha1_16(local_buf, local_buf); + PUT_UINT32_BE(local_buf[0], out, offset); + PUT_UINT32_BE(local_buf[1], out, offset + 4); + PUT_UINT32_BE(local_buf[2], out, offset + 8); + PUT_UINT32_BE(local_buf[3], out, offset + 12); +#if BLOCKS_PER_ITEM != 1 + offset += 16; + } +#endif +} + +#define AES_BLOCK_SIZE 16 + +__kernel void aes_128_ecb_test( + __global const uint32_t *in, + __global uint32_t *out, + __constant const uint32_t *key) +{ + uint32_t rk[RK_LEN]; + rk[0] = key[0]; rk[1] = key[1]; rk[2] = key[2]; rk[3] = key[3]; + aes_set_key_enc_128(rk); + unsigned offset = get_global_id(0) * BLOCKS_PER_ITEM * AES_BLOCK_SIZE / 4; + in += offset; out += offset; +#if BLOCKS_PER_ITEM != 1 + for (unsigned i = 0; i < BLOCKS_PER_ITEM; ++i) { +#endif + uint32_t buf[4]; + buf[0] = in[0]; buf[1] = in[1]; buf[2] = in[2]; buf[3] = in[3]; + aes_encrypt_128(rk, buf, buf); + out[0] = buf[0]; out[1] = buf[1]; out[2] = buf[2]; out[3] = buf[3]; +#if BLOCKS_PER_ITEM != 1 + offset += AES_BLOCK_SIZE; + } +#endif +} diff --git a/cl/sha1_16.cl b/cl/sha1_16.cl new file mode 100644 index 0000000..9fabc70 --- /dev/null +++ b/cl/sha1_16.cl @@ -0,0 +1,177 @@ + +typedef unsigned int uint32_t; + +#ifndef GET_UINT32_BE +#define GET_UINT32_BE(n,b,i) \ +{ \ + (n) = ( (uint32_t) (b)[(i) ] << 24 ) \ + | ( (uint32_t) (b)[(i) + 1] << 16 ) \ + | ( (uint32_t) (b)[(i) + 2] << 8 ) \ + | ( (uint32_t) (b)[(i) + 3] ); \ +} +#endif + +#ifndef PUT_UINT32_BE +#define PUT_UINT32_BE(n,b,i) \ +{ \ + (b)[(i) ] = (unsigned char) ( (n) >> 24 ); \ + (b)[(i) + 1] = (unsigned char) ( (n) >> 16 ); \ + (b)[(i) + 2] = (unsigned char) ( (n) >> 8 ); \ + (b)[(i) + 3] = (unsigned char) ( (n) ); \ +} +#endif + +__constant const uint32_t + h0 = 0x67452301, + h1 = 0xEFCDAB89, + h2 = 0x98BADCFE, + h3 = 0x10325476, + h4 = 0xC3D2E1F0; + +void sha1_16(const uint32_t *in, uint32_t *out) +{ + uint32_t temp, W[16], + A = h0, B = h1, C = h2, D = h3, E = h4; + + W[0] = in[0]; W[1] = in[1]; W[2] = in[2]; W[3] = in[3]; + W[4] = 0x80000000u; W[5] = 0; W[6] = 0; W[7] = 0; + W[8] = 0; W[9] = 0; W[10] = 0; W[11] = 0; + W[12] = 0; W[13] = 0; W[14] = 0; W[15] = 0x80u; + +#define S(x,n) ((x << n) | ((x & 0xFFFFFFFF) >> (32 - n))) + +#define R(t) \ +( \ + temp = W[( t - 3 ) & 0x0F] ^ W[( t - 8 ) & 0x0F] ^ \ + W[( t - 14 ) & 0x0F] ^ W[ t & 0x0F], \ + ( W[t & 0x0F] = S(temp,1) ) \ +) + +#define P(a,b,c,d,e,x) \ +{ \ + e += S(a,5) + F(b,c,d) + K + x; b = S(b,30); \ +} + +#define F(x,y,z) (z ^ (x & (y ^ z))) +#define K 0x5A827999 + + P( A, B, C, D, E, W[0] ); + P( E, A, B, C, D, W[1] ); + P( D, E, A, B, C, W[2] ); + P( C, D, E, A, B, W[3] ); + P( B, C, D, E, A, W[4] ); + P( A, B, C, D, E, W[5] ); + P( E, A, B, C, D, W[6] ); + P( D, E, A, B, C, W[7] ); + P( C, D, E, A, B, W[8] ); + P( B, C, D, E, A, W[9] ); + P( A, B, C, D, E, W[10] ); + P( E, A, B, C, D, W[11] ); + P( D, E, A, B, C, W[12] ); + P( C, D, E, A, B, W[13] ); + P( B, C, D, E, A, W[14] ); + P( A, B, C, D, E, W[15] ); + P( E, A, B, C, D, R(16) ); + P( D, E, A, B, C, R(17) ); + P( C, D, E, A, B, R(18) ); + P( B, C, D, E, A, R(19) ); + +#undef K +#undef F + +#define F(x,y,z) (x ^ y ^ z) +#define K 0x6ED9EBA1 + + P( A, B, C, D, E, R(20) ); + P( E, A, B, C, D, R(21) ); + P( D, E, A, B, C, R(22) ); + P( C, D, E, A, B, R(23) ); + P( B, C, D, E, A, R(24) ); + P( A, B, C, D, E, R(25) ); + P( E, A, B, C, D, R(26) ); + P( D, E, A, B, C, R(27) ); + P( C, D, E, A, B, R(28) ); + P( B, C, D, E, A, R(29) ); + P( A, B, C, D, E, R(30) ); + P( E, A, B, C, D, R(31) ); + P( D, E, A, B, C, R(32) ); + P( C, D, E, A, B, R(33) ); + P( B, C, D, E, A, R(34) ); + P( A, B, C, D, E, R(35) ); + P( E, A, B, C, D, R(36) ); + P( D, E, A, B, C, R(37) ); + P( C, D, E, A, B, R(38) ); + P( B, C, D, E, A, R(39) ); + +#undef K +#undef F + +#define F(x,y,z) ((x & y) | (z & (x | y))) +#define K 0x8F1BBCDC + + P( A, B, C, D, E, R(40) ); + P( E, A, B, C, D, R(41) ); + P( D, E, A, B, C, R(42) ); + P( C, D, E, A, B, R(43) ); + P( B, C, D, E, A, R(44) ); + P( A, B, C, D, E, R(45) ); + P( E, A, B, C, D, R(46) ); + P( D, E, A, B, C, R(47) ); + P( C, D, E, A, B, R(48) ); + P( B, C, D, E, A, R(49) ); + P( A, B, C, D, E, R(50) ); + P( E, A, B, C, D, R(51) ); + P( D, E, A, B, C, R(52) ); + P( C, D, E, A, B, R(53) ); + P( B, C, D, E, A, R(54) ); + P( A, B, C, D, E, R(55) ); + P( E, A, B, C, D, R(56) ); + P( D, E, A, B, C, R(57) ); + P( C, D, E, A, B, R(58) ); + P( B, C, D, E, A, R(59) ); + +#undef K +#undef F + +#define F(x,y,z) (x ^ y ^ z) +#define K 0xCA62C1D6 + + P( A, B, C, D, E, R(60) ); + P( E, A, B, C, D, R(61) ); + P( D, E, A, B, C, R(62) ); + P( C, D, E, A, B, R(63) ); + P( B, C, D, E, A, R(64) ); + P( A, B, C, D, E, R(65) ); + P( E, A, B, C, D, R(66) ); + P( D, E, A, B, C, R(67) ); + P( C, D, E, A, B, R(68) ); + P( B, C, D, E, A, R(69) ); + P( A, B, C, D, E, R(70) ); + P( E, A, B, C, D, R(71) ); + P( D, E, A, B, C, R(72) ); + P( C, D, E, A, B, R(73) ); + P( B, C, D, E, A, R(74) ); + P( A, B, C, D, E, R(75) ); + P( E, A, B, C, D, R(76) ); + P( D, E, A, B, C, R(77) ); + P( C, D, E, A, B, R(78) ); + P( B, C, D, E, A, R(79) ); + +#undef K +#undef F + +#undef S +#undef R +#undef P + + A += h0; + B += h1; + C += h2; + D += h3; + + out[0] = A; + out[1] = B; + out[2] = C; + out[3] = D; +} + diff --git a/crypto.h b/crypto.h index 49652ea..f9953a6 100644 --- a/crypto.h +++ b/crypto.h @@ -12,20 +12,6 @@ void sha1_16(const unsigned char in[16], unsigned char out[16]); void aes_gen_tables(void); -typedef struct { - unsigned char FSb[256]; - unsigned int FT0[256]; - unsigned int FT1[256]; - unsigned int FT2[256]; - unsigned int FT3[256]; - unsigned char RSb[256]; - unsigned int RT0[256]; - unsigned int RT1[256]; - unsigned int RT2[256]; - unsigned int RT3[256]; - unsigned int RCON[10]; -} AES_Tables; - void aes_set_key_enc_128(unsigned int rk[RK_LEN], const unsigned char *key); void aes_encrypt_128(const unsigned int rk[8], const unsigned char input[16], unsigned char output[16]); diff --git a/kernel.c b/kernel.c deleted file mode 100644 index 949e12b..0000000 --- a/kernel.c +++ /dev/null @@ -1,385 +0,0 @@ - -// see sha1_16.c and aes128.c - -typedef unsigned int uint32_t; - -#ifndef GET_UINT32_BE -#define GET_UINT32_BE(n,b,i) \ -{ \ - (n) = ( (uint32_t) (b)[(i) ] << 24 ) \ - | ( (uint32_t) (b)[(i) + 1] << 16 ) \ - | ( (uint32_t) (b)[(i) + 2] << 8 ) \ - | ( (uint32_t) (b)[(i) + 3] ); \ -} -#endif - -#ifndef PUT_UINT32_BE -#define PUT_UINT32_BE(n,b,i) \ -{ \ - (b)[(i) ] = (unsigned char) ( (n) >> 24 ); \ - (b)[(i) + 1] = (unsigned char) ( (n) >> 16 ); \ - (b)[(i) + 2] = (unsigned char) ( (n) >> 8 ); \ - (b)[(i) + 3] = (unsigned char) ( (n) ); \ -} -#endif - -void sha1_16(const uint32_t *in, uint32_t *out){ - const uint32_t - h0 = 0x67452301, - h1 = 0xEFCDAB89, - h2 = 0x98BADCFE, - h3 = 0x10325476, - h4 = 0xC3D2E1F0; - - uint32_t temp, W[16], - A = h0, B = h1, C = h2, D = h3, E = h4; - - W[0] = in[0]; W[1] = in[1]; W[2] = in[2]; W[3] = in[3]; - W[4] = 0x80000000u; W[5] = 0; W[6] = 0; W[7] = 0; - W[8] = 0; W[9] = 0; W[10] = 0; W[11] = 0; - W[12] = 0; W[13] = 0; W[14] = 0; W[15] = 0x80u; - -#define S(x,n) ((x << n) | ((x & 0xFFFFFFFF) >> (32 - n))) - -#define R(t) \ -( \ - temp = W[( t - 3 ) & 0x0F] ^ W[( t - 8 ) & 0x0F] ^ \ - W[( t - 14 ) & 0x0F] ^ W[ t & 0x0F], \ - ( W[t & 0x0F] = S(temp,1) ) \ -) - -#define P(a,b,c,d,e,x) \ -{ \ - e += S(a,5) + F(b,c,d) + K + x; b = S(b,30); \ -} - -#define F(x,y,z) (z ^ (x & (y ^ z))) -#define K 0x5A827999 - - P( A, B, C, D, E, W[0] ); - P( E, A, B, C, D, W[1] ); - P( D, E, A, B, C, W[2] ); - P( C, D, E, A, B, W[3] ); - P( B, C, D, E, A, W[4] ); - P( A, B, C, D, E, W[5] ); - P( E, A, B, C, D, W[6] ); - P( D, E, A, B, C, W[7] ); - P( C, D, E, A, B, W[8] ); - P( B, C, D, E, A, W[9] ); - P( A, B, C, D, E, W[10] ); - P( E, A, B, C, D, W[11] ); - P( D, E, A, B, C, W[12] ); - P( C, D, E, A, B, W[13] ); - P( B, C, D, E, A, W[14] ); - P( A, B, C, D, E, W[15] ); - P( E, A, B, C, D, R(16) ); - P( D, E, A, B, C, R(17) ); - P( C, D, E, A, B, R(18) ); - P( B, C, D, E, A, R(19) ); - -#undef K -#undef F - -#define F(x,y,z) (x ^ y ^ z) -#define K 0x6ED9EBA1 - - P( A, B, C, D, E, R(20) ); - P( E, A, B, C, D, R(21) ); - P( D, E, A, B, C, R(22) ); - P( C, D, E, A, B, R(23) ); - P( B, C, D, E, A, R(24) ); - P( A, B, C, D, E, R(25) ); - P( E, A, B, C, D, R(26) ); - P( D, E, A, B, C, R(27) ); - P( C, D, E, A, B, R(28) ); - P( B, C, D, E, A, R(29) ); - P( A, B, C, D, E, R(30) ); - P( E, A, B, C, D, R(31) ); - P( D, E, A, B, C, R(32) ); - P( C, D, E, A, B, R(33) ); - P( B, C, D, E, A, R(34) ); - P( A, B, C, D, E, R(35) ); - P( E, A, B, C, D, R(36) ); - P( D, E, A, B, C, R(37) ); - P( C, D, E, A, B, R(38) ); - P( B, C, D, E, A, R(39) ); - -#undef K -#undef F - -#define F(x,y,z) ((x & y) | (z & (x | y))) -#define K 0x8F1BBCDC - - P( A, B, C, D, E, R(40) ); - P( E, A, B, C, D, R(41) ); - P( D, E, A, B, C, R(42) ); - P( C, D, E, A, B, R(43) ); - P( B, C, D, E, A, R(44) ); - P( A, B, C, D, E, R(45) ); - P( E, A, B, C, D, R(46) ); - P( D, E, A, B, C, R(47) ); - P( C, D, E, A, B, R(48) ); - P( B, C, D, E, A, R(49) ); - P( A, B, C, D, E, R(50) ); - P( E, A, B, C, D, R(51) ); - P( D, E, A, B, C, R(52) ); - P( C, D, E, A, B, R(53) ); - P( B, C, D, E, A, R(54) ); - P( A, B, C, D, E, R(55) ); - P( E, A, B, C, D, R(56) ); - P( D, E, A, B, C, R(57) ); - P( C, D, E, A, B, R(58) ); - P( B, C, D, E, A, R(59) ); - -#undef K -#undef F - -#define F(x,y,z) (x ^ y ^ z) -#define K 0xCA62C1D6 - - P( A, B, C, D, E, R(60) ); - P( E, A, B, C, D, R(61) ); - P( D, E, A, B, C, R(62) ); - P( C, D, E, A, B, R(63) ); - P( B, C, D, E, A, R(64) ); - P( A, B, C, D, E, R(65) ); - P( E, A, B, C, D, R(66) ); - P( D, E, A, B, C, R(67) ); - P( C, D, E, A, B, R(68) ); - P( B, C, D, E, A, R(69) ); - P( A, B, C, D, E, R(70) ); - P( E, A, B, C, D, R(71) ); - P( D, E, A, B, C, R(72) ); - P( C, D, E, A, B, R(73) ); - P( B, C, D, E, A, R(74) ); - P( A, B, C, D, E, R(75) ); - P( E, A, B, C, D, R(76) ); - P( D, E, A, B, C, R(77) ); - P( C, D, E, A, B, R(78) ); - P( B, C, D, E, A, R(79) ); - -#undef K -#undef F - -#undef S -#undef R -#undef P - - A += h0; - B += h1; - C += h2; - D += h3; - - out[0] = A; - out[1] = B; - out[2] = C; - out[3] = D; -} - -// cl doesn't have include, keep this identical to crypto.h -typedef struct { - unsigned char fSb[256]; - unsigned int fT0[256]; - unsigned int fT1[256]; - unsigned int fT2[256]; - unsigned int fT3[256]; - unsigned char rSb[256]; - unsigned int rT0[256]; - unsigned int rT1[256]; - unsigned int rT2[256]; - unsigned int rT3[256]; - unsigned int rCON[10]; -} AES_Tables; -// I hope this doesn't induce any performace penalties -#define FSb p_tables->fSb -#define FT0 p_tables->fT0 -#define FT1 p_tables->fT1 -#define FT2 p_tables->fT2 -#define FT3 p_tables->fT3 -#define RSb p_tables->rSb -#define RT0 p_tables->rT0 -#define RT1 p_tables->rT1 -#define RT2 p_tables->rT2 -#define RT3 p_tables->rT3 -#define RCON p_tables->rCON - -/* OpenCL doesn't allow this kind of pointer cast -#define GET_UINT32_LE(n, b, i) \ - (n) = *(uint32_t*)(b + i) -#define PUT_UINT32_LE(n, b, i) \ - *(uint32_t*)(b + i) = (n) -*/ - -#ifndef GET_UINT32_LE -#define GET_UINT32_LE(n,b,i) \ -{ \ - (n) = ( (uint32_t) (b)[(i) ] ) \ - | ( (uint32_t) (b)[(i) + 1] << 8 ) \ - | ( (uint32_t) (b)[(i) + 2] << 16 ) \ - | ( (uint32_t) (b)[(i) + 3] << 24 ); \ -} -#endif - -#ifndef PUT_UINT32_LE -#define PUT_UINT32_LE(n,b,i) \ -{ \ - (b)[(i) ] = (unsigned char) ( ( (n) ) & 0xFF ); \ - (b)[(i) + 1] = (unsigned char) ( ( (n) >> 8 ) & 0xFF ); \ - (b)[(i) + 2] = (unsigned char) ( ( (n) >> 16 ) & 0xFF ); \ - (b)[(i) + 3] = (unsigned char) ( ( (n) >> 24 ) & 0xFF ); \ -} -#endif - -#define RK_LEN 44 - -void aes_set_key_enc_128(__global const AES_Tables *p_tables, - uint32_t rk[RK_LEN] -){ - uint32_t *RK = rk; - - for (unsigned i = 0; i < 10; ++i, RK += 4) { - RK[4] = RK[0] ^ RCON[i] ^ - ( (uint32_t) FSb[ ( RK[3] >> 8 ) & 0xFF ] ) ^ - ( (uint32_t) FSb[ ( RK[3] >> 16 ) & 0xFF ] << 8 ) ^ - ( (uint32_t) FSb[ ( RK[3] >> 24 ) & 0xFF ] << 16 ) ^ - ( (uint32_t) FSb[ ( RK[3] ) & 0xFF ] << 24 ); - - RK[5] = RK[1] ^ RK[4]; - RK[6] = RK[2] ^ RK[5]; - RK[7] = RK[3] ^ RK[6]; - } -} - -#define AES_FROUND(X0,X1,X2,X3,Y0,Y1,Y2,Y3) \ -{ \ - X0 = *RK++ ^ FT0[ ( Y0 ) & 0xFF ] ^ \ - FT1[ ( Y1 >> 8 ) & 0xFF ] ^ \ - FT2[ ( Y2 >> 16 ) & 0xFF ] ^ \ - FT3[ ( Y3 >> 24 ) & 0xFF ]; \ - \ - X1 = *RK++ ^ FT0[ ( Y1 ) & 0xFF ] ^ \ - FT1[ ( Y2 >> 8 ) & 0xFF ] ^ \ - FT2[ ( Y3 >> 16 ) & 0xFF ] ^ \ - FT3[ ( Y0 >> 24 ) & 0xFF ]; \ - \ - X2 = *RK++ ^ FT0[ ( Y2 ) & 0xFF ] ^ \ - FT1[ ( Y3 >> 8 ) & 0xFF ] ^ \ - FT2[ ( Y0 >> 16 ) & 0xFF ] ^ \ - FT3[ ( Y1 >> 24 ) & 0xFF ]; \ - \ - X3 = *RK++ ^ FT0[ ( Y3 ) & 0xFF ] ^ \ - FT1[ ( Y0 >> 8 ) & 0xFF ] ^ \ - FT2[ ( Y1 >> 16 ) & 0xFF ] ^ \ - FT3[ ( Y2 >> 24 ) & 0xFF ]; \ -} - -inline void aes_encrypt_128(__global const AES_Tables *p_tables, - const uint32_t rk[RK_LEN], - const uint32_t *in, uint32_t *out -){ - const uint32_t *RK = rk; - uint32_t X0 = in[0], X1 = in[1], X2 = in[2], X3 = in[3], - Y0, Y1, Y2, Y3; - - X0 ^= *RK++; - X1 ^= *RK++; - X2 ^= *RK++; - X3 ^= *RK++; - - AES_FROUND( Y0, Y1, Y2, Y3, X0, X1, X2, X3 ); - AES_FROUND( X0, X1, X2, X3, Y0, Y1, Y2, Y3 ); - AES_FROUND( Y0, Y1, Y2, Y3, X0, X1, X2, X3 ); - AES_FROUND( X0, X1, X2, X3, Y0, Y1, Y2, Y3 ); - AES_FROUND( Y0, Y1, Y2, Y3, X0, X1, X2, X3 ); - AES_FROUND( X0, X1, X2, X3, Y0, Y1, Y2, Y3 ); - AES_FROUND( Y0, Y1, Y2, Y3, X0, X1, X2, X3 ); - AES_FROUND( X0, X1, X2, X3, Y0, Y1, Y2, Y3 ); - AES_FROUND( Y0, Y1, Y2, Y3, X0, X1, X2, X3 ); - - X0 = *RK++ ^ \ - ( (uint32_t) FSb[ ( Y0 ) & 0xFF ] ) ^ - ( (uint32_t) FSb[ ( Y1 >> 8 ) & 0xFF ] << 8 ) ^ - ( (uint32_t) FSb[ ( Y2 >> 16 ) & 0xFF ] << 16 ) ^ - ( (uint32_t) FSb[ ( Y3 >> 24 ) & 0xFF ] << 24 ); - - X1 = *RK++ ^ \ - ( (uint32_t) FSb[ ( Y1 ) & 0xFF ] ) ^ - ( (uint32_t) FSb[ ( Y2 >> 8 ) & 0xFF ] << 8 ) ^ - ( (uint32_t) FSb[ ( Y3 >> 16 ) & 0xFF ] << 16 ) ^ - ( (uint32_t) FSb[ ( Y0 >> 24 ) & 0xFF ] << 24 ); - - X2 = *RK++ ^ \ - ( (uint32_t) FSb[ ( Y2 ) & 0xFF ] ) ^ - ( (uint32_t) FSb[ ( Y3 >> 8 ) & 0xFF ] << 8 ) ^ - ( (uint32_t) FSb[ ( Y0 >> 16 ) & 0xFF ] << 16 ) ^ - ( (uint32_t) FSb[ ( Y1 >> 24 ) & 0xFF ] << 24 ); - - X3 = *RK ^ \ - ( (uint32_t) FSb[ ( Y3 ) & 0xFF ] ) ^ - ( (uint32_t) FSb[ ( Y0 >> 8 ) & 0xFF ] << 8 ) ^ - ( (uint32_t) FSb[ ( Y1 >> 16 ) & 0xFF ] << 16 ) ^ - ( (uint32_t) FSb[ ( Y2 >> 24 ) & 0xFF ] << 24 ); - - out[0] = X0; - out[1] = X1; - out[2] = X2; - out[3] = X3; -} - -__kernel void sha1_16_test( - __global const unsigned char *in, - __global unsigned char *out -){ - unsigned offset = get_global_id(0) * BLOCKS_PER_ITEM * 16; -#if BLOCKS_PER_ITEM != 1 - for(unsigned i = 0; i < BLOCKS_PER_ITEM; ++i){ -#endif - uint32_t local_buf[4]; - GET_UINT32_BE(local_buf[0], in, offset); - GET_UINT32_BE(local_buf[1], in, offset + 4); - GET_UINT32_BE(local_buf[2], in, offset + 8); - GET_UINT32_BE(local_buf[3], in, offset + 12); - sha1_16(local_buf, local_buf); - PUT_UINT32_BE(local_buf[0], out, offset); - PUT_UINT32_BE(local_buf[1], out, offset + 4); - PUT_UINT32_BE(local_buf[2], out, offset + 8); - PUT_UINT32_BE(local_buf[3], out, offset + 12); -#if BLOCKS_PER_ITEM != 1 - offset += 16; - } -#endif -} - -#define AES_BLOCK_SIZE 16; - -__kernel void aes_128_ecb_test( - __global const unsigned char *in, - __global unsigned char *out, - __global const AES_Tables *p_tables, - __global const unsigned char *key -){ - uint32_t rk[RK_LEN]; - GET_UINT32_LE(rk[0], key, 0); - GET_UINT32_LE(rk[1], key, 4); - GET_UINT32_LE(rk[2], key, 8); - GET_UINT32_LE(rk[3], key, 12); - aes_set_key_enc_128(p_tables, rk); - unsigned offset = get_global_id(0) * BLOCKS_PER_ITEM * AES_BLOCK_SIZE; -#if BLOCKS_PER_ITEM != 1 - for (unsigned i = 0; i < BLOCKS_PER_ITEM; ++i) { -#endif - uint32_t local_buf[4]; - GET_UINT32_LE(local_buf[0], in, offset); - GET_UINT32_LE(local_buf[1], in, offset + 4); - GET_UINT32_LE(local_buf[2], in, offset + 8); - GET_UINT32_LE(local_buf[3], in, offset + 12); - aes_encrypt_128(p_tables, rk, local_buf, local_buf); - PUT_UINT32_LE(local_buf[0], out, offset); - PUT_UINT32_LE(local_buf[1], out, offset + 4); - PUT_UINT32_LE(local_buf[2], out, offset + 8); - PUT_UINT32_LE(local_buf[3], out, offset + 12); -#if BLOCKS_PER_ITEM != 1 - offset += AES_BLOCK_SIZE; - } -#endif -}