diff --git a/.gitignore b/.gitignore index 1ce4443..20c43da 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,5 @@ *.o *.exe *.sln -*.vcxproj* +*.vc* x64/ diff --git a/Makefile b/Makefile index 0ffe5bd..e061db9 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # only tested in mingw PNAME = bfcl -OBJS = $(PNAME).o ocl_util.o utils.o sha1_16.o aes_128.o +OBJS = $(PNAME).o ocl_util.o utils.o sha1_16.o aes_128.o ocl_test.o ocl_brute.o CFLAGS += -std=c11 -Wall -O2 -mrdrnd -I$(INTELOCLSDKROOT)/include LDFLAGS += -L$(INTELOCLSDKROOT)/lib/x64 diff --git a/bfcl.c b/bfcl.c index 2fb5d83..a50f26c 100644 --- a/bfcl.c +++ b/bfcl.c @@ -1,285 +1,19 @@ #include -#include -#include -#include #include "ocl.h" -#include "crypto.h" -#include "utils.h" -#ifdef __GNUC__ -#include -#elif _MSC_VER -#include -#endif - -int cpu_has_rdrand() { -#if __GNUC__ - unsigned a = 0, b = 0, c = 0, d = 0; - __get_cpuid(1, &a, &b, &c, &d); - return c & bit_RDRND; -#elif _MSC_VER - int regs[4]; - __cpuid(regs, 1); - return regs[2] & (1<<30); -#else - // ICL only? - return _may_i_use_cpu_feature(_FEATURE_RDRND); -#endif -} - -// CAUTION: caller is responsible to free the buf -char * read_file(const char *file_name, size_t *p_size) { - FILE * f = fopen(file_name, "rb"); - if (f == NULL) { - printf("can't read file: %s", file_name); - exit(-1); - } - fseek(f, 0, SEEK_END); - *p_size = ftell(f); - char * buf = malloc(*p_size); - fseek(f, 0, SEEK_SET); - fread(buf, *p_size, 1, f); - fclose(f); - return buf; -} - -void read_files(unsigned num_files, const char *file_names[], char *ptrs[], size_t sizes[]) { - for (unsigned i = 0; i < num_files; ++i) { - ptrs[i] = read_file(file_names[i], &sizes[i]); - } -} - -void dump_to_file(const char *file_name, const void *buf, size_t len) { - FILE *f = fopen(file_name, "wb"); - if (f == NULL) { - printf("can't open file to write: %s\n", file_name); - return; - } - fwrite(buf, len, 1, f); - fclose(f); -} - -#define TEST_SHA1_16 1 -#define TEST_AES_128_ECB 2 - -#define BLOCK_SIZE 0x10 -#define NUM_BLOCKS (1 << 23) -#define BLOCKS_PER_ITEM 1 - -void ocl_test(cl_device_id device_id, const cl_uchar *buf_in, int test_case) { - cl_int err; - cl_context context = OCL_ASSERT2(clCreateContext(0, 1, &device_id, NULL, NULL, &err)); - cl_command_queue command_queue = OCL_ASSERT2(clCreateCommandQueue(context, device_id, 0, &err)); - - HP_Time t0, t1; - long long td; - - const size_t num_items = NUM_BLOCKS / BLOCKS_PER_ITEM; - const size_t io_buf_len = NUM_BLOCKS * BLOCK_SIZE; - - const char *source_names[] = { "cl/sha1_16.cl", "cl/aes_tables.cl", "cl/aes_128.cl", "cl/kernels.cl" }; - const unsigned num_sources = sizeof(source_names) / sizeof(char *); - char *sources[sizeof(source_names) / sizeof(char *)]; - size_t source_sizes[sizeof(source_names) / sizeof(char *)]; - read_files(num_sources, source_names, sources, source_sizes); - - get_hp_time(&t0); - // WTF? GCC complains if I pass char ** in to a function expecting const char **? - cl_program program = OCL_ASSERT2(clCreateProgramWithSource(context, num_sources, (const char **)sources, source_sizes, &err)); - char options[0x100]; - sprintf(options, "-w -Werror -DBLOCKS_PER_ITEM=%d", BLOCKS_PER_ITEM); - // printf("compiler options: %s\n", options); - err = clBuildProgram(program, 0, NULL, options, NULL, NULL); - get_hp_time(&t1); - printf("%d microseconds for compile\n", (int)hp_time_diff(&t0, &t1)); - for (unsigned i = 0; i < num_sources; ++i) { - free(sources[i]); - } - if (err != CL_SUCCESS) { - fprintf(stderr, "failed to build program, error: %s, build log:\n", ocl_err_msg(err)); - size_t len; - clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &len); - char *buf_log = malloc(len + 1); - clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, len, buf_log, NULL); - buf_log[len] = '\0'; - fprintf(stderr, "%s\n", buf_log); - free(buf_log); - exit(err); - } - - const char * test_name; - switch (test_case) { - case TEST_SHA1_16: test_name = "sha1_16_test"; break; - case TEST_AES_128_ECB: test_name = "aes_128_ecb_test"; break; - default: exit(-1); - } - - printf("%s on %u MB\n", test_name, (unsigned)io_buf_len >> 20); +void ocl_test(); - cl_kernel kernel = OCL_ASSERT2(clCreateKernel(program, test_name, &err)); - - cl_uchar key[16]; - unsigned int aes_rk[RK_LEN]; - if (test_case == TEST_AES_128_ECB) { - aes_gen_tables(); - for (unsigned i = 0; i < 16; ++i) { - key[i] = rand() & 0xff; - } - printf("Key: %s\n", hexdump(key, 16, 0)); - } - - cl_mem mem_in = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_ONLY, io_buf_len, NULL, &err)); - cl_mem mem_out = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, io_buf_len, NULL, &err)); - cl_mem mem_key; - if (test_case == TEST_AES_128_ECB) { - mem_key = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_ONLY, 16, NULL, &err)); - } - - get_hp_time(&t0); - OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_in, CL_TRUE, 0, io_buf_len, buf_in, 0, NULL, NULL)); - if (test_case == TEST_AES_128_ECB) { - OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_key, CL_TRUE, 0, 16, key, 0, NULL, NULL)); - } - get_hp_time(&t1); - td = hp_time_diff(&t0, &t1); - printf("%d microseconds for data upload, %.2f MB/s\n", (int)td, io_buf_len * 1.0f / td); - - OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_in)); - OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_out)); - if (test_case == TEST_AES_128_ECB) { - OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_key)); - } - - size_t local; - OCL_ASSERT(clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL)); - printf("local work size: %u\n", (unsigned)local); - - get_hp_time(&t0); - // apparently, setting local work size to NULL doesn't affect performance, at least in this kind of work - OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, &local, 0, NULL, NULL)); - // OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, NULL, 0, NULL, NULL)); - clFinish(command_queue); - get_hp_time(&t1); - td = hp_time_diff(&t0, &t1); - printf("%d microseconds for OpenCL, %.2f MB/s\n", (int)td, io_buf_len * 1.0f / td); - - cl_uchar *buf_out = malloc(io_buf_len); - get_hp_time(&t0); - OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_out, CL_TRUE, 0, io_buf_len, buf_out, 0, NULL, NULL)); - get_hp_time(&t1); - td = hp_time_diff(&t0, &t1); - printf("%d microseconds for data download, %.2f MB/s\n", (int)td, io_buf_len * 1.0f / td); - - /* - if(test_case == TEST_AES_128_ECB){ - dump_to_file("r:/test_aes_in.bin", buf_in, io_buf_len); - dump_to_file("r:/test_aes_out.bin", buf_out, io_buf_len); - } - */ - - cl_uchar *buf_verify = malloc(io_buf_len); - get_hp_time(&t0); - if (test_case == TEST_SHA1_16) { - for (unsigned offset = 0; offset < io_buf_len; offset += BLOCK_SIZE) { - sha1_16(buf_in + offset, buf_verify + offset); - } - } else { - for (unsigned offset = 0; offset < io_buf_len; offset += BLOCK_SIZE) { - // setting the same key over and over is stupid - // yet we still do it to keep it in line with the OpenCL port - // otherwise we can't test the set key in OpenCL - aes_set_key_enc_128(aes_rk, key); - aes_encrypt_128(aes_rk, buf_in + offset, buf_verify + offset); - } - } - get_hp_time(&t1); - td = hp_time_diff(&t0, &t1); - printf("%d microseconds for C(single thread), %.2f MB/s\n", (int)td, io_buf_len * 1.0f / td); - - if (memcmp(buf_verify, buf_out, io_buf_len)) { - printf("%s: verification failed\n", test_name); - for (unsigned offset = 0; offset < io_buf_len; offset += BLOCK_SIZE) { - if (memcmp(buf_verify + offset, buf_out + offset, BLOCK_SIZE)) { - printf("first difference @ 0x%08x/0x%08x:\n", offset, (unsigned)num_items ); - printf("\t%s\n", hexdump(buf_in + offset, BLOCK_SIZE, 0)); - printf("\t%s\n", hexdump(buf_verify + offset, BLOCK_SIZE, 0)); - printf("\t%s\n", hexdump(buf_out + offset, BLOCK_SIZE, 0)); - break; - } - } - } else { - printf("%s: succeed\n", test_name); - } - - free(buf_out); - clReleaseMemObject(mem_in); - clReleaseMemObject(mem_out); - clReleaseProgram(program); - clReleaseKernel(kernel); - clReleaseCommandQueue(command_queue); - clReleaseContext(context); -} +void ocl_brute(); int main(int argc, const char *argv[]) { if (argc == 2 && !strcmp(argv[1], "info")) { cl_uint num_platforms; ocl_info(&num_platforms, 1); + } else if (argc == 2 && !strcmp(argv[1], "console_id")){ + ocl_brute(); } else if (argc == 1){ - cl_platform_id platform_id; - cl_device_id device_id; - ocl_get_device(&platform_id, &device_id); - if (platform_id == NULL || device_id == NULL) { - return -1; - } - - cl_uchar *buf_in = malloc(BLOCK_SIZE * NUM_BLOCKS); - srand(2501); - HP_Time t0, t1; long long td; - get_hp_time(&t0); - if(cpu_has_rdrand()){ - // ~190 MB/s @ X230, ~200 without the success check - printf("randomize source buffer using RDRAND\n"); - unsigned long long *p = (unsigned long long *)buf_in; - unsigned long long *p_end = (unsigned long long *)(buf_in + BLOCK_SIZE * NUM_BLOCKS); - int success = 1; - while (p < p_end) { - success &= _rdrand64_step(p++); - } - if (!success) { - printf("RDRND failed\n"); - exit(-1); - } - }else { - printf("randomize source buffer using AES OFB\n"); - // rand() & 0xff is about ~60 MB/s @ X230 - // it's worse than that AES single thread C, so OFB it is - // ~240 MB/s, even faster than RDRAND - srand(2501); - unsigned int aes_rk[RK_LEN]; - unsigned char key_iv[16 * 2]; - for (unsigned i = 0; i < 16 * 2; ++i) { - key_iv[i] = rand() & 0xff; - } - aes_set_key_enc_128(aes_rk, key_iv); - aes_encrypt_128(aes_rk, key_iv + 16, buf_in); - unsigned char *p_in = buf_in, *p_out = buf_in + 16, - *p_end = buf_in + BLOCK_SIZE * NUM_BLOCKS; - while (p_out < p_end) { - aes_encrypt_128(aes_rk, p_in, p_out); - p_in = p_out; - p_out += 16; - } - } - get_hp_time(&t1); - td = hp_time_diff(&t0, &t1); - printf("%d microseconds for preparing test data, %.2f MB/s\n", - (int)td, BLOCK_SIZE * NUM_BLOCKS * 1.0f / td); - - ocl_test(device_id, buf_in, TEST_SHA1_16); - ocl_test(device_id, buf_in, TEST_AES_128_ECB); - - free(buf_in); + ocl_test(); #ifdef _WIN32 system("pause"); #endif diff --git a/cl/aes_128.cl b/cl/aes_128.cl index 1da8a1e..2ecc72e 100644 --- a/cl/aes_128.cl +++ b/cl/aes_128.cl @@ -1,18 +1,9 @@ -// OpenCL has these fancy address space qualifiers that can't be cast without -#define GET_UINT32_LE(n, b, i) \ - (n) = *(uint32_t*)(b + i) -#define GET_UINT32_LE_G(n, b, i) \ - (n) = *(__global uint32_t*)(b + i) -#define GET_UINT32_LE_C(n, b, i) \ - (n) = *(__constant uint32_t*)(b + i) -#define PUT_UINT32_LE(n, b, i) \ - *(uint32_t*)(b + i) = (n) -#define PUT_UINT32_LE_G(n, b, i) \ - *(__global uint32_t*)(b + i) = (n) +// AES 128 ECB adapted for OpenCL, see "aes_128.c" for more info #define RK_LEN 44 +// the caller is responsible to put the key in rk void aes_set_key_enc_128(uint32_t rk[RK_LEN]) { uint32_t *RK = rk; diff --git a/cl/aes_tables.cl b/cl/aes_tables.cl index c2e65ae..22566ec 100644 --- a/cl/aes_tables.cl +++ b/cl/aes_tables.cl @@ -1,8 +1,11 @@ +// I'm obsessed with the idea of generating this file in memory +// on the other hand I know that's pointless... + /* * Forward S-box */ -__constant static const unsigned char FSb[256] = +static const unsigned char FSb[256] = { 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, @@ -109,19 +112,19 @@ __constant static const unsigned char FSb[256] = V(CB,B0,B0,7B), V(FC,54,54,A8), V(D6,BB,BB,6D), V(3A,16,16,2C) #define V(a,b,c,d) 0x##a##b##c##d -__constant static const uint32_t FT0[256] = { FT }; +static const uint32_t FT0[256] = { FT }; #undef V #define V(a,b,c,d) 0x##b##c##d##a -__constant static const uint32_t FT1[256] = { FT }; +static const uint32_t FT1[256] = { FT }; #undef V #define V(a,b,c,d) 0x##c##d##a##b -__constant static const uint32_t FT2[256] = { FT }; +static const uint32_t FT2[256] = { FT }; #undef V #define V(a,b,c,d) 0x##d##a##b##c -__constant static const uint32_t FT3[256] = { FT }; +static const uint32_t FT3[256] = { FT }; #undef V #undef FT @@ -129,7 +132,7 @@ __constant static const uint32_t FT3[256] = { FT }; /* * Round constants */ -__constant static const uint32_t RCON[10] = +static const uint32_t RCON[10] = { 0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020, 0x00000040, 0x00000080, diff --git a/cl/common.h b/cl/common.h new file mode 100644 index 0000000..1c00dc6 --- /dev/null +++ b/cl/common.h @@ -0,0 +1,39 @@ + +typedef unsigned int uint32_t; + +typedef unsigned char u8; +typedef unsigned int u32; +typedef unsigned long u64; + +#ifndef GET_UINT32_BE +#define GET_UINT32_BE(n,b,i) \ +{ \ + (n) = ( (uint32_t) (b)[(i) ] << 24 ) \ + | ( (uint32_t) (b)[(i) + 1] << 16 ) \ + | ( (uint32_t) (b)[(i) + 2] << 8 ) \ + | ( (uint32_t) (b)[(i) + 3] ); \ +} +#endif + +#ifndef PUT_UINT32_BE +#define PUT_UINT32_BE(n,b,i) \ +{ \ + (b)[(i) ] = (unsigned char) ( (n) >> 24 ); \ + (b)[(i) + 1] = (unsigned char) ( (n) >> 16 ); \ + (b)[(i) + 2] = (unsigned char) ( (n) >> 8 ); \ + (b)[(i) + 3] = (unsigned char) ( (n) ); \ +} +#endif + +// OpenCL has these fancy address space qualifiers that can't be cast without +#define GET_UINT32_LE(n, b, i) \ + (n) = *(uint32_t*)(b + i) +#define GET_UINT32_LE_G(n, b, i) \ + (n) = *(__global uint32_t*)(b + i) +#define GET_UINT32_LE_C(n, b, i) \ + (n) = *(__constant uint32_t*)(b + i) +#define PUT_UINT32_LE(n, b, i) \ + *(uint32_t*)(b + i) = (n) +#define PUT_UINT32_LE_G(n, b, i) \ + *(__global uint32_t*)(b + i) = (n) + diff --git a/cl/dsi.cl b/cl/dsi.cl new file mode 100644 index 0000000..6d7af7c --- /dev/null +++ b/cl/dsi.cl @@ -0,0 +1,67 @@ + +// more about this: https://github.com/Jimmy-Z/TWLbf/blob/master/dsi.c + +static const u64 DSi_KEY_Y[2] = + {0xbd4dc4d30ab9dc76ull, 0xe1a00005202ddd1dull}; + +static const u64 DSi_KEY_MAGIC[2] = + {0x2a680f5f1a4f3e79ull, 0xfffefb4e29590258ull}; + +static inline void xor_128(u64 *x, const u64 *a, const u64 *b){ + x[0] = a[0] ^ b[0]; + x[1] = a[1] ^ b[1]; +} + +static inline void add_128(u64 *a, const u64 *b){ + a[0] += b[0]; + if(a[0] < b[0]){ + a[1] += b[1] + 1; + }else{ + a[1] += b[1]; + } +} + +static inline void add_128_64(u64 *a, u64 b){ + a[0] += b; + if(a[0] < b){ + a[1] += 1; + } +} + +// Answer to life, universe and everything. +static inline void rol42_128(u64 *a){ + u64 t = a[1]; + a[1] = (t << 42 ) | (a[0] >> 22); + a[0] = (a[0] << 42 ) | (t >> 22); +} + +// eMMC Encryption for MBR/Partitions (AES-CTR, with console-specific key) +void dsi_make_key(u64 *key, u64 console_id){ + u32 h = console_id >> 32, l = (u32)console_id; + u32 key_x[4] = {l, l ^ 0x24ee6906, h ^ 0xe65b601d, h}; + // Key = ((Key_X XOR Key_Y) + FFFEFB4E295902582A680F5F1A4F3E79h) ROL 42 + // equivalent to F_XY in twltool/f_xy.c + xor_128(key, (u64*)key_x, DSi_KEY_Y); + add_128(key, DSi_KEY_MAGIC); + rol42_128(key); +} + +// CAUTION this one doesn't work in-place +void byte_reverse_16(u8 *out, const u8 *in){ + out[0] = in[15]; + out[1] = in[14]; + out[2] = in[13]; + out[3] = in[12]; + out[4] = in[11]; + out[5] = in[10]; + out[6] = in[9]; + out[7] = in[8]; + out[8] = in[7]; + out[9] = in[6]; + out[10] = in[5]; + out[11] = in[4]; + out[12] = in[3]; + out[13] = in[2]; + out[14] = in[1]; + out[15] = in[0]; +} diff --git a/cl/kernel_console_id.cl b/cl/kernel_console_id.cl new file mode 100644 index 0000000..819bfa1 --- /dev/null +++ b/cl/kernel_console_id.cl @@ -0,0 +1,31 @@ + +// the caller should feed the target xor pad byte reversed as two uint64_t +// the ctr from emmc_cid_sha1 byte reversed as 4 uint32_t +__kernel void test_console_id( + u64 xor_l, u64 xor_h, + u64 console_id_template, + u32 ctr0, u32 ctr1, u32 ctr2, u32 ctr3, + __global int *success, + __global u64 *console_id_out) +{ + if(success){ + return; + } + // TODO: BCD conversion + u64 console_id = get_global_id(0) | console_id_template; + u64 dsi_key[2]; + dsi_make_key(dsi_key, console_id); + + u32 aes_rk[RK_LEN]; + byte_reverse_16((u8*)aes_rk, (u8*)dsi_key); + aes_set_key_enc_128(aes_rk); + + u32 ctr[4] = {ctr0, ctr1, ctr2, ctr3}; + u64 xor[2]; + aes_encrypt_128(aes_rk, ctr, (u32*)xor); + + if(xor[0] == xor_l && xor[1] == xor_h){ + *success = 1; + *console_id_out = console_id; + } +} diff --git a/cl/kernels.cl b/cl/kernel_tests.cl similarity index 88% rename from cl/kernels.cl rename to cl/kernel_tests.cl index dbe8f9d..1e8f294 100644 --- a/cl/kernels.cl +++ b/cl/kernel_tests.cl @@ -1,6 +1,6 @@ __kernel void sha1_16_test( - __global const unsigned char *in, + __constant const unsigned char *in, __global unsigned char *out) { unsigned offset = get_global_id(0) * BLOCKS_PER_ITEM * 16; @@ -26,9 +26,9 @@ __kernel void sha1_16_test( #define AES_BLOCK_SIZE 16 __kernel void aes_128_ecb_test( - __global const uint32_t *in, - __global uint32_t *out, - __constant const uint32_t *key) + __constant const uint32_t *key, + __constant const uint32_t *in, + __global uint32_t *out) { uint32_t rk[RK_LEN]; rk[0] = key[0]; rk[1] = key[1]; rk[2] = key[2]; rk[3] = key[3]; @@ -43,7 +43,7 @@ __kernel void aes_128_ecb_test( aes_encrypt_128(rk, buf, buf); out[0] = buf[0]; out[1] = buf[1]; out[2] = buf[2]; out[3] = buf[3]; #if BLOCKS_PER_ITEM != 1 - offset += AES_BLOCK_SIZE; + offset += AES_BLOCK_SIZE / 4; } #endif } diff --git a/cl/sha1_16.cl b/cl/sha1_16.cl index 9fabc70..3b1eb89 100644 --- a/cl/sha1_16.cl +++ b/cl/sha1_16.cl @@ -1,25 +1,5 @@ -typedef unsigned int uint32_t; - -#ifndef GET_UINT32_BE -#define GET_UINT32_BE(n,b,i) \ -{ \ - (n) = ( (uint32_t) (b)[(i) ] << 24 ) \ - | ( (uint32_t) (b)[(i) + 1] << 16 ) \ - | ( (uint32_t) (b)[(i) + 2] << 8 ) \ - | ( (uint32_t) (b)[(i) + 3] ); \ -} -#endif - -#ifndef PUT_UINT32_BE -#define PUT_UINT32_BE(n,b,i) \ -{ \ - (b)[(i) ] = (unsigned char) ( (n) >> 24 ); \ - (b)[(i) + 1] = (unsigned char) ( (n) >> 16 ); \ - (b)[(i) + 2] = (unsigned char) ( (n) >> 8 ); \ - (b)[(i) + 3] = (unsigned char) ( (n) ); \ -} -#endif +// sha1_16 adapted for OpenCL, see "sha1_16.c" for more information __constant const uint32_t h0 = 0x67452301, diff --git a/ocl.h b/ocl.h index e7d3f55..81b074c 100644 --- a/ocl.h +++ b/ocl.h @@ -46,3 +46,7 @@ void ocl_assert(cl_int ret, const char * code, const char * file, OCL_Platform *ocl_info(cl_uint *p_num_platforms, int verbose); void ocl_get_device(cl_platform_id *p_platform_id, cl_device_id *p_device_id); + +cl_program ocl_build_from_sources( + unsigned num_sources, const char *source_names[], + cl_context context, cl_device_id device_id, const char * options); diff --git a/ocl_brute.c b/ocl_brute.c new file mode 100644 index 0000000..cd99e33 --- /dev/null +++ b/ocl_brute.c @@ -0,0 +1,77 @@ + +#include +#include "utils.h" +#include "crypto.h" +#include "ocl.h" + +void ocl_brute() { + TimeHP t0, t1; long long td; + + cl_int err; + cl_platform_id platform_id; + cl_device_id device_id; + ocl_get_device(&platform_id, &device_id); + if (platform_id == NULL || device_id == NULL) { + return; + } + + + cl_context context = OCL_ASSERT2(clCreateContext(0, 1, &device_id, NULL, NULL, &err)); + cl_command_queue command_queue = OCL_ASSERT2(clCreateCommandQueue(context, device_id, 0, &err)); + + const char *source_names[] = { + "cl/common.h", + "cl/aes_tables.cl", + "cl/aes_128.cl", + "cl/dsi.cl", + "cl/kernel_console_id.cl" }; + cl_program program = ocl_build_from_sources(sizeof(source_names) / sizeof(char *), + source_names, context, device_id, NULL /* "-w -Werror" */); + + cl_kernel kernel = OCL_ASSERT2(clCreateKernel(program, "test_console_id", &err)); + + cl_ulong xor_l, xor_h, console_id_template = 0x08a1522617110100ull, out; + cl_uint ctr0, ctr1, ctr2, ctr3; + cl_int success = 0; + + cl_mem mem_success = + OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &err)); + cl_mem mem_out = + OCL_ASSERT2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_ulong), NULL, &err)); + + OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_success, CL_TRUE, 0, sizeof(cl_int), &success, 0, NULL, NULL)); + + OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_ulong), &xor_l)); + OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &xor_h)); + OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &console_id_template)); + OCL_ASSERT(clSetKernelArg(kernel, 3, sizeof(cl_uint), &ctr0)); + OCL_ASSERT(clSetKernelArg(kernel, 4, sizeof(cl_uint), &ctr1)); + OCL_ASSERT(clSetKernelArg(kernel, 5, sizeof(cl_uint), &ctr2)); + OCL_ASSERT(clSetKernelArg(kernel, 6, sizeof(cl_uint), &ctr3)); + OCL_ASSERT(clSetKernelArg(kernel, 7, sizeof(cl_mem), &mem_success)); + OCL_ASSERT(clSetKernelArg(kernel, 8, sizeof(cl_mem), &mem_out)); + + size_t local; + OCL_ASSERT(clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL)); + printf("local work size: %u\n", (unsigned)local); + + size_t num_items = 0x100; + + get_hp_time(&t0); + OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, &local, 0, NULL, NULL)); + clFinish(command_queue); + get_hp_time(&t1); td = hp_time_diff(&t0, &t1); + + OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_success, CL_TRUE, 0, sizeof(cl_int), &success, 0, NULL, NULL)); + if (success) { + // if success, the speed measurement is invalid + printf("got a hit in %d microseconds\n", (int)td); + OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_ulong), &out, 0, NULL, NULL)); + printf("%08x%08x\n", (unsigned)(out >> 32), (unsigned)(out|0xffffffffu)); + } else { + printf("%d microseconds, %.2f M/s\n", (int)td, num_items * 1.0f / td); + printf("sorry, no hit\n"); + } + + clReleaseKernel(kernel); +} \ No newline at end of file diff --git a/ocl_test.c b/ocl_test.c new file mode 100644 index 0000000..9e787a0 --- /dev/null +++ b/ocl_test.c @@ -0,0 +1,197 @@ + +#include +#include "utils.h" +#include "ocl.h" +#include "crypto.h" + +#define BLOCK_SIZE 0x10 +#define NUM_BLOCKS (1 << 23) +#define BLOCKS_PER_ITEM 1 +#define NUM_ITEMS (NUM_BLOCKS / BLOCKS_PER_ITEM) +#define BUF_SIZE (BLOCK_SIZE * NUM_BLOCKS) + +void ocl_test_run_and_read(const char * test_name, cl_kernel kernel, + cl_device_id device_id, cl_command_queue command_queue, + cl_mem mem_out, cl_uchar *buf_out) +{ + printf("%s on %u MB\n", test_name, (unsigned)BUF_SIZE >> 20); + TimeHP t0, t1; long long td; + size_t local; + OCL_ASSERT(clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL)); + printf("local work size: %u\n", (unsigned)local); + + size_t num_items = NUM_ITEMS; + + get_hp_time(&t0); + // apparently, setting local work size to NULL doesn't affect performance, at least in this kind of work + OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, &local, 0, NULL, NULL)); + // OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, NULL, 0, NULL, NULL)); + clFinish(command_queue); + get_hp_time(&t1); td = hp_time_diff(&t0, &t1); + printf("%d microseconds for OpenCL, %.2f MB/s\n", (int)td, BUF_SIZE * 1.0f / td); + + get_hp_time(&t0); + OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_out, CL_TRUE, 0, BUF_SIZE, buf_out, 0, NULL, NULL)); + get_hp_time(&t1); td = hp_time_diff(&t0, &t1); + printf("%d microseconds for data download, %.2f MB/s\n", (int)td, BUF_SIZE * 1.0f / td); + + clReleaseKernel(kernel); +} + +void verify(const char *test_name, cl_uchar *buf_in, cl_uchar *buf_out, cl_uchar *buf_verify){ + if (memcmp(buf_verify, buf_out, BUF_SIZE)) { + printf("%s: verification failed\n", test_name); + unsigned count = 5; + for (unsigned offset = 0; offset < BUF_SIZE; offset += BLOCK_SIZE) { + if (memcmp(buf_verify + offset, buf_out + offset, BLOCK_SIZE)) { + printf("difference @ 0x%08x/0x%08x:\n", offset, (unsigned)NUM_BLOCKS); + printf("\t%s\n", hexdump(buf_in + offset, BLOCK_SIZE, 0)); + printf("\t%s\n", hexdump(buf_out + offset, BLOCK_SIZE, 0)); + printf("\t%s\n", hexdump(buf_verify + offset, BLOCK_SIZE, 0)); + if (!--count) { + break; + } + } + } + } else { + printf("%s: succeed\n", test_name); + } + +} + +void ocl_test() { + TimeHP t0, t1; long long td; + + cl_int err; + cl_platform_id platform_id; + cl_device_id device_id; + ocl_get_device(&platform_id, &device_id); + if (platform_id == NULL || device_id == NULL) { + return; + } + + cl_uchar *buf_in = malloc(BUF_SIZE); + cl_uchar *buf_out = malloc(BUF_SIZE); + cl_uchar *buf_verify = malloc(BUF_SIZE); + + srand(2501); + cl_uchar key[16]; + unsigned int aes_rk[RK_LEN]; + aes_gen_tables(); + for (unsigned i = 0; i < 16; ++i) { + key[i] = rand() & 0xff; + } + printf("AES Key: %s\n", hexdump(key, 16, 0)); + get_hp_time(&t0); + if(cpu_has_rdrand()){ + // ~190 MB/s @ X230, ~200 without the success check + printf("randomize source buffer using RDRAND\n"); + if (!rdrand_fill((cl_ulong*)buf_in, BUF_SIZE >> 3)) { + printf("RDRND failed\n"); + exit(-1); + } + }else { + printf("randomize source buffer using AES OFB\n"); + // rand() & 0xff is about ~60 MB/s @ X230 + // it's worse than that AES single thread C, so OFB it is + // ~240 MB/s, even faster than RDRAND + unsigned int aes_rk[RK_LEN]; + unsigned char key_iv[16 * 2]; + for (unsigned i = 0; i < 16 * 2; ++i) { + key_iv[i] = rand() & 0xff; + } + aes_set_key_enc_128(aes_rk, key_iv); + aes_encrypt_128(aes_rk, key_iv + 16, buf_in); + unsigned char *p_in = buf_in, *p_out = buf_in + 16, + *p_end = buf_in + BUF_SIZE; + while (p_out < p_end) { + aes_encrypt_128(aes_rk, p_in, p_out); + p_in = p_out; + p_out += 16; + } + } + get_hp_time(&t1); td = hp_time_diff(&t0, &t1); + printf("%d microseconds for preparing test data, %.2f MB/s\n", + (int)td, BUF_SIZE * 1.0f / td); + + cl_context context = OCL_ASSERT2(clCreateContext(0, 1, &device_id, NULL, NULL, &err)); + cl_command_queue command_queue = OCL_ASSERT2(clCreateCommandQueue(context, device_id, 0, &err)); + + const char *source_names[] = { + "cl/common.h", + "cl/sha1_16.cl", + "cl/aes_tables.cl", + "cl/aes_128.cl", + "cl/kernel_tests.cl" }; + char options[0x100]; + sprintf(options, "-w -Werror -DBLOCKS_PER_ITEM=%u", BLOCKS_PER_ITEM); + cl_program program = ocl_build_from_sources(sizeof(source_names) / sizeof(char *), + source_names, context, device_id, options); + + // create buffer and upload data + cl_mem mem_key = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_ONLY, 16, NULL, &err)); + cl_mem mem_in = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_ONLY, BUF_SIZE, NULL, &err)); + cl_mem mem_out = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, BUF_SIZE, NULL, &err)); + + OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_key, CL_TRUE, 0, 16, key, 0, NULL, NULL)); + get_hp_time(&t0); + OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_in, CL_TRUE, 0, BUF_SIZE, buf_in, 0, NULL, NULL)); + get_hp_time(&t1); td = hp_time_diff(&t0, &t1); + printf("%d microseconds for data upload, %.2f MB/s\n", (int)td, BUF_SIZE * 1.0f / td); + + // SHA1_16 test + const char * test_name = "sha1_16_test"; + + cl_kernel kernel = OCL_ASSERT2(clCreateKernel(program, test_name, &err)); + + OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_in)); + OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_out)); + + ocl_test_run_and_read(test_name, kernel, device_id, command_queue, mem_out, buf_out); + + get_hp_time(&t0); + for (unsigned offset = 0; offset < BUF_SIZE; offset += BLOCK_SIZE) { + sha1_16(buf_in + offset, buf_verify + offset); + } + get_hp_time(&t1); td = hp_time_diff(&t0, &t1); + printf("%d microseconds for C(single thread), %.2f MB/s\n", (int)td, BUF_SIZE * 1.0f / td); + + verify(test_name, buf_in, buf_out, buf_verify); + + // AES 128 ECB test + test_name = "aes_128_ecb_test"; + + kernel = OCL_ASSERT2(clCreateKernel(program, test_name, &err)); + + OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_key)); + OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_in)); + OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_out)); + + ocl_test_run_and_read(test_name, kernel, device_id, command_queue, mem_out, buf_out); + /* + if(test_case == TEST_AES_128_ECB){ + dump_to_file("r:/test_aes_in.bin", buf_in, io_buf_len); + dump_to_file("r:/test_aes_out.bin", buf_out, io_buf_len); + } + */ + get_hp_time(&t0); + for (unsigned offset = 0; offset < BUF_SIZE; offset += BLOCK_SIZE) { + // setting the same key over and over is stupid + // yet we still do it to make the results comparable + aes_set_key_enc_128(aes_rk, key); + aes_encrypt_128(aes_rk, buf_in + offset, buf_verify + offset); + } + get_hp_time(&t1); td = hp_time_diff(&t0, &t1); + printf("%d microseconds for C(single thread), %.2f MB/s\n", (int)td, BUF_SIZE * 1.0f / td); + + verify(test_name, buf_in, buf_out, buf_verify); + + // cleanup + free(buf_in); free(buf_out); free(buf_verify); + clReleaseMemObject(mem_in); + clReleaseMemObject(mem_out); + clReleaseMemObject(mem_key); + clReleaseProgram(program); + clReleaseCommandQueue(command_queue); + clReleaseContext(context); +} diff --git a/ocl_util.c b/ocl_util.c index 36d131f..e44071c 100644 --- a/ocl_util.c +++ b/ocl_util.c @@ -4,6 +4,7 @@ #include #include #include "ocl.h" +#include "utils.h" #define STATIC_ASSERT(c) static_assert(c, #c) STATIC_ASSERT(sizeof(char) == sizeof(cl_char)); @@ -192,3 +193,42 @@ void ocl_get_device(cl_platform_id *p_platform_id, cl_device_id *p_device_id) { *p_device_id = NULL; } } + +cl_program ocl_build_from_sources( + unsigned num_sources, const char *source_names[], + cl_context context, cl_device_id device_id, const char * options) +{ + TimeHP t0, t1; + cl_int err; + // read sources + char **sources = malloc(sizeof(char*) * num_sources); + size_t *source_sizes = malloc(sizeof(size_t) * num_sources); + read_files(num_sources, source_names, sources, source_sizes); + + // compile + get_hp_time(&t0); + // WTF? GCC complains if I pass char ** to a function expecting const char **? + cl_program program = OCL_ASSERT2(clCreateProgramWithSource(context, num_sources, + (const char **)sources, source_sizes, &err)); + // printf("compiler options: %s\n", options); + err = clBuildProgram(program, 0, NULL, options, NULL, NULL); + get_hp_time(&t1); + printf("%d microseconds for compile\n", (int)hp_time_diff(&t0, &t1)); + if (err != CL_SUCCESS) { + fprintf(stderr, "failed to build program, error: %s, build log:\n", ocl_err_msg(err)); + size_t len; + clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &len); + char *buf_log = malloc(len + 1); + clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, len, buf_log, NULL); + buf_log[len] = '\0'; + fprintf(stderr, "%s\n", buf_log); + free(buf_log); + exit(err); + } + for (unsigned i = 0; i < num_sources; ++i) { + free(sources[i]); + } + free(sources); + free(source_sizes); + return program; +} diff --git a/utils.c b/utils.c index 84014ab..8e55b87 100644 --- a/utils.c +++ b/utils.c @@ -3,8 +3,15 @@ #include #include #include +#include #include "utils.h" +#ifdef __GNUC__ +#include +#elif _MSC_VER +#include +#endif + int htoi(char a){ if(a >= '0' && a <= '9'){ return a - '0'; @@ -42,7 +49,7 @@ int hex2bytes(unsigned char *out, unsigned byte_len, const char *in, int critica #endif static char hexdump_buf[HEXDUMP_BUF_SIZE]; -// CAUTION, this always assume you have a buffer big enough +// CAUTION, this always assume the buffer is big enough const char *hexdump(const void *b, unsigned l, int space){ const unsigned char *p = (unsigned char*)b; char *out = hexdump_buf; @@ -80,4 +87,61 @@ long long hp_time_diff(struct timeval *pt0, struct timeval *pt1) { return diff; } -#endif \ No newline at end of file +#endif + +// CAUTION: caller is responsible to free the buf +char * read_file(const char *file_name, size_t *p_size) { + FILE * f = fopen(file_name, "rb"); + if (f == NULL) { + fprintf(stderr, "can't read file: %s", file_name); + exit(-1); + } + fseek(f, 0, SEEK_END); + *p_size = ftell(f); + char * buf = malloc(*p_size); + fseek(f, 0, SEEK_SET); + fread(buf, *p_size, 1, f); + fclose(f); + return buf; +} + +void read_files(unsigned num_files, const char *file_names[], char *ptrs[], size_t sizes[]) { + for (unsigned i = 0; i < num_files; ++i) { + ptrs[i] = read_file(file_names[i], &sizes[i]); + } +} + +void dump_to_file(const char *file_name, const void *buf, size_t len) { + FILE *f = fopen(file_name, "wb"); + if (f == NULL) { + fprintf(stderr, "can't open file to write: %s\n", file_name); + return; + } + fwrite(buf, len, 1, f); + fclose(f); +} + +int cpu_has_rdrand() { +#if __GNUC__ + unsigned a = 0, b = 0, c = 0, d = 0; + __get_cpuid(1, &a, &b, &c, &d); + return c & bit_RDRND; +#elif _MSC_VER + int regs[4]; + __cpuid(regs, 1); + return regs[2] & (1<<30); +#else + // ICL only? + return _may_i_use_cpu_feature(_FEATURE_RDRND); +#endif +} + +// input must be multiple of uint64_t +int rdrand_fill(unsigned long long *p, size_t size) { + unsigned long long *p_end = p + size; + int success = 1; + while (p < p_end) { + success &= _rdrand64_step(p++); + } + return success; +} \ No newline at end of file diff --git a/utils.h b/utils.h index 5e62027..1e3fb65 100644 --- a/utils.h +++ b/utils.h @@ -5,19 +5,29 @@ #ifdef _WIN32 #include -typedef LARGE_INTEGER HP_Time; +typedef LARGE_INTEGER TimeHP; #define get_hp_time QueryPerformanceCounter #else #include -typedef struct timeval HP_Time; -void get_hp_time(HP_Time *pt); +typedef struct timeval TimeHP; +void get_hp_time(TimeHP *pt); #endif -long long hp_time_diff(HP_Time *pt0, HP_Time *pt1); +long long hp_time_diff(TimeHP *pt0, TimeHP *pt1); int hex2bytes(unsigned char *out, unsigned byte_len, const char *in, int critical); const char * hexdump(const void *a, unsigned l, int space); + +char * read_file(const char *file_name, size_t *p_size); + +void read_files(unsigned num_files, const char *file_names[], char *ptrs[], size_t sizes[]); + +void dump_to_file(const char *file_name, const void *buf, size_t len); + +int cpu_has_rdrand(); + +int rdrand_fill(unsigned long long *p, size_t size);