Skip to content

Commit

Permalink
Merge branch 'master' of github.com:Jimmy-Z/bfCL
Browse files Browse the repository at this point in the history
  • Loading branch information
Jimmy-Z committed Aug 28, 2017
2 parents af7c839 + 6984c6f commit e64a854
Show file tree
Hide file tree
Showing 10 changed files with 631 additions and 483 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
.*
*.o
*.exe
*.sln
*.vcxproj*
x64/
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# only tested in mingw
PNAME = bfcl
OBJS = $(PNAME).o ocl_util.o utils.o sha1_16.o aes128.o
CFLAGS += -std=c11 -Wall -O2 -I$(INTELOCLSDKROOT)/include
OBJS = $(PNAME).o ocl_util.o utils.o sha1_16.o aes_128.o
CFLAGS += -std=c11 -Wall -O2 -mrdrnd -I$(INTELOCLSDKROOT)/include
LDFLAGS += -L$(INTELOCLSDKROOT)/lib/x64

all : $(PNAME)
Expand Down
26 changes: 12 additions & 14 deletions aes128.c → aes_128.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,19 @@
#define PUT_UINT32_LE(n, b, i) \
*(uint32_t*)(b + i) = (n)

// packed into a struct to be easier to pass to OpenCL
// it's interesting they mix unsigned char with uint32_t
AES_Tables AES_tables;
// and aliases
static unsigned char *FSb = AES_tables.FSb;
static uint32_t *FT0 = AES_tables.FT0;
static uint32_t *FT1 = AES_tables.FT1;
static uint32_t *FT2 = AES_tables.FT2;
static uint32_t *FT3 = AES_tables.FT3;
static unsigned char *RSb = AES_tables.RSb;
static uint32_t *RT0 = AES_tables.RT0;
static uint32_t *RT1 = AES_tables.RT1;
static uint32_t *RT2 = AES_tables.RT2;
static uint32_t *RT3 = AES_tables.RT3;
static uint32_t *RCON = AES_tables.RCON;
static unsigned char FSb[256];
static uint32_t FT0[256];
static uint32_t FT1[256];
static uint32_t FT2[256];
static uint32_t FT3[256];
static unsigned char RSb[256];
static uint32_t RT0[256];
static uint32_t RT1[256];
static uint32_t RT2[256];
static uint32_t RT3[256];

static uint32_t RCON[256];

/*
* Tables generation code
Expand Down
210 changes: 142 additions & 68 deletions bfcl.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,63 +2,100 @@
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <immintrin.h>
#include "ocl.h"
#include "crypto.h"
#include "utils.h"

extern AES_Tables AES_tables;
#ifdef __GNUC__
#include <cpuid.h>
#elif _MSC_VER
#include <intrin.h>
#endif

int cpu_has_rdrand() {
#if __GNUC__
unsigned a = 0, b = 0, c = 0, d = 0;
__get_cpuid(1, &a, &b, &c, &d);
return c & bit_RDRND;
#elif _MSC_VER
int regs[4];
__cpuid(regs, 1);
return regs[2] & (1<<30);
#else
// ICL only?
return _may_i_use_cpu_feature(_FEATURE_RDRND);
#endif
}

// CAUTION: caller is responsible to free the buf
char * read_source(const char *file_name) {
char * read_file(const char *file_name, size_t *p_size) {
FILE * f = fopen(file_name, "rb");
if (f == NULL) {
printf("can't read file: %s", file_name);
exit(-1);
}
fseek(f, 0, SEEK_END);
long size = ftell(f);
char * buf = malloc(size + 1);
*p_size = ftell(f);
char * buf = malloc(*p_size);
fseek(f, 0, SEEK_SET);
fread(buf, size, 1, f);
fread(buf, *p_size, 1, f);
fclose(f);
buf[size] = '\0';
return buf;
}

#define TEST_SHA1_16 1
#define TEST_AES_128_ECB 2
void read_files(unsigned num_files, const char *file_names[], char *ptrs[], size_t sizes[]) {
for (unsigned i = 0; i < num_files; ++i) {
ptrs[i] = read_file(file_names[i], &sizes[i]);
}
}

void ocl_test(int test_case) {
cl_platform_id platform_id;
cl_device_id device_id;
ocl_get_device(&platform_id, &device_id);
if (platform_id == NULL || device_id == NULL) {
void dump_to_file(const char *file_name, const void *buf, size_t len) {
FILE *f = fopen(file_name, "wb");
if (f == NULL) {
printf("can't open file to write: %s\n", file_name);
return;
}
fwrite(buf, len, 1, f);
fclose(f);
}

#define TEST_SHA1_16 1
#define TEST_AES_128_ECB 2

#define BLOCK_SIZE 0x10
#define NUM_BLOCKS (1 << 23)
#define BLOCKS_PER_ITEM 1

void ocl_test(cl_device_id device_id, const cl_uchar *buf_in, int test_case) {
cl_int err;
cl_context context = OCL_ASSERT2(clCreateContext(0, 1, &device_id, NULL, NULL, &err));
cl_command_queue command_queue = OCL_ASSERT2(clCreateCommandQueue(context, device_id, 0, &err));

HP_Time t0, t1;
long long td;

#define BLOCK_SIZE 0x10
#define NUM_BLOCKS (1 << 24)
#define BLOCKS_PER_ITEM 1

const size_t num_items = NUM_BLOCKS / BLOCKS_PER_ITEM;
const size_t io_buf_len = NUM_BLOCKS * BLOCK_SIZE;

char *source = read_source("kernel.c");
const char *source_names[] = { "cl/sha1_16.cl", "cl/aes_tables.cl", "cl/aes_128.cl", "cl/kernels.cl" };
const unsigned num_sources = sizeof(source_names) / sizeof(char *);
char *sources[sizeof(source_names) / sizeof(char *)];
size_t source_sizes[sizeof(source_names) / sizeof(char *)];
read_files(num_sources, source_names, sources, source_sizes);

get_hp_time(&t0);
cl_program program = OCL_ASSERT2(clCreateProgramWithSource(context, 1, (const char**)&source, NULL, &err));
// WTF? GCC complains if I pass char ** in to a function expecting const char **?
cl_program program = OCL_ASSERT2(clCreateProgramWithSource(context, num_sources, (const char **)sources, source_sizes, &err));
char options[0x100];
sprintf(options, "-DBLOCKS_PER_ITEM=%d", BLOCKS_PER_ITEM);
printf("compiler options: %s\n", options);
sprintf(options, "-w -Werror -DBLOCKS_PER_ITEM=%d", BLOCKS_PER_ITEM);
// printf("compiler options: %s\n", options);
err = clBuildProgram(program, 0, NULL, options, NULL, NULL);
get_hp_time(&t1);
printf("%d microseconds for compile\n", (int)hp_time_diff(&t0, &t1));
free(source);
for (unsigned i = 0; i < num_sources; ++i) {
free(sources[i]);
}
if (err != CL_SUCCESS) {
fprintf(stderr, "failed to build program, error: %s, build log:\n", ocl_err_msg(err));
size_t len;
Expand All @@ -78,58 +115,48 @@ void ocl_test(int test_case) {
default: exit(-1);
}

printf("%s on %u bytes\n", test_name, (unsigned)io_buf_len);
printf("%s on %u MB\n", test_name, (unsigned)io_buf_len >> 20);

cl_kernel kernel = OCL_ASSERT2(clCreateKernel(program, test_name, &err));

cl_uchar *buf_in = malloc(io_buf_len);
cl_uchar key[16];
unsigned int aes_rk[RK_LEN];
get_hp_time(&t0);
srand(2501);
for (unsigned i = 0; i < io_buf_len; ++i) {
buf_in[i] = ((unsigned)rand() << 8) / RAND_MAX;
}
if (test_case == TEST_AES_128_ECB) {
aes_gen_tables();
for (unsigned i = 0; i < 16; ++i) {
key[i] = ((unsigned)rand() << 8) / RAND_MAX;
key[i] = rand() & 0xff;
}
aes_set_key_enc_128(aes_rk, key);
printf("Key: %s\n", hexdump(key, 16, 0));
}
get_hp_time(&t1);
printf("%d microseconds for preparing test data\n", (int)hp_time_diff(&t0, &t1));

cl_mem mem_in = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_ONLY, io_buf_len, NULL, &err));
cl_mem mem_out = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, io_buf_len, NULL, &err));
cl_mem mem_tables, mem_key;
cl_mem mem_key;
if (test_case == TEST_AES_128_ECB) {
mem_tables = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(AES_Tables), NULL, &err));
mem_key = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_ONLY, 16, NULL, &err));
}

get_hp_time(&t0);
OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_in, CL_TRUE, 0, io_buf_len, buf_in, 0, NULL, NULL));
if (test_case == TEST_AES_128_ECB) {
OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_tables, CL_TRUE, 0, sizeof(AES_Tables), &AES_tables, 0, NULL, NULL));
OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_key, CL_TRUE, 0, 16, key, 0, NULL, NULL));
}
get_hp_time(&t1);
printf("%d microseconds for data upload\n", (int)hp_time_diff(&t0, &t1));
td = hp_time_diff(&t0, &t1);
printf("%d microseconds for data upload, %.2f MB/s\n", (int)td, io_buf_len * 1.0f / td);

OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_in));
OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_out));
if (test_case == TEST_AES_128_ECB) {
OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_tables));
OCL_ASSERT(clSetKernelArg(kernel, 3, sizeof(cl_mem), &mem_key));
OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_key));
}

size_t local;
OCL_ASSERT(clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL));
printf("local work size: %u\n", (unsigned)local);

get_hp_time(&t0);
// appearantly, settiing local work size to NULL doesn't affect performance, at least in this kind of work
// apparently, setting local work size to NULL doesn't affect performance, at least in this kind of work
OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, &local, 0, NULL, NULL));
// OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, NULL, 0, NULL, NULL));
clFinish(command_queue);
Expand All @@ -141,28 +168,42 @@ void ocl_test(int test_case) {
get_hp_time(&t0);
OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_out, CL_TRUE, 0, io_buf_len, buf_out, 0, NULL, NULL));
get_hp_time(&t1);
printf("%d microseconds for data download\n", (int)hp_time_diff(&t0, &t1));
td = hp_time_diff(&t0, &t1);
printf("%d microseconds for data download, %.2f MB/s\n", (int)td, io_buf_len * 1.0f / td);

/*
if(test_case == TEST_AES_128_ECB){
dump_to_file("r:/test_aes_in.bin", buf_in, io_buf_len);
dump_to_file("r:/test_aes_out.bin", buf_out, io_buf_len);
}
*/

cl_uchar *buf_verify = malloc(io_buf_len);
get_hp_time(&t0);
if (test_case == TEST_SHA1_16) {
for (unsigned offset = 0; offset < io_buf_len; offset += BLOCK_SIZE) {
sha1_16(buf_in + offset, buf_in + offset);
sha1_16(buf_in + offset, buf_verify + offset);
}
} else {
for (unsigned offset = 0; offset < io_buf_len; offset += BLOCK_SIZE) {
aes_encrypt_128(aes_rk, buf_in + offset, buf_in + offset);
// setting the same key over and over is stupid
// yet we still do it to keep it in line with the OpenCL port
// otherwise we can't test the set key in OpenCL
aes_set_key_enc_128(aes_rk, key);
aes_encrypt_128(aes_rk, buf_in + offset, buf_verify + offset);
}
}
get_hp_time(&t1);
td = hp_time_diff(&t0, &t1);
printf("%d microseconds for C(single thread), %.3f MB/s\n", (int)td, io_buf_len * 1.0f / td);
printf("%d microseconds for C(single thread), %.2f MB/s\n", (int)td, io_buf_len * 1.0f / td);

if (memcmp(buf_in, buf_out, io_buf_len)) {
if (memcmp(buf_verify, buf_out, io_buf_len)) {
printf("%s: verification failed\n", test_name);
for (unsigned offset = 0; offset < io_buf_len; offset += BLOCK_SIZE) {
if (memcmp(buf_in + offset, buf_out + offset, BLOCK_SIZE)) {
if (memcmp(buf_verify + offset, buf_out + offset, BLOCK_SIZE)) {
printf("first difference @ 0x%08x/0x%08x:\n", offset, (unsigned)num_items );
printf("\t%s\n", hexdump(buf_in + offset, BLOCK_SIZE, 0));
printf("\t%s\n", hexdump(buf_verify + offset, BLOCK_SIZE, 0));
printf("\t%s\n", hexdump(buf_out + offset, BLOCK_SIZE, 0));
break;
}
Expand All @@ -171,7 +212,7 @@ void ocl_test(int test_case) {
printf("%s: succeed\n", test_name);
}

free(buf_in); free(buf_out);
free(buf_out);
clReleaseMemObject(mem_in);
clReleaseMemObject(mem_out);
clReleaseProgram(program);
Expand All @@ -180,32 +221,65 @@ void ocl_test(int test_case) {
clReleaseContext(context);
}

void aes128_test() {
// TODO: test against OpenSSL results
unsigned char test_key[16] = { 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 };
unsigned char test_src[32] = { 8, 7, 6, 5, 4, 3, 2, 1, 1, 2, 3, 4, 5, 6, 7, 8,
1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1 };
unsigned char test_out[32];

aes_gen_tables();

unsigned aes_rk[RK_LEN];

aes_set_key_enc_128(aes_rk, test_key);
aes_encrypt_128(aes_rk, test_src, test_out);
aes_encrypt_128(aes_rk, test_src + 16, test_out + 16);
puts(hexdump(test_out, 32, 1));
}

int main(int argc, const char *argv[]) {
if (argc == 2 && !strcmp(argv[1], "info")) {
cl_uint num_platforms;
ocl_info(&num_platforms, 1);
}else if (argc == 2 && !strcmp(argv[1], "aes128ecb")) {
aes128_test();
} else if (argc == 1){
ocl_test(TEST_SHA1_16);
ocl_test(TEST_AES_128_ECB);
cl_platform_id platform_id;
cl_device_id device_id;
ocl_get_device(&platform_id, &device_id);
if (platform_id == NULL || device_id == NULL) {
return -1;
}

cl_uchar *buf_in = malloc(BLOCK_SIZE * NUM_BLOCKS);
srand(2501);
HP_Time t0, t1; long long td;
get_hp_time(&t0);
if(cpu_has_rdrand()){
// ~190 MB/s @ X230, ~200 without the success check
printf("randomize source buffer using RDRAND\n");
unsigned long long *p = (unsigned long long *)buf_in;
unsigned long long *p_end = (unsigned long long *)(buf_in + BLOCK_SIZE * NUM_BLOCKS);
int success = 1;
while (p < p_end) {
success &= _rdrand64_step(p++);
}
if (!success) {
printf("RDRND failed\n");
exit(-1);
}
}else {
printf("randomize source buffer using AES OFB\n");
// rand() & 0xff is about ~60 MB/s @ X230
// it's worse than that AES single thread C, so OFB it is
// ~240 MB/s, even faster than RDRAND
srand(2501);
unsigned int aes_rk[RK_LEN];
unsigned char key_iv[16 * 2];
for (unsigned i = 0; i < 16 * 2; ++i) {
key_iv[i] = rand() & 0xff;
}
aes_set_key_enc_128(aes_rk, key_iv);
aes_encrypt_128(aes_rk, key_iv + 16, buf_in);
unsigned char *p_in = buf_in, *p_out = buf_in + 16,
*p_end = buf_in + BLOCK_SIZE * NUM_BLOCKS;
while (p_out < p_end) {
aes_encrypt_128(aes_rk, p_in, p_out);
p_in = p_out;
p_out += 16;
}
}
get_hp_time(&t1);
td = hp_time_diff(&t0, &t1);
printf("%d microseconds for preparing test data, %.2f MB/s\n",
(int)td, BLOCK_SIZE * NUM_BLOCKS * 1.0f / td);

ocl_test(device_id, buf_in, TEST_SHA1_16);
ocl_test(device_id, buf_in, TEST_AES_128_ECB);

free(buf_in);
#ifdef _WIN32
system("pause");
#endif
Expand Down
Loading

0 comments on commit e64a854

Please sign in to comment.