From 54f57547d3f706a6b443361aa40f5c115cb94a7c Mon Sep 17 00:00:00 2001 From: tonyjie Date: Sun, 27 Oct 2024 19:39:30 -0400 Subject: [PATCH 1/2] feature: enable GPU to accelerate odgi-layout with massive speedup. Build with a flag USE_GPU --- CMakeLists.txt | 39 ++- src/algorithms/path_sgd_layout.cpp | 39 ++- src/algorithms/path_sgd_layout.hpp | 26 +- src/cuda/layout.cu | 475 +++++++++++++++++++++++++++++ src/cuda/layout.h | 82 +++++ src/subcommand/layout_main.cpp | 83 +++-- 6 files changed, 718 insertions(+), 26 deletions(-) create mode 100644 src/cuda/layout.cu create mode 100644 src/cuda/layout.h diff --git a/CMakeLists.txt b/CMakeLists.txt index dc6e9b444..dd781923d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ cmake_minimum_required(VERSION 3.16) # Project's name -project(odgi) +project(odgi LANGUAGES CXX) # Enforce c++17 set(CMAKE_CXX_STANDARD 17) @@ -30,6 +30,8 @@ set(CMAKE_CXX_STANDARD 17) option(PIC "Compile all odgi sources with -fPIC - required for shared libs" ON) option(ASAN "Use address sanitiser" OFF) option(INLINE_HANDLEGRAPH_SOURCES "Compile handlegraph sources inline" OFF) +# Add the GPU option (default is OFF) +option(USE_GPU "Enable GPU support if available" OFF) include(ExternalProject) include(FeatureSummary) @@ -37,6 +39,18 @@ include(FeatureSummary) find_package(PkgConfig REQUIRED) find_package(pybind11 CONFIG) find_package(OpenMP) +# Find CUDA if GPU option is enabled +if (USE_GPU) + find_package(CUDA REQUIRED) # Adjust this if you're using modern CMake with FindCUDAToolkit. + if(CUDA_FOUND) + enable_language(CUDA) + message(STATUS "CUDA found. GPU support enabled.") + else() + message(FATAL_ERROR "CUDA not found! Cannot enable GPU support.") + endif() +else() + message(STATUS "Building with CPU-only support.") +endif() feature_summary( FATAL_ON_MISSING_REQUIRED_PACKAGES @@ -621,6 +635,9 @@ add_library(odgi_objs OBJECT ${lodepng_SOURCES} ${handlegraph_sources} ) +if (USE_GPU) + target_sources(odgi_objs PRIVATE "${CMAKE_SOURCE_DIR}/src/cuda/layout.cu") +endif (USE_GPU) set(odgi_DEPS # lodepng @@ -687,6 +704,9 @@ set(odgi_INCLUDES "${xoshiro_INCLUDE}" "${atomicbitvector_INCLUDE}" "${mio_INCLUDE}") +if (USE_GPU) + list(APPEND odgi_INCLUDES "${CUDA_INCLUDE_DIRS}") +endif (USE_GPU) set(odgi_LIBS jemalloc @@ -803,9 +823,26 @@ set(odgi_HEADERS ${CMAKE_SOURCE_DIR}/src/algorithms/path_length.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/path_keep.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/diffpriv.cpp) +if (USE_GPU) + list(APPEND odgi_HEADERS "${CMAKE_SOURCE_DIR}/src/cuda/layout.h") +endif (USE_GPU) target_include_directories(odgi_objs PUBLIC ${odgi_INCLUDES}) +if (USE_GPU) + include(FindCUDA/select_compute_arch) + CUDA_DETECT_INSTALLED_GPUS(INSTALLED_GPU_CCS_1) + string(STRIP "${INSTALLED_GPU_CCS_1}" INSTALLED_GPU_CCS_2) + string(REPLACE " " ";" INSTALLED_GPU_CCS_3 "${INSTALLED_GPU_CCS_2}") + string(REPLACE "." "" CUDA_ARCH_LIST "${INSTALLED_GPU_CCS_3}") + SET(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH_LIST}) + message(STATUS "CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}") + # Apply compile options. Detects different GPU compute capability. 
+    target_compile_options(odgi_objs PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -std=c++17 -Xcompiler=-fopenmp -lineinfo>)
+    # add USE_GPU macro when building with GPU
+    target_compile_definitions(odgi_objs PRIVATE USE_GPU)
+endif (USE_GPU)
+
 add_library(libodgi_static STATIC $<TARGET_OBJECTS:odgi_objs>)
 set_target_properties(libodgi_static PROPERTIES OUTPUT_NAME "odgi")
 set_target_properties(libodgi_static PROPERTIES PUBLIC_HEADER "${odgi_HEADERS}")
diff --git a/src/algorithms/path_sgd_layout.cpp b/src/algorithms/path_sgd_layout.cpp
index 6167ea806..4a4d4c4e4 100644
--- a/src/algorithms/path_sgd_layout.cpp
+++ b/src/algorithms/path_sgd_layout.cpp
@@ -150,7 +150,7 @@ namespace odgi {
             } else {
                 eta.store(etas[iteration]); // update our learning rate
                 Delta_max.store(delta); // set our delta max to the threshold
-                if (iteration > first_cooling_iteration) {
+                if (iteration >= first_cooling_iteration) {
                     //std::cerr << std::endl << "setting cooling!!" << std::endl;
                     adj_theta.store(0.001);
                     cooling.store(true);
@@ -466,6 +466,43 @@ namespace odgi {
 #endif
             return etas;
         }
+#ifdef USE_GPU
+        void path_linear_sgd_layout_gpu(const PathHandleGraph &graph,
+                                        const xp::XP &path_index,
+                                        const std::vector<handlegraph::path_handle_t> &path_sgd_use_paths,
+                                        const uint64_t &iter_max,
+                                        const uint64_t &iter_with_max_learning_rate,
+                                        const uint64_t &min_term_updates,
+                                        const double &delta,
+                                        const double &eps,
+                                        const double &eta_max,
+                                        const double &theta,
+                                        const uint64_t &space,
+                                        const uint64_t &space_max,
+                                        const uint64_t &space_quantization_step,
+                                        const double &cooling_start,
+                                        const uint64_t &nthreads,
+                                        const bool &progress,
+                                        const bool &snapshot,
+                                        const std::string &snapshot_prefix,
+                                        std::vector<std::atomic<double>> &X,
+                                        std::vector<std::atomic<double>> &Y) {
+            cuda::layout_config_t config;
+            config.iter_max = iter_max;
+            config.min_term_updates = min_term_updates;
+            config.eta_max = eta_max;
+            config.eps = eps;
+            config.iter_with_max_learning_rate = (int32_t) iter_with_max_learning_rate;
+            config.first_cooling_iteration = std::floor(cooling_start * (double)iter_max);
+            config.theta = theta;
+            config.space = uint32_t(space);
+            config.space_max = uint32_t(space_max);
+            config.space_quantization_step = uint32_t(space_quantization_step);
+            config.nthreads = nthreads;
+            cuda::gpu_layout(config, dynamic_cast<const odgi::graph_t &>(graph), X, Y);
+            return;
+        }
+#endif
 /*
     void deterministic_path_linear_sgd(const PathHandleGraph &graph,
                                        const xp::XP &path_index,
diff --git a/src/algorithms/path_sgd_layout.hpp b/src/algorithms/path_sgd_layout.hpp
index 268248419..294962769 100644
--- a/src/algorithms/path_sgd_layout.hpp
+++ b/src/algorithms/path_sgd_layout.hpp
@@ -19,6 +19,9 @@
 #include "dirty_zipfian_int_distribution.h"
 #include "XoshiroCpp.hpp"
 #include "progress.hpp"
+#ifdef USE_GPU
+#include "cuda/layout.h"
+#endif
 
 namespace odgi {
     namespace algorithms {
@@ -53,7 +56,28 @@
                                     const uint64_t &iter_max,
                                     const uint64_t &iter_with_max_learning_rate,
                                     const double &eps);
-
+#ifdef USE_GPU
+        void path_linear_sgd_layout_gpu(const PathHandleGraph &graph,
+                                        const xp::XP &path_index,
+                                        const std::vector<handlegraph::path_handle_t> &path_sgd_use_paths,
+                                        const uint64_t &iter_max,
+                                        const uint64_t &iter_with_max_learning_rate,
+                                        const uint64_t &min_term_updates,
+                                        const double &delta,
+                                        const double &eps,
+                                        const double &eta_max,
+                                        const double &theta,
+                                        const uint64_t &space,
+                                        const uint64_t &space_max,
+                                        const uint64_t &space_quantization_step,
+                                        const double &cooling_start,
+                                        const uint64_t &nthreads,
+                                        const bool &progress,
+                                        const bool &snapshot,
+                                        const std::string &snapshot_prefix,
+                                        std::vector<std::atomic<double>> &X,
+                                        std::vector<std::atomic<double>> &Y);
+#endif
        /// single threaded and deterministic
path guided 1D linear SGD /* void deterministic_path_linear_sgd_layout(const PathHandleGraph &graph, diff --git a/src/cuda/layout.cu b/src/cuda/layout.cu new file mode 100644 index 000000000..52e574a27 --- /dev/null +++ b/src/cuda/layout.cu @@ -0,0 +1,475 @@ +#include "layout.h" +#include +#include +#include "cuda_runtime_api.h" + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + printf("Failed, NCCL error %s:%d '%s'\n", \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +namespace cuda { + +__global__ void cuda_device_init(curandState_t *rnd_state_tmp, curandStateCoalesced_t *rnd_state) { + int32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + // initialize curandState with original curand implementation + curand_init(42+tid, tid, 0, &rnd_state_tmp[tid]); + // copy to coalesced data structure + rnd_state[blockIdx.x].d[threadIdx.x] = rnd_state_tmp[tid].d; + rnd_state[blockIdx.x].w0[threadIdx.x] = rnd_state_tmp[tid].v[0]; + rnd_state[blockIdx.x].w1[threadIdx.x] = rnd_state_tmp[tid].v[1]; + rnd_state[blockIdx.x].w2[threadIdx.x] = rnd_state_tmp[tid].v[2]; + rnd_state[blockIdx.x].w3[threadIdx.x] = rnd_state_tmp[tid].v[3]; + rnd_state[blockIdx.x].w4[threadIdx.x] = rnd_state_tmp[tid].v[4]; +} + +/** + * @brief: Return 32-bits of pseudorandomness from an XORWOW generator. from "curand_kernel.h" + * For some use cases, we don't need floating point uniform distribution. So we don't need to call `curand_uniform_coalesced` as below. We shall use this function. + * \param state - Pointer to state to update + * \param thread_id - Thread id + * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use. +*/ +__device__ +unsigned int curand_coalesced(curandStateCoalesced_t *state, uint32_t thread_id) { + // Return 32-bits of pseudorandomness from an XORWOW generator. 
+ uint32_t t; + t = (state->w0[thread_id] ^ (state->w0[thread_id] >> 2)); + state->w0[thread_id] = state->w1[thread_id]; + state->w1[thread_id] = state->w2[thread_id]; + state->w2[thread_id] = state->w3[thread_id]; + state->w3[thread_id] = state->w4[thread_id]; + state->w4[thread_id] = (state->w4[thread_id] ^ (state->w4[thread_id] << 4)) ^ (t ^ (t << 1)); + state->d[thread_id] += 362437; + return state->w4[thread_id] + state->d[thread_id]; +} + +__device__ +float curand_uniform_coalesced(curandStateCoalesced_t *state, uint32_t thread_id) { + // generate 32 bit pseudorandom value with XORWOW generator (see paper "Xorshift RNGs" by George Marsaglia); + // also used in curand library (see curand_kernel.h) + uint32_t t; + t = state->w0[thread_id] ^ (state->w0[thread_id] >> 2); + state->w0[thread_id] = state->w1[thread_id]; + state->w1[thread_id] = state->w2[thread_id]; + state->w2[thread_id] = state->w3[thread_id]; + state->w3[thread_id] = state->w4[thread_id]; + state->w4[thread_id] = (state->w4[thread_id] ^ (state->w4[thread_id] << 4)) ^ (t ^ (t << 1)); + state->d[thread_id] += 362437; + + uint32_t rnd_uint = state->d[thread_id] + state->w4[thread_id]; + + // convert to float; see curand_uniform.h + return _curand_uniform(rnd_uint); +} + + +__device__ double compute_zeta(uint32_t n, double theta) { + double ans = 0.0; + for (uint32_t i = 1; i <= n; i++) { + ans += pow(1.0 / double(i), theta); + } + return ans; +} + +// this function uses the cuda operation __powf, which is a faster but less precise alternative to the pow operation +__device__ uint32_t cuda_rnd_zipf(curandStateCoalesced_t *rnd_state, uint32_t n, double theta, double zeta2, double zetan) { + double alpha = 1.0 / (1.0 - theta); + double denominator = 1.0 - zeta2 / zetan; + if (denominator == 0.0) { + denominator = 1e-9; + } + double eta = (1.0 - __powf(2.0 / double(n), 1.0 - theta)) / (denominator); + + // INFO: curand_uniform generates random values between 0.0 (excluded) and 1.0 (included) + double u = 1.0 - curand_uniform_coalesced(rnd_state, threadIdx.x); + double uz = u * zetan; + + int64_t val = 0; + if (uz < 1.0) val = 1; + else if (uz < 1.0 + __powf(0.5, theta)) val = 2; + else val = 1 + int64_t(double(n) * __powf(eta * u - eta + 1.0, alpha)); + + if (val > n) { + //printf("WARNING: val: %ld, n: %u\n", val, uint32_t(n)); + val--; + } + assert(val >= 0); + assert(val <= n); + return uint32_t(val); +} + + +static __device__ __inline__ uint32_t __mysmid(){ + uint32_t smid; + asm volatile("mov.u32 %0, %%smid;" : "=r"(smid)); + return smid; +} + +/** +* @brief: update the coordinates of two visualization nodes in the 2D layout space +* This function is called multiple times in one `gpu_layout_kernel` in order to increase the data reuse. +* Each time, the warp shuffle intrinsics are used to change the selection of node 2 among the 32 threads in the warp. +* E.g. Iter : Step Pairs Selected would be: +* 1: (a0, b0), (a1, b1), (a2, b2), ..., (a31, b31) +* 2: (a0, b9), (a1, b0), (a2, b3), ..., (a31, b4) +* 3: (a0, b1), (a1, b4), (a2, b1), ..., (a31, b10) +* ... +* `b` is randomly chosen from the 32 threads in the warp. 
+* @param n1_pos_in_path: position of node 1 in the current selected path +* @param n1_id: id of node 1 +* @param n1_offset: offset of node 1 +* @param n2_pos_in_path: position of node 2 in the current selected path +* @param n2_id: id of node 2 +* @param n2_offset: offset of node 2 +* @param eta: an coefficient used in the update formula +* @param node_data: the data structure that stores the coordinates of all nodes +*/ +__device__ +void update_pos_gpu(int64_t &n1_pos_in_path, uint32_t &n1_id, int &n1_offset, + int64_t &n2_pos_in_path, uint32_t &n2_id, int &n2_offset, + double eta, + cuda::node_data_t &node_data) { + double term_dist = std::abs(static_cast(n1_pos_in_path) - static_cast(n2_pos_in_path)); + + if (term_dist < 1e-9) { + term_dist = 1e-9; + } + + double w_ij = 1.0 / term_dist; + + double mu = eta * w_ij; + if (mu > 1.0) { + mu = 1.0; + } + + float *x1 = &node_data.nodes[n1_id].coords[n1_offset]; + float *x2 = &node_data.nodes[n2_id].coords[n2_offset]; + float *y1 = &node_data.nodes[n1_id].coords[n1_offset + 1]; + float *y2 = &node_data.nodes[n2_id].coords[n2_offset + 1]; + double x1_val = double(*x1); + double x2_val = double(*x2); + double y1_val = double(*y1); + double y2_val = double(*y2); + + double dx = x1_val - x2_val; + double dy = y1_val - y2_val; + + if (dx == 0.0) { + dx = 1e-9; + } + + double mag = sqrt(dx * dx + dy * dy); + double delta = mu * (mag - term_dist) / 2.0; + //double delta_abs = std::abs(delta); + + // TODO implement delta max stop functionality + double r = delta / mag; + double r_x = r * dx; + double r_y = r * dy; + // TODO check current value before updating + atomicExch(x1, float(x1_val - r_x)); + atomicExch(x2, float(x2_val + r_x)); + atomicExch(y1, float(y1_val - r_y)); + atomicExch(y2, float(y2_val + r_y)); +} + +__global__ +void gpu_layout_kernel(int iter, cuda::layout_config_t config, curandStateCoalesced_t *rnd_state, double eta, double *zetas, + cuda::node_data_t node_data, cuda::path_data_t path_data, int sm_count) { + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + uint32_t smid = __mysmid(); + assert(smid < sm_count); + + curandStateCoalesced_t *thread_rnd_state = &rnd_state[smid]; + + __shared__ bool cooling[BLOCK_SIZE / WARP_SIZE]; + if (threadIdx.x % WARP_SIZE == 1) { + cooling[threadIdx.x / WARP_SIZE] = (iter >= config.first_cooling_iteration) || (curand_coalesced(thread_rnd_state, threadIdx.x) % 2 == 0); + } + + // select path + __shared__ uint32_t first_step_idx[BLOCK_SIZE / WARP_SIZE]; // BLOCK_SIZE/WARP_SIZE = 1024/32 = 32 + // each thread picks its own path + uint32_t step_idx = curand_coalesced(thread_rnd_state, threadIdx.x) % path_data.total_path_steps; + + uint32_t path_idx = path_data.element_array[step_idx].pidx; + path_t p = path_data.paths[path_idx]; + + if (p.step_count < 2) { + return; + } + assert(p.step_count > 1); + + // INFO: curand_uniform generates random values between 0.0 (excluded) and 1.0 (included) + uint32_t s1_idx = curand_coalesced(thread_rnd_state, threadIdx.x) % p.step_count; + assert(s1_idx < p.step_count); + uint32_t s2_idx; + + if (cooling[threadIdx.x / WARP_SIZE]) { + bool backward; + uint32_t jump_space; + if (s1_idx > 0 && (curand_coalesced(thread_rnd_state, threadIdx.x) % 2 == 0) || s1_idx == p.step_count-1) { + // go backward + backward = true; + jump_space = min(config.space, s1_idx); + } else { + // go forward + backward = false; + jump_space = min(config.space, p.step_count - s1_idx - 1); + } + uint32_t space = jump_space; + if (jump_space > config.space_max) { + space = 
config.space_max + (jump_space - config.space_max) / config.space_quantization_step + 1;
+        }
+
+        uint32_t z_i = cuda_rnd_zipf(thread_rnd_state, jump_space, config.theta, zetas[2], zetas[space]);
+
+        s2_idx = backward ? s1_idx - z_i : s1_idx + z_i;
+    } else {
+        do {
+            s2_idx = curand_coalesced(thread_rnd_state, threadIdx.x) % p.step_count;
+        } while (s1_idx == s2_idx);
+    }
+    assert(s1_idx < p.step_count);
+    assert(s2_idx < p.step_count);
+    assert(s1_idx != s2_idx);
+
+
+    uint32_t n1_id = p.elements[s1_idx].node_id;
+    int64_t n1_pos_in_path = p.elements[s1_idx].pos;
+    bool n1_is_rev = (n1_pos_in_path < 0)? true: false;
+    n1_pos_in_path = std::abs(n1_pos_in_path);
+
+    uint32_t n2_id = p.elements[s2_idx].node_id;
+    int64_t n2_pos_in_path = p.elements[s2_idx].pos;
+    bool n2_is_rev = (n2_pos_in_path < 0)? true: false;
+    n2_pos_in_path = std::abs(n2_pos_in_path);
+
+    uint32_t n1_seq_length = node_data.nodes[n1_id].seq_length;
+    bool n1_use_other_end = (curand_coalesced(thread_rnd_state, threadIdx.x) % 2 == 0) ? true : false;
+    if (n1_use_other_end) {
+        n1_pos_in_path += uint64_t{n1_seq_length};
+        n1_use_other_end = !n1_is_rev;
+    } else {
+        n1_use_other_end = n1_is_rev;
+    }
+
+    uint32_t n2_seq_length = node_data.nodes[n2_id].seq_length;
+    bool n2_use_other_end = (curand_coalesced(thread_rnd_state, threadIdx.x) % 2 == 0) ? true : false;
+    if (n2_use_other_end) {
+        n2_pos_in_path += uint64_t{n2_seq_length};
+        n2_use_other_end = !n2_is_rev;
+    } else {
+        n2_use_other_end = n2_is_rev;
+    }
+
+    int n1_offset = n1_use_other_end? 2: 0;
+    int n2_offset = n2_use_other_end? 2: 0;
+
+    // Update Coordinates based on the data of selected nodes: n_pos_in_path, n_id, n_offset
+    update_pos_gpu(n1_pos_in_path, n1_id, n1_offset,
+                   n2_pos_in_path, n2_id, n2_offset,
+                   eta, node_data);
+}
+
+
+void gpu_layout(layout_config_t config, const odgi::graph_t &graph, std::vector<std::atomic<double>> &X, std::vector<std::atomic<double>> &Y) {
+
+
+    std::cout << "===== Use GPU to compute odgi-layout =====" << std::endl;
+    // get cuda device property, and get the SM count
+    cudaDeviceProp prop;
+    CUDACHECK(cudaGetDeviceProperties(&prop, 0));
+    int sm_count = prop.multiProcessorCount;
+
+    // create eta array
+    double *etas;
+    cudaMallocManaged(&etas, config.iter_max * sizeof(double));
+
+    const int32_t iter_max = config.iter_max;
+    const int32_t iter_with_max_learning_rate = config.iter_with_max_learning_rate;
+    const double w_max = 1.0;
+    const double eps = config.eps;
+    const double eta_max = config.eta_max;
+    const double eta_min = eps / w_max;
+    const double lambda = log(eta_max / eta_min) / ((double) iter_max - 1);
+    for (int32_t i = 0; i < config.iter_max; i++) {
+        double eta = eta_max * exp(-lambda * (std::abs(i - iter_with_max_learning_rate)));
+        etas[i] = isnan(eta)?
eta_min : eta; + } + + // create node data structure + // consisting of sequence length and coords + uint32_t node_count = graph.get_node_count(); + assert(graph.min_node_id() == 1); + assert(graph.max_node_id() == node_count); + assert(graph.max_node_id() - graph.min_node_id() + 1 == node_count); + + cuda::node_data_t node_data; + node_data.node_count = node_count; + cudaMallocManaged(&node_data.nodes, node_count * sizeof(cuda::node_t)); + for (int node_idx = 0; node_idx < node_count; node_idx++) { + //assert(graph.has_node(node_idx)); + cuda::node_t *n_tmp = &node_data.nodes[node_idx]; + + // sequence length + const handlegraph::handle_t h = graph.get_handle(node_idx + 1, false); + // NOTE: unable store orientation (reverse), since this information is path dependent + n_tmp->seq_length = graph.get_length(h); + + // copy random coordinates + n_tmp->coords[0] = float(X[node_idx * 2].load()); + n_tmp->coords[1] = float(Y[node_idx * 2].load()); + n_tmp->coords[2] = float(X[node_idx * 2 + 1].load()); + n_tmp->coords[3] = float(Y[node_idx * 2 + 1].load()); + } + + + // create path data structure + uint32_t path_count = graph.get_path_count(); + cuda::path_data_t path_data; + path_data.path_count = path_count; + path_data.total_path_steps = 0; + cudaMallocManaged(&path_data.paths, path_count * sizeof(cuda::path_t)); + + vector path_handles{}; + path_handles.reserve(path_count); + graph.for_each_path_handle( + [&] (const odgi::path_handle_t& p) { + path_handles.push_back(p); + path_data.total_path_steps += graph.get_step_count(p); + }); + cudaMallocManaged(&path_data.element_array, path_data.total_path_steps * sizeof(path_element_t)); + + // get length and starting position of all paths + uint64_t first_step_counter = 0; + for (int path_idx = 0; path_idx < path_count; path_idx++) { + odgi::path_handle_t p = path_handles[path_idx]; + int step_count = graph.get_step_count(p); + path_data.paths[path_idx].step_count = step_count; + path_data.paths[path_idx].first_step_in_path = first_step_counter; + first_step_counter += step_count; + } + +#pragma omp parallel for num_threads(config.nthreads) + for (int path_idx = 0; path_idx < path_count; path_idx++) { + odgi::path_handle_t p = path_handles[path_idx]; + //std::cout << graph.get_path_name(p) << ": " << graph.get_step_count(p) << std::endl; + + uint32_t step_count = path_data.paths[path_idx].step_count; + uint64_t first_step_in_path = path_data.paths[path_idx].first_step_in_path; + if (step_count == 0) { + path_data.paths[path_idx].elements = NULL; + } else { + path_element_t *cur_path = &path_data.element_array[first_step_in_path]; + path_data.paths[path_idx].elements = cur_path; + + odgi::step_handle_t s = graph.path_begin(p); + int64_t pos = 1; + // Iterate through path + for (int step_idx = 0; step_idx < step_count; step_idx++) { + odgi::handle_t h = graph.get_handle_of_step(s); + //std::cout << graph.get_id(h) << std::endl; + + cur_path[step_idx].node_id = graph.get_id(h) - 1; + cur_path[step_idx].pidx = uint32_t(path_idx); + // store position negative when handle reverse + if (graph.get_is_reverse(h)) { + cur_path[step_idx].pos = -pos; + } else { + cur_path[step_idx].pos = pos; + } + pos += graph.get_length(h); + + // get next step + if (graph.has_next_step(s)) { + s = graph.get_next_step(s); + } else if (!(step_idx == step_count-1)) { + // should never be reached + std::cout << "Error: Here should be another step" << std::endl; + } + } + } + } + + // cache zipf zetas + auto start_zeta = std::chrono::high_resolution_clock::now(); + double 
*zetas;
+    uint64_t zetas_cnt = ((config.space <= config.space_max)? config.space : (config.space_max + (config.space - config.space_max) / config.space_quantization_step + 1)) + 1;
+    cudaMallocManaged(&zetas, zetas_cnt * sizeof(double));
+    double zeta_tmp = 0.0;
+    for (uint64_t i = 1; i < config.space + 1; i++) {
+        zeta_tmp += dirtyzipf::fast_precise_pow(1.0 / i, config.theta);
+        if (i <= config.space_max) {
+            zetas[i] = zeta_tmp;
+        }
+        if (i >= config.space_max && (i - config.space_max) % config.space_quantization_step == 0) {
+            zetas[config.space_max + 1 + (i - config.space_max) / config.space_quantization_step] = zeta_tmp;
+        }
+    }
+    auto end_zeta = std::chrono::high_resolution_clock::now();
+    uint32_t duration_zeta_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end_zeta - start_zeta).count();
+
+    const uint64_t block_size = BLOCK_SIZE;
+    uint64_t block_nbr = (config.min_term_updates + block_size - 1) / block_size;
+
+    curandState_t *rnd_state_tmp;
+    curandStateCoalesced_t *rnd_state;
+    CUDACHECK(cudaMallocManaged(&rnd_state_tmp, sm_count * block_size * sizeof(curandState_t)));
+    CUDACHECK(cudaMallocManaged(&rnd_state, sm_count * sizeof(curandStateCoalesced_t)));
+    cuda_device_init<<<sm_count, block_size>>>(rnd_state_tmp, rnd_state);
+    CUDACHECK(cudaGetLastError());
+    CUDACHECK(cudaDeviceSynchronize());
+    cudaFree(rnd_state_tmp);
+
+    for (int iter = 0; iter < config.iter_max; iter++) {
+        gpu_layout_kernel<<<block_nbr, block_size>>>(iter, config, rnd_state, etas[iter], zetas, node_data, path_data, sm_count);
+        // check error
+        CUDACHECK(cudaGetLastError());
+        CUDACHECK(cudaDeviceSynchronize());
+    }
+
+    // copy coords back to X, Y vectors
+    for (int node_idx = 0; node_idx < node_count; node_idx++) {
+        cuda::node_t *n = &(node_data.nodes[node_idx]);
+        // coords[0], coords[1], coords[2], coords[3] are stored consecutively.
+ float *coords = n->coords; + // check if coordinates valid (not NaN or infinite) + for (int i = 0; i < 4; i++) { + if (!isfinite(coords[i])) { + std::cout << "WARNING: invalid coordiate" << std::endl; + } + } + X[node_idx * 2].store(double(coords[0])); + Y[node_idx * 2].store(double(coords[1])); + X[node_idx * 2 + 1].store(double(coords[2])); + Y[node_idx * 2 + 1].store(double(coords[3])); + //std::cout << "coords of " << node_idx << ": [" << X[node_idx*2] << "; " << Y[node_idx*2] << "] ; [" << X[node_idx*2+1] << "; " << Y[node_idx*2+1] <<"]\n"; + } + + // free memory + cudaFree(etas); + cudaFree(node_data.nodes); + cudaFree(path_data.paths); + cudaFree(path_data.element_array); + cudaFree(zetas); + cudaFree(rnd_state); + + return; +} + +} \ No newline at end of file diff --git a/src/cuda/layout.h b/src/cuda/layout.h new file mode 100644 index 000000000..9de514ea1 --- /dev/null +++ b/src/cuda/layout.h @@ -0,0 +1,82 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "odgi.hpp" +#include "XoshiroCpp.hpp" +#include "dirty_zipfian_int_distribution.h" + +namespace cuda { + + +struct __align__(8) node_t { + float coords[4]; + int32_t seq_length; +}; +struct node_data_t { + uint32_t node_count; + node_t *nodes; +}; + + +struct __align__(8) path_element_t { + uint32_t pidx; + uint32_t node_id; + int64_t pos; // if position negative: reverse orientation +}; + +struct path_t { + uint32_t step_count; + uint64_t first_step_in_path; // precomputed position in path + path_element_t *elements; +}; + +struct path_data_t { + uint32_t path_count; + uint64_t total_path_steps; + path_t *paths; + path_element_t *element_array; +}; + + +// #define SM_COUNT 84 +// #define SM_COUNT 108 +#define BLOCK_SIZE 1024 +#define WARP_SIZE 32 +struct curandStateXORWOWCoalesced_t { + unsigned int d[BLOCK_SIZE]; + unsigned int w0[BLOCK_SIZE]; + unsigned int w1[BLOCK_SIZE]; + unsigned int w2[BLOCK_SIZE]; + unsigned int w3[BLOCK_SIZE]; + unsigned int w4[BLOCK_SIZE]; +}; +typedef struct curandStateXORWOWCoalesced_t curandStateCoalesced_t; + + +struct layout_config_t { + uint64_t iter_max; + uint64_t min_term_updates; + double eta_max; + double eps; + int32_t iter_with_max_learning_rate; + uint32_t first_cooling_iteration; + double theta; + uint32_t space; + uint32_t space_max; + uint32_t space_quantization_step; + int nthreads; +}; + + +void gpu_layout(layout_config_t config, const odgi::graph_t &graph, std::vector> &X, std::vector> &Y); + +} \ No newline at end of file diff --git a/src/subcommand/layout_main.cpp b/src/subcommand/layout_main.cpp index edae76e78..043c0a79e 100644 --- a/src/subcommand/layout_main.cpp +++ b/src/subcommand/layout_main.cpp @@ -85,6 +85,11 @@ int main_layout(int argc, char **argv) { args::ValueFlag nthreads(threading_opts, "N", "Number of threads to use for parallel operations.", {'t', "threads"}); +#ifdef USE_GPU + // GPU-enabled Layout + args::Group gpu_opts(parser, "[ GPU ]"); + args::Flag gpu_compute(gpu_opts, "gpu", "Enable computation with GPU.", {"gpu"}); +#endif args::Group processing_info_opts(parser, "[ Processsing Information ]"); args::Flag progress(processing_info_opts, "progress", "Write the current progress to stderr.", {'P', "progress"}); args::Group program_info_opts(parser, "[ Program Information ]"); @@ -325,29 +330,61 @@ int main_layout(int argc, char **argv) { }); //double max_x = 0; - algorithms::path_linear_sgd_layout( - graph, - path_index, - path_sgd_use_paths, - path_sgd_iter_max, - 0, - 
path_sgd_min_term_updates,
-                sgd_delta,
-                eps,
-                path_sgd_max_eta,
-                path_sgd_zipf_theta,
-                path_sgd_zipf_space,
-                path_sgd_zipf_space_max,
-                path_sgd_zipf_space_quantization_step,
-                path_sgd_cooling,
-                num_threads,
-                show_progress,
-                snapshot,
-                snapshot_prefix,
-                graph_X,
-                graph_Y
-                );
-
+#ifdef USE_GPU
+        if (gpu_compute) { // run on GPU
+            algorithms::path_linear_sgd_layout_gpu(
+                graph,
+                path_index,
+                path_sgd_use_paths,
+                path_sgd_iter_max,
+                0,
+                path_sgd_min_term_updates,
+                sgd_delta,
+                eps,
+                path_sgd_max_eta,
+                path_sgd_zipf_theta,
+                path_sgd_zipf_space,
+                path_sgd_zipf_space_max,
+                path_sgd_zipf_space_quantization_step,
+                path_sgd_cooling,
+                num_threads,
+                show_progress,
+                snapshot,
+                snapshot_prefix,
+                graph_X,
+                graph_Y
+                );
+        }
+#endif
+
+#ifdef USE_GPU
+        if (!gpu_compute) { // run on CPU
+#endif
+            algorithms::path_linear_sgd_layout(
+                graph,
+                path_index,
+                path_sgd_use_paths,
+                path_sgd_iter_max,
+                0,
+                path_sgd_min_term_updates,
+                sgd_delta,
+                eps,
+                path_sgd_max_eta,
+                path_sgd_zipf_theta,
+                path_sgd_zipf_space,
+                path_sgd_zipf_space_max,
+                path_sgd_zipf_space_quantization_step,
+                path_sgd_cooling,
+                num_threads,
+                show_progress,
+                snapshot,
+                snapshot_prefix,
+                graph_X,
+                graph_Y
+                );
+#ifdef USE_GPU
+        }
+#endif
         // drop out of atomic stuff... maybe not the best way to do this
         // TODO: use directly the atomic vector?
         std::vector<double> X_final(graph_X.size());

From dd7257ef9e4d3b0ceecc49535ea8e237727034a6 Mon Sep 17 00:00:00 2001
From: tonyjie
Date: Sun, 27 Oct 2024 21:38:11 -0400
Subject: [PATCH 2/2] update readme for installation and description

---
 README.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/README.md b/README.md
index d8789239b..9c07de01f 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,21 @@ Static builds are unlikely to be supported on OSX, and require appropriate stati
 
 For more information on optimisations, debugging and GNU Guix builds, see [INSTALL.md](./INSTALL.md) and [CMakeLists.txt](./CMakeLists.txt).
 
+### building with GPU
+
+If you have a GPU and CUDA installed, you can build odgi with GPU support to use our GPU-accelerated `odgi layout`. This provides up to a 57.3x speedup over the CPU solution on an NVIDIA A100 GPU, reducing execution time from hours to minutes. Check out the [paper](https://arxiv.org/abs/2409.00876) and [repo](https://github.com/tonyjie/odgi) for detailed performance numbers. The work will be presented at [SC'24](https://sc24.conference-program.com/presentation/?id=pap443&sess=sess382)!
+
+Simply pass `-DUSE_GPU=ON` when running cmake:
+```
+cmake -DUSE_GPU=ON -H. -Bbuild && cmake --build build -- -j 3
+```
+
+To run `odgi layout` on the GPU, simply add `--gpu` to the other arguments, for example:
+```
+odgi layout -i ${OG_FILE} -o ${LAY_FILE} --threads ${NUM_THREAD} --gpu
+```
+
+
 ### Nix build
 
 If you have `nix`, build and installation in your profile are as simple as:
@@ -120,6 +135,8 @@ work with output from `odgi stats`! For more details take a look at the document
 
 **Andrea Guarracino\*, Simon Heumos\*, Sven Nahnsen, Pjotr Prins, Erik Garrison**. [ODGI: understanding pangenome graphs](https://doi.org/10.1093/bioinformatics/btac308), Bioinformatics, 2022\
 **\*Shared first authorship**
+**Jiajie Li, Jan-Niklas Schmelzle, Yixiao Du, Simon Heumos, Andrea Guarracino, Giulia Guidi, Pjotr Prins, Erik Garrison, Zhiru Zhang**. 
[Rapid GPU-Based Pangenome Graph Layout](https://arxiv.org/abs/2409.00876), SC (The International Conference for High Performance Computing, Networking, Storage, and Analysis), 2024 + ## funding sources `odgi` has been funded through a variety of mechanisms, including a Wellcome Sanger PhD fellowship and diverse NIH and NSF grants (listed in our paper), as well as funding from the State of Tennessee.