From 54f57547d3f706a6b443361aa40f5c115cb94a7c Mon Sep 17 00:00:00 2001 From: tonyjie Date: Sun, 27 Oct 2024 19:39:30 -0400 Subject: [PATCH 1/2] feature: enable GPU to accelerate odgi-layout with massive speedup. Build with a flag USE_GPU --- CMakeLists.txt | 39 ++- src/algorithms/path_sgd_layout.cpp | 39 ++- src/algorithms/path_sgd_layout.hpp | 26 +- src/cuda/layout.cu | 475 +++++++++++++++++++++++++++++ src/cuda/layout.h | 82 +++++ src/subcommand/layout_main.cpp | 83 +++-- 6 files changed, 718 insertions(+), 26 deletions(-) create mode 100644 src/cuda/layout.cu create mode 100644 src/cuda/layout.h diff --git a/CMakeLists.txt b/CMakeLists.txt index dc6e9b444..dd781923d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ cmake_minimum_required(VERSION 3.16) # Project's name -project(odgi) +project(odgi LANGUAGES CXX) # Enforce c++17 set(CMAKE_CXX_STANDARD 17) @@ -30,6 +30,8 @@ set(CMAKE_CXX_STANDARD 17) option(PIC "Compile all odgi sources with -fPIC - required for shared libs" ON) option(ASAN "Use address sanitiser" OFF) option(INLINE_HANDLEGRAPH_SOURCES "Compile handlegraph sources inline" OFF) +# Add the GPU option (default is OFF) +option(USE_GPU "Enable GPU support if available" OFF) include(ExternalProject) include(FeatureSummary) @@ -37,6 +39,18 @@ include(FeatureSummary) find_package(PkgConfig REQUIRED) find_package(pybind11 CONFIG) find_package(OpenMP) +# Find CUDA if GPU option is enabled +if (USE_GPU) + find_package(CUDA REQUIRED) # Adjust this if you're using modern CMake with FindCUDAToolkit. + if(CUDA_FOUND) + enable_language(CUDA) + message(STATUS "CUDA found. GPU support enabled.") + else() + message(FATAL_ERROR "CUDA not found! Cannot enable GPU support.") + endif() +else() + message(STATUS "Building with CPU-only support.") +endif() feature_summary( FATAL_ON_MISSING_REQUIRED_PACKAGES @@ -621,6 +635,9 @@ add_library(odgi_objs OBJECT ${lodepng_SOURCES} ${handlegraph_sources} ) +if (USE_GPU) + target_sources(odgi_objs PRIVATE "${CMAKE_SOURCE_DIR}/src/cuda/layout.cu") +endif (USE_GPU) set(odgi_DEPS # lodepng @@ -687,6 +704,9 @@ set(odgi_INCLUDES "${xoshiro_INCLUDE}" "${atomicbitvector_INCLUDE}" "${mio_INCLUDE}") +if (USE_GPU) + list(APPEND odgi_INCLUDES "${CUDA_INCLUDE_DIRS}") +endif (USE_GPU) set(odgi_LIBS jemalloc @@ -803,9 +823,26 @@ set(odgi_HEADERS ${CMAKE_SOURCE_DIR}/src/algorithms/path_length.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/path_keep.hpp ${CMAKE_SOURCE_DIR}/src/algorithms/diffpriv.cpp) +if (USE_GPU) + list(APPEND odgi_HEADERS "${CMAKE_SOURCE_DIR}/src/cuda/layout.h") +endif (USE_GPU) target_include_directories(odgi_objs PUBLIC ${odgi_INCLUDES}) +if (USE_GPU) + include(FindCUDA/select_compute_arch) + CUDA_DETECT_INSTALLED_GPUS(INSTALLED_GPU_CCS_1) + string(STRIP "${INSTALLED_GPU_CCS_1}" INSTALLED_GPU_CCS_2) + string(REPLACE " " ";" INSTALLED_GPU_CCS_3 "${INSTALLED_GPU_CCS_2}") + string(REPLACE "." "" CUDA_ARCH_LIST "${INSTALLED_GPU_CCS_3}") + SET(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH_LIST}) + message(STATUS "CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}") + # Apply compile options. Detects different GPU compute capability. 
+    target_compile_options(odgi_objs PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -std=c++17 -Xcompiler=-fopenmp -lineinfo>)
+    # add USE_GPU macro when building with GPU
+    target_compile_definitions(odgi_objs PRIVATE USE_GPU)
+endif (USE_GPU)
+
 add_library(libodgi_static STATIC $<TARGET_OBJECTS:odgi_objs>)
 set_target_properties(libodgi_static PROPERTIES OUTPUT_NAME "odgi")
 set_target_properties(libodgi_static PROPERTIES PUBLIC_HEADER "${odgi_HEADERS}")
diff --git a/src/algorithms/path_sgd_layout.cpp b/src/algorithms/path_sgd_layout.cpp
index 6167ea806..4a4d4c4e4 100644
--- a/src/algorithms/path_sgd_layout.cpp
+++ b/src/algorithms/path_sgd_layout.cpp
@@ -150,7 +150,7 @@ namespace odgi {
             } else {
                 eta.store(etas[iteration]); // update our learning rate
                 Delta_max.store(delta); // set our delta max to the threshold
-                if (iteration > first_cooling_iteration) {
+                if (iteration >= first_cooling_iteration) {
                     //std::cerr << std::endl << "setting cooling!!" << std::endl;
                     adj_theta.store(0.001);
                     cooling.store(true);
@@ -466,6 +466,43 @@ namespace odgi {
 #endif
             return etas;
         }
+#ifdef USE_GPU
+        void path_linear_sgd_layout_gpu(const PathHandleGraph &graph,
+                                        const xp::XP &path_index,
+                                        const std::vector<handlegraph::path_handle_t> &path_sgd_use_paths,
+                                        const uint64_t &iter_max,
+                                        const uint64_t &iter_with_max_learning_rate,
+                                        const uint64_t &min_term_updates,
+                                        const double &delta,
+                                        const double &eps,
+                                        const double &eta_max,
+                                        const double &theta,
+                                        const uint64_t &space,
+                                        const uint64_t &space_max,
+                                        const uint64_t &space_quantization_step,
+                                        const double &cooling_start,
+                                        const uint64_t &nthreads,
+                                        const bool &progress,
+                                        const bool &snapshot,
+                                        const std::string &snapshot_prefix,
+                                        std::vector<std::atomic<double>> &X,
+                                        std::vector<std::atomic<double>> &Y) {
+            cuda::layout_config_t config;
+            config.iter_max = iter_max;
+            config.min_term_updates = min_term_updates;
+            config.eta_max = eta_max;
+            config.eps = eps;
+            config.iter_with_max_learning_rate = (int32_t) iter_with_max_learning_rate;
+            config.first_cooling_iteration = std::floor(cooling_start * (double)iter_max);
+            config.theta = theta;
+            config.space = uint32_t(space);
+            config.space_max = uint32_t(space_max);
+            config.space_quantization_step = uint32_t(space_quantization_step);
+            config.nthreads = nthreads;
+            cuda::gpu_layout(config, dynamic_cast<const odgi::graph_t &>(graph), X, Y);
+            return;
+        }
+#endif
 /*
     void deterministic_path_linear_sgd(const PathHandleGraph &graph,
                                        const xp::XP &path_index,
diff --git a/src/algorithms/path_sgd_layout.hpp b/src/algorithms/path_sgd_layout.hpp
index 268248419..294962769 100644
--- a/src/algorithms/path_sgd_layout.hpp
+++ b/src/algorithms/path_sgd_layout.hpp
@@ -19,6 +19,9 @@
 #include "dirty_zipfian_int_distribution.h"
 #include "XoshiroCpp.hpp"
 #include "progress.hpp"
+#ifdef USE_GPU
+#include "cuda/layout.h"
+#endif
 
 namespace odgi {
     namespace algorithms {
@@ -53,7 +56,28 @@
                                     const uint64_t &iter_max,
                                     const uint64_t &iter_with_max_learning_rate,
                                     const double &eps);
-
+#ifdef USE_GPU
+        void path_linear_sgd_layout_gpu(const PathHandleGraph &graph,
+                                        const xp::XP &path_index,
+                                        const std::vector<handlegraph::path_handle_t> &path_sgd_use_paths,
+                                        const uint64_t &iter_max,
+                                        const uint64_t &iter_with_max_learning_rate,
+                                        const uint64_t &min_term_updates,
+                                        const double &delta,
+                                        const double &eps,
+                                        const double &eta_max,
+                                        const double &theta,
+                                        const uint64_t &space,
+                                        const uint64_t &space_max,
+                                        const uint64_t &space_quantization_step,
+                                        const double &cooling_start,
+                                        const uint64_t &nthreads,
+                                        const bool &progress,
+                                        const bool &snapshot,
+                                        const std::string &snapshot_prefix,
+                                        std::vector<std::atomic<double>> &X,
+                                        std::vector<std::atomic<double>> &Y);
+#endif
        /// single threaded and deterministic
path guided 1D linear SGD /* void deterministic_path_linear_sgd_layout(const PathHandleGraph &graph, diff --git a/src/cuda/layout.cu b/src/cuda/layout.cu new file mode 100644 index 000000000..52e574a27 --- /dev/null +++ b/src/cuda/layout.cu @@ -0,0 +1,475 @@ +#include "layout.h" +#include +#include +#include "cuda_runtime_api.h" + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + printf("Failed, NCCL error %s:%d '%s'\n", \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +namespace cuda { + +__global__ void cuda_device_init(curandState_t *rnd_state_tmp, curandStateCoalesced_t *rnd_state) { + int32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + // initialize curandState with original curand implementation + curand_init(42+tid, tid, 0, &rnd_state_tmp[tid]); + // copy to coalesced data structure + rnd_state[blockIdx.x].d[threadIdx.x] = rnd_state_tmp[tid].d; + rnd_state[blockIdx.x].w0[threadIdx.x] = rnd_state_tmp[tid].v[0]; + rnd_state[blockIdx.x].w1[threadIdx.x] = rnd_state_tmp[tid].v[1]; + rnd_state[blockIdx.x].w2[threadIdx.x] = rnd_state_tmp[tid].v[2]; + rnd_state[blockIdx.x].w3[threadIdx.x] = rnd_state_tmp[tid].v[3]; + rnd_state[blockIdx.x].w4[threadIdx.x] = rnd_state_tmp[tid].v[4]; +} + +/** + * @brief: Return 32-bits of pseudorandomness from an XORWOW generator. from "curand_kernel.h" + * For some use cases, we don't need floating point uniform distribution. So we don't need to call `curand_uniform_coalesced` as below. We shall use this function. + * \param state - Pointer to state to update + * \param thread_id - Thread id + * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use. +*/ +__device__ +unsigned int curand_coalesced(curandStateCoalesced_t *state, uint32_t thread_id) { + // Return 32-bits of pseudorandomness from an XORWOW generator. 
+ uint32_t t; + t = (state->w0[thread_id] ^ (state->w0[thread_id] >> 2)); + state->w0[thread_id] = state->w1[thread_id]; + state->w1[thread_id] = state->w2[thread_id]; + state->w2[thread_id] = state->w3[thread_id]; + state->w3[thread_id] = state->w4[thread_id]; + state->w4[thread_id] = (state->w4[thread_id] ^ (state->w4[thread_id] << 4)) ^ (t ^ (t << 1)); + state->d[thread_id] += 362437; + return state->w4[thread_id] + state->d[thread_id]; +} + +__device__ +float curand_uniform_coalesced(curandStateCoalesced_t *state, uint32_t thread_id) { + // generate 32 bit pseudorandom value with XORWOW generator (see paper "Xorshift RNGs" by George Marsaglia); + // also used in curand library (see curand_kernel.h) + uint32_t t; + t = state->w0[thread_id] ^ (state->w0[thread_id] >> 2); + state->w0[thread_id] = state->w1[thread_id]; + state->w1[thread_id] = state->w2[thread_id]; + state->w2[thread_id] = state->w3[thread_id]; + state->w3[thread_id] = state->w4[thread_id]; + state->w4[thread_id] = (state->w4[thread_id] ^ (state->w4[thread_id] << 4)) ^ (t ^ (t << 1)); + state->d[thread_id] += 362437; + + uint32_t rnd_uint = state->d[thread_id] + state->w4[thread_id]; + + // convert to float; see curand_uniform.h + return _curand_uniform(rnd_uint); +} + + +__device__ double compute_zeta(uint32_t n, double theta) { + double ans = 0.0; + for (uint32_t i = 1; i <= n; i++) { + ans += pow(1.0 / double(i), theta); + } + return ans; +} + +// this function uses the cuda operation __powf, which is a faster but less precise alternative to the pow operation +__device__ uint32_t cuda_rnd_zipf(curandStateCoalesced_t *rnd_state, uint32_t n, double theta, double zeta2, double zetan) { + double alpha = 1.0 / (1.0 - theta); + double denominator = 1.0 - zeta2 / zetan; + if (denominator == 0.0) { + denominator = 1e-9; + } + double eta = (1.0 - __powf(2.0 / double(n), 1.0 - theta)) / (denominator); + + // INFO: curand_uniform generates random values between 0.0 (excluded) and 1.0 (included) + double u = 1.0 - curand_uniform_coalesced(rnd_state, threadIdx.x); + double uz = u * zetan; + + int64_t val = 0; + if (uz < 1.0) val = 1; + else if (uz < 1.0 + __powf(0.5, theta)) val = 2; + else val = 1 + int64_t(double(n) * __powf(eta * u - eta + 1.0, alpha)); + + if (val > n) { + //printf("WARNING: val: %ld, n: %u\n", val, uint32_t(n)); + val--; + } + assert(val >= 0); + assert(val <= n); + return uint32_t(val); +} + + +static __device__ __inline__ uint32_t __mysmid(){ + uint32_t smid; + asm volatile("mov.u32 %0, %%smid;" : "=r"(smid)); + return smid; +} + +/** +* @brief: update the coordinates of two visualization nodes in the 2D layout space +* This function is called multiple times in one `gpu_layout_kernel` in order to increase the data reuse. +* Each time, the warp shuffle intrinsics are used to change the selection of node 2 among the 32 threads in the warp. +* E.g. Iter : Step Pairs Selected would be: +* 1: (a0, b0), (a1, b1), (a2, b2), ..., (a31, b31) +* 2: (a0, b9), (a1, b0), (a2, b3), ..., (a31, b4) +* 3: (a0, b1), (a1, b4), (a2, b1), ..., (a31, b10) +* ... +* `b` is randomly chosen from the 32 threads in the warp. 
+* @param n1_pos_in_path: position of node 1 in the current selected path +* @param n1_id: id of node 1 +* @param n1_offset: offset of node 1 +* @param n2_pos_in_path: position of node 2 in the current selected path +* @param n2_id: id of node 2 +* @param n2_offset: offset of node 2 +* @param eta: an coefficient used in the update formula +* @param node_data: the data structure that stores the coordinates of all nodes +*/ +__device__ +void update_pos_gpu(int64_t &n1_pos_in_path, uint32_t &n1_id, int &n1_offset, + int64_t &n2_pos_in_path, uint32_t &n2_id, int &n2_offset, + double eta, + cuda::node_data_t &node_data) { + double term_dist = std::abs(static_cast(n1_pos_in_path) - static_cast(n2_pos_in_path)); + + if (term_dist < 1e-9) { + term_dist = 1e-9; + } + + double w_ij = 1.0 / term_dist; + + double mu = eta * w_ij; + if (mu > 1.0) { + mu = 1.0; + } + + float *x1 = &node_data.nodes[n1_id].coords[n1_offset]; + float *x2 = &node_data.nodes[n2_id].coords[n2_offset]; + float *y1 = &node_data.nodes[n1_id].coords[n1_offset + 1]; + float *y2 = &node_data.nodes[n2_id].coords[n2_offset + 1]; + double x1_val = double(*x1); + double x2_val = double(*x2); + double y1_val = double(*y1); + double y2_val = double(*y2); + + double dx = x1_val - x2_val; + double dy = y1_val - y2_val; + + if (dx == 0.0) { + dx = 1e-9; + } + + double mag = sqrt(dx * dx + dy * dy); + double delta = mu * (mag - term_dist) / 2.0; + //double delta_abs = std::abs(delta); + + // TODO implement delta max stop functionality + double r = delta / mag; + double r_x = r * dx; + double r_y = r * dy; + // TODO check current value before updating + atomicExch(x1, float(x1_val - r_x)); + atomicExch(x2, float(x2_val + r_x)); + atomicExch(y1, float(y1_val - r_y)); + atomicExch(y2, float(y2_val + r_y)); +} + +__global__ +void gpu_layout_kernel(int iter, cuda::layout_config_t config, curandStateCoalesced_t *rnd_state, double eta, double *zetas, + cuda::node_data_t node_data, cuda::path_data_t path_data, int sm_count) { + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + uint32_t smid = __mysmid(); + assert(smid < sm_count); + + curandStateCoalesced_t *thread_rnd_state = &rnd_state[smid]; + + __shared__ bool cooling[BLOCK_SIZE / WARP_SIZE]; + if (threadIdx.x % WARP_SIZE == 1) { + cooling[threadIdx.x / WARP_SIZE] = (iter >= config.first_cooling_iteration) || (curand_coalesced(thread_rnd_state, threadIdx.x) % 2 == 0); + } + + // select path + __shared__ uint32_t first_step_idx[BLOCK_SIZE / WARP_SIZE]; // BLOCK_SIZE/WARP_SIZE = 1024/32 = 32 + // each thread picks its own path + uint32_t step_idx = curand_coalesced(thread_rnd_state, threadIdx.x) % path_data.total_path_steps; + + uint32_t path_idx = path_data.element_array[step_idx].pidx; + path_t p = path_data.paths[path_idx]; + + if (p.step_count < 2) { + return; + } + assert(p.step_count > 1); + + // INFO: curand_uniform generates random values between 0.0 (excluded) and 1.0 (included) + uint32_t s1_idx = curand_coalesced(thread_rnd_state, threadIdx.x) % p.step_count; + assert(s1_idx < p.step_count); + uint32_t s2_idx; + + if (cooling[threadIdx.x / WARP_SIZE]) { + bool backward; + uint32_t jump_space; + if (s1_idx > 0 && (curand_coalesced(thread_rnd_state, threadIdx.x) % 2 == 0) || s1_idx == p.step_count-1) { + // go backward + backward = true; + jump_space = min(config.space, s1_idx); + } else { + // go forward + backward = false; + jump_space = min(config.space, p.step_count - s1_idx - 1); + } + uint32_t space = jump_space; + if (jump_space > config.space_max) { + space = 
config.space_max + (jump_space - config.space_max) / config.space_quantization_step + 1;
+        }
+
+        uint32_t z_i = cuda_rnd_zipf(thread_rnd_state, jump_space, config.theta, zetas[2], zetas[space]);
+
+        s2_idx = backward ? s1_idx - z_i : s1_idx + z_i;
+    } else {
+        do {
+            s2_idx = curand_coalesced(thread_rnd_state, threadIdx.x) % p.step_count;
+        } while (s1_idx == s2_idx);
+    }
+    assert(s1_idx < p.step_count);
+    assert(s2_idx < p.step_count);
+    assert(s1_idx != s2_idx);
+
+
+    uint32_t n1_id = p.elements[s1_idx].node_id;
+    int64_t n1_pos_in_path = p.elements[s1_idx].pos;
+    bool n1_is_rev = (n1_pos_in_path < 0)? true: false;
+    n1_pos_in_path = std::abs(n1_pos_in_path);
+
+    uint32_t n2_id = p.elements[s2_idx].node_id;
+    int64_t n2_pos_in_path = p.elements[s2_idx].pos;
+    bool n2_is_rev = (n2_pos_in_path < 0)? true: false;
+    n2_pos_in_path = std::abs(n2_pos_in_path);
+
+    uint32_t n1_seq_length = node_data.nodes[n1_id].seq_length;
+    bool n1_use_other_end = (curand_coalesced(thread_rnd_state, threadIdx.x) % 2 == 0) ? true : false;
+    if (n1_use_other_end) {
+        n1_pos_in_path += uint64_t{n1_seq_length};
+        n1_use_other_end = !n1_is_rev;
+    } else {
+        n1_use_other_end = n1_is_rev;
+    }
+
+    uint32_t n2_seq_length = node_data.nodes[n2_id].seq_length;
+    bool n2_use_other_end = (curand_coalesced(thread_rnd_state, threadIdx.x) % 2 == 0) ? true : false;
+    if (n2_use_other_end) {
+        n2_pos_in_path += uint64_t{n2_seq_length};
+        n2_use_other_end = !n2_is_rev;
+    } else {
+        n2_use_other_end = n2_is_rev;
+    }
+
+    int n1_offset = n1_use_other_end? 2: 0;
+    int n2_offset = n2_use_other_end? 2: 0;
+
+    // Update Coordinates based on the data of selected nodes: n_pos_in_path, n_id, n_offset
+    update_pos_gpu(n1_pos_in_path, n1_id, n1_offset,
+                   n2_pos_in_path, n2_id, n2_offset,
+                   eta, node_data);
+}
+
+
+void gpu_layout(layout_config_t config, const odgi::graph_t &graph, std::vector<std::atomic<double>> &X, std::vector<std::atomic<double>> &Y) {
+
+
+    std::cout << "===== Use GPU to compute odgi-layout =====" << std::endl;
+    // get cuda device property, and get the SM count
+    cudaDeviceProp prop;
+    CUDACHECK(cudaGetDeviceProperties(&prop, 0));
+    int sm_count = prop.multiProcessorCount;
+
+    // create eta array
+    double *etas;
+    cudaMallocManaged(&etas, config.iter_max * sizeof(double));
+
+    const int32_t iter_max = config.iter_max;
+    const int32_t iter_with_max_learning_rate = config.iter_with_max_learning_rate;
+    const double w_max = 1.0;
+    const double eps = config.eps;
+    const double eta_max = config.eta_max;
+    const double eta_min = eps / w_max;
+    const double lambda = log(eta_max / eta_min) / ((double) iter_max - 1);
+    for (int32_t i = 0; i < config.iter_max; i++) {
+        double eta = eta_max * exp(-lambda * (std::abs(i - iter_with_max_learning_rate)));
+        etas[i] = isnan(eta)?
eta_min : eta; + } + + // create node data structure + // consisting of sequence length and coords + uint32_t node_count = graph.get_node_count(); + assert(graph.min_node_id() == 1); + assert(graph.max_node_id() == node_count); + assert(graph.max_node_id() - graph.min_node_id() + 1 == node_count); + + cuda::node_data_t node_data; + node_data.node_count = node_count; + cudaMallocManaged(&node_data.nodes, node_count * sizeof(cuda::node_t)); + for (int node_idx = 0; node_idx < node_count; node_idx++) { + //assert(graph.has_node(node_idx)); + cuda::node_t *n_tmp = &node_data.nodes[node_idx]; + + // sequence length + const handlegraph::handle_t h = graph.get_handle(node_idx + 1, false); + // NOTE: unable store orientation (reverse), since this information is path dependent + n_tmp->seq_length = graph.get_length(h); + + // copy random coordinates + n_tmp->coords[0] = float(X[node_idx * 2].load()); + n_tmp->coords[1] = float(Y[node_idx * 2].load()); + n_tmp->coords[2] = float(X[node_idx * 2 + 1].load()); + n_tmp->coords[3] = float(Y[node_idx * 2 + 1].load()); + } + + + // create path data structure + uint32_t path_count = graph.get_path_count(); + cuda::path_data_t path_data; + path_data.path_count = path_count; + path_data.total_path_steps = 0; + cudaMallocManaged(&path_data.paths, path_count * sizeof(cuda::path_t)); + + vector path_handles{}; + path_handles.reserve(path_count); + graph.for_each_path_handle( + [&] (const odgi::path_handle_t& p) { + path_handles.push_back(p); + path_data.total_path_steps += graph.get_step_count(p); + }); + cudaMallocManaged(&path_data.element_array, path_data.total_path_steps * sizeof(path_element_t)); + + // get length and starting position of all paths + uint64_t first_step_counter = 0; + for (int path_idx = 0; path_idx < path_count; path_idx++) { + odgi::path_handle_t p = path_handles[path_idx]; + int step_count = graph.get_step_count(p); + path_data.paths[path_idx].step_count = step_count; + path_data.paths[path_idx].first_step_in_path = first_step_counter; + first_step_counter += step_count; + } + +#pragma omp parallel for num_threads(config.nthreads) + for (int path_idx = 0; path_idx < path_count; path_idx++) { + odgi::path_handle_t p = path_handles[path_idx]; + //std::cout << graph.get_path_name(p) << ": " << graph.get_step_count(p) << std::endl; + + uint32_t step_count = path_data.paths[path_idx].step_count; + uint64_t first_step_in_path = path_data.paths[path_idx].first_step_in_path; + if (step_count == 0) { + path_data.paths[path_idx].elements = NULL; + } else { + path_element_t *cur_path = &path_data.element_array[first_step_in_path]; + path_data.paths[path_idx].elements = cur_path; + + odgi::step_handle_t s = graph.path_begin(p); + int64_t pos = 1; + // Iterate through path + for (int step_idx = 0; step_idx < step_count; step_idx++) { + odgi::handle_t h = graph.get_handle_of_step(s); + //std::cout << graph.get_id(h) << std::endl; + + cur_path[step_idx].node_id = graph.get_id(h) - 1; + cur_path[step_idx].pidx = uint32_t(path_idx); + // store position negative when handle reverse + if (graph.get_is_reverse(h)) { + cur_path[step_idx].pos = -pos; + } else { + cur_path[step_idx].pos = pos; + } + pos += graph.get_length(h); + + // get next step + if (graph.has_next_step(s)) { + s = graph.get_next_step(s); + } else if (!(step_idx == step_count-1)) { + // should never be reached + std::cout << "Error: Here should be another step" << std::endl; + } + } + } + } + + // cache zipf zetas + auto start_zeta = std::chrono::high_resolution_clock::now(); + double 
*zetas;
+    uint64_t zetas_cnt = ((config.space <= config.space_max)? config.space : (config.space_max + (config.space - config.space_max) / config.space_quantization_step + 1)) + 1;
+    cudaMallocManaged(&zetas, zetas_cnt * sizeof(double));
+    double zeta_tmp = 0.0;
+    for (uint64_t i = 1; i < config.space + 1; i++) {
+        zeta_tmp += dirtyzipf::fast_precise_pow(1.0 / i, config.theta);
+        if (i <= config.space_max) {
+            zetas[i] = zeta_tmp;
+        }
+        if (i >= config.space_max && (i - config.space_max) % config.space_quantization_step == 0) {
+            zetas[config.space_max + 1 + (i - config.space_max) / config.space_quantization_step] = zeta_tmp;
+        }
+    }
+    auto end_zeta = std::chrono::high_resolution_clock::now();
+    uint32_t duration_zeta_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end_zeta - start_zeta).count();
+
+    const uint64_t block_size = BLOCK_SIZE;
+    uint64_t block_nbr = (config.min_term_updates + block_size - 1) / block_size;
+
+    curandState_t *rnd_state_tmp;
+    curandStateCoalesced_t *rnd_state;
+    CUDACHECK(cudaMallocManaged(&rnd_state_tmp, sm_count * block_size * sizeof(curandState_t)));
+    CUDACHECK(cudaMallocManaged(&rnd_state, sm_count * sizeof(curandStateCoalesced_t)));
+    cuda_device_init<<<sm_count, block_size>>>(rnd_state_tmp, rnd_state);
+    CUDACHECK(cudaGetLastError());
+    CUDACHECK(cudaDeviceSynchronize());
+    cudaFree(rnd_state_tmp);
+
+    for (int iter = 0; iter < config.iter_max; iter++) {
+        gpu_layout_kernel<<<block_nbr, block_size>>>(iter, config, rnd_state, etas[iter], zetas, node_data, path_data, sm_count);
+        // check error
+        CUDACHECK(cudaGetLastError());
+        CUDACHECK(cudaDeviceSynchronize());
+    }
+
+    // copy coords back to X, Y vectors
+    for (int node_idx = 0; node_idx < node_count; node_idx++) {
+        cuda::node_t *n = &(node_data.nodes[node_idx]);
+        // coords[0], coords[1], coords[2], coords[3] are stored consecutively.
+ float *coords = n->coords; + // check if coordinates valid (not NaN or infinite) + for (int i = 0; i < 4; i++) { + if (!isfinite(coords[i])) { + std::cout << "WARNING: invalid coordiate" << std::endl; + } + } + X[node_idx * 2].store(double(coords[0])); + Y[node_idx * 2].store(double(coords[1])); + X[node_idx * 2 + 1].store(double(coords[2])); + Y[node_idx * 2 + 1].store(double(coords[3])); + //std::cout << "coords of " << node_idx << ": [" << X[node_idx*2] << "; " << Y[node_idx*2] << "] ; [" << X[node_idx*2+1] << "; " << Y[node_idx*2+1] <<"]\n"; + } + + // free memory + cudaFree(etas); + cudaFree(node_data.nodes); + cudaFree(path_data.paths); + cudaFree(path_data.element_array); + cudaFree(zetas); + cudaFree(rnd_state); + + return; +} + +} \ No newline at end of file diff --git a/src/cuda/layout.h b/src/cuda/layout.h new file mode 100644 index 000000000..9de514ea1 --- /dev/null +++ b/src/cuda/layout.h @@ -0,0 +1,82 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "odgi.hpp" +#include "XoshiroCpp.hpp" +#include "dirty_zipfian_int_distribution.h" + +namespace cuda { + + +struct __align__(8) node_t { + float coords[4]; + int32_t seq_length; +}; +struct node_data_t { + uint32_t node_count; + node_t *nodes; +}; + + +struct __align__(8) path_element_t { + uint32_t pidx; + uint32_t node_id; + int64_t pos; // if position negative: reverse orientation +}; + +struct path_t { + uint32_t step_count; + uint64_t first_step_in_path; // precomputed position in path + path_element_t *elements; +}; + +struct path_data_t { + uint32_t path_count; + uint64_t total_path_steps; + path_t *paths; + path_element_t *element_array; +}; + + +// #define SM_COUNT 84 +// #define SM_COUNT 108 +#define BLOCK_SIZE 1024 +#define WARP_SIZE 32 +struct curandStateXORWOWCoalesced_t { + unsigned int d[BLOCK_SIZE]; + unsigned int w0[BLOCK_SIZE]; + unsigned int w1[BLOCK_SIZE]; + unsigned int w2[BLOCK_SIZE]; + unsigned int w3[BLOCK_SIZE]; + unsigned int w4[BLOCK_SIZE]; +}; +typedef struct curandStateXORWOWCoalesced_t curandStateCoalesced_t; + + +struct layout_config_t { + uint64_t iter_max; + uint64_t min_term_updates; + double eta_max; + double eps; + int32_t iter_with_max_learning_rate; + uint32_t first_cooling_iteration; + double theta; + uint32_t space; + uint32_t space_max; + uint32_t space_quantization_step; + int nthreads; +}; + + +void gpu_layout(layout_config_t config, const odgi::graph_t &graph, std::vector> &X, std::vector> &Y); + +} \ No newline at end of file diff --git a/src/subcommand/layout_main.cpp b/src/subcommand/layout_main.cpp index edae76e78..043c0a79e 100644 --- a/src/subcommand/layout_main.cpp +++ b/src/subcommand/layout_main.cpp @@ -85,6 +85,11 @@ int main_layout(int argc, char **argv) { args::ValueFlag nthreads(threading_opts, "N", "Number of threads to use for parallel operations.", {'t', "threads"}); +#ifdef USE_GPU + // GPU-enabled Layout + args::Group gpu_opts(parser, "[ GPU ]"); + args::Flag gpu_compute(gpu_opts, "gpu", "Enable computation with GPU.", {"gpu"}); +#endif args::Group processing_info_opts(parser, "[ Processsing Information ]"); args::Flag progress(processing_info_opts, "progress", "Write the current progress to stderr.", {'P', "progress"}); args::Group program_info_opts(parser, "[ Program Information ]"); @@ -325,29 +330,61 @@ int main_layout(int argc, char **argv) { }); //double max_x = 0; - algorithms::path_linear_sgd_layout( - graph, - path_index, - path_sgd_use_paths, - path_sgd_iter_max, - 0, - 
path_sgd_min_term_updates,
-                sgd_delta,
-                eps,
-                path_sgd_max_eta,
-                path_sgd_zipf_theta,
-                path_sgd_zipf_space,
-                path_sgd_zipf_space_max,
-                path_sgd_zipf_space_quantization_step,
-                path_sgd_cooling,
-                num_threads,
-                show_progress,
-                snapshot,
-                snapshot_prefix,
-                graph_X,
-                graph_Y
-                );
-
+#ifdef USE_GPU
+        if (gpu_compute) { // run on GPU
+            algorithms::path_linear_sgd_layout_gpu(
+                graph,
+                path_index,
+                path_sgd_use_paths,
+                path_sgd_iter_max,
+                0,
+                path_sgd_min_term_updates,
+                sgd_delta,
+                eps,
+                path_sgd_max_eta,
+                path_sgd_zipf_theta,
+                path_sgd_zipf_space,
+                path_sgd_zipf_space_max,
+                path_sgd_zipf_space_quantization_step,
+                path_sgd_cooling,
+                num_threads,
+                show_progress,
+                snapshot,
+                snapshot_prefix,
+                graph_X,
+                graph_Y
+                );
+        }
+#endif
+
+#ifdef USE_GPU
+        if (!gpu_compute) { // run on CPU
+#endif
+            algorithms::path_linear_sgd_layout(
+                graph,
+                path_index,
+                path_sgd_use_paths,
+                path_sgd_iter_max,
+                0,
+                path_sgd_min_term_updates,
+                sgd_delta,
+                eps,
+                path_sgd_max_eta,
+                path_sgd_zipf_theta,
+                path_sgd_zipf_space,
+                path_sgd_zipf_space_max,
+                path_sgd_zipf_space_quantization_step,
+                path_sgd_cooling,
+                num_threads,
+                show_progress,
+                snapshot,
+                snapshot_prefix,
+                graph_X,
+                graph_Y
+                );
+#ifdef USE_GPU
+        }
+#endif
         // drop out of atomic stuff... maybe not the best way to do this
         // TODO: use directly the atomic vector?
         std::vector<double> X_final(graph_X.size());

From dd7257ef9e4d3b0ceecc49535ea8e237727034a6 Mon Sep 17 00:00:00 2001
From: tonyjie
Date: Sun, 27 Oct 2024 21:38:11 -0400
Subject: [PATCH 2/2] update readme for installation and description

---
 README.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/README.md b/README.md
index d8789239b..9c07de01f 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,21 @@ Static builds are unlikely to be supported on OSX, and require appropriate stati
 
 For more information on optimisations, debugging and GNU Guix builds, see [INSTALL.md](./INSTALL.md) and [CMakeLists.txt](./CMakeLists.txt).
 
+### building with GPU
+
+If you have a GPU and CUDA installed, you can build odgi with GPU support to use our GPU-accelerated `odgi layout`. This provides up to a 57.3x speedup over the CPU solution on an NVIDIA A100 GPU, reducing execution time from hours to minutes. Check out the [paper](https://arxiv.org/abs/2409.00876) and [repo](https://github.com/tonyjie/odgi) for detailed performance numbers. The work will be presented at [SC'24](https://sc24.conference-program.com/presentation/?id=pap443&sess=sess382)!
+
+Simply pass `-DUSE_GPU=ON` when running cmake:
+```
+cmake -DUSE_GPU=ON -H. -Bbuild && cmake --build build -- -j 3
+```
+
+To run `odgi layout` on the GPU, simply add `--gpu` to the other arguments, for example:
+```
+odgi layout -i ${OG_FILE} -o ${LAY_FILE} --threads ${NUM_THREAD} --gpu
+```
+
+
 ### Nix build
 
 If you have `nix`, build and installation in your profile are as simple as:
@@ -120,6 +135,8 @@ work with output from `odgi stats`! For more details take a look at the document
 
 **Andrea Guarracino\*, Simon Heumos\*, Sven Nahnsen, Pjotr Prins, Erik Garrison**. [ODGI: understanding pangenome graphs](https://doi.org/10.1093/bioinformatics/btac308), Bioinformatics, 2022\
 **\*Shared first authorship**
+**Jiajie Li, Jan-Niklas Schmelzle, Yixiao Du, Simon Heumos, Andrea Guarracino, Giulia Guidi, Pjotr Prins, Erik Garrison, Zhiru Zhang**. 
[Rapid GPU-Based Pangenome Graph Layout](https://arxiv.org/abs/2409.00876), SC (The International Conference for High Performance Computing, Networking, Storage, and Analysis), 2024 + ## funding sources `odgi` has been funded through a variety of mechanisms, including a Wellcome Sanger PhD fellowship and diverse NIH and NSF grants (listed in our paper), as well as funding from the State of Tennessee.