Skip to content

Commit

Permalink
opencl tile-based
Browse files Browse the repository at this point in the history
  • Loading branch information
kali committed Sep 6, 2022
1 parent 8423806 commit 6f34d8a
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 91 deletions.
106 changes: 46 additions & 60 deletions linalg/matmul-bench/benches/matmul.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,47 +8,10 @@ extern crate cblas;

use criterion::measurement::WallTime;
use criterion::*;
use tract_data::internal::*;

macro_rules! b {
($id:ident) => {
b!($id, None);
};
($id:ident, $tile_constraint:expr) => {
pub fn $id(crit: &mut BenchmarkGroup<WallTime>, m: usize, k: usize, n: usize) {
let constraint: Option<(usize, usize)> = $tile_constraint;
if let Some((mr, nr)) = constraint {
if m % mr != 0 || n % nr != 0 {
return
}
}
let a = vec![0f32; m * k];
let b = vec![0f32; k * n];
let mut c = vec![0f32; m * n];
crit.bench_function(stringify!($id), |be| {
be.iter(|| matmul_bench::$id(m, k, n, &a, &b, &mut c))
});
}

};
}

b!(naive);
b!(ctile_1x1);
b!(tile_2x2);
b!(ctile_2x2);
b!(tile_4x4);
b!(ctile_4x4);
b!(cpacked_tile_4x4);
b!(tile_8x8);
b!(ctile_8x8);
b!(cpacked_tile_8x8);
b!(matrixmultiply);
#[cfg(feature = "blas")]
b!(cblas);
b!(tract);
#[cfg(feature = "opencl")]
b!(opencl, Some((8, 8)));
use matmul_bench::opencl::*;
use matmul_bench::*;
use tract_data::internal::*;

pub fn tract_blaslike(
crit: &mut BenchmarkGroup<WallTime>,
Expand Down Expand Up @@ -100,28 +63,51 @@ pub fn tract_blaslike(
}
}

fn matmul(c: &mut Criterion, m: usize, k: usize, n: usize) {
let mut c = c.benchmark_group(format!("{}x{}x{}", m, k, n));
c.throughput(Throughput::Elements((m * k * n) as _));
naive(&mut c, m, k, n);
ctile_1x1(&mut c, m, k, n);
tile_2x2(&mut c, m, k, n);
ctile_2x2(&mut c, m, k, n);
tile_4x4(&mut c, m, k, n);
ctile_4x4(&mut c, m, k, n);
cpacked_tile_4x4(&mut c, m, k, n);
tile_8x8(&mut c, m, k, n);
ctile_8x8(&mut c, m, k, n);
cpacked_tile_8x8(&mut c, m, k, n);
matrixmultiply(&mut c, m, k, n);
fn matmul(crit: &mut Criterion, m: usize, k: usize, n: usize) {
let mut crit = crit.benchmark_group(format!("{}x{}x{}", m, k, n));
crit.throughput(Throughput::Elements((m * k * n) as _));

let a = vec![0f32; m * k];
let b = vec![0f32; k * n];
let mut c = vec![0f32; m * n];

macro_rules! b {
($id:ident) => {
b!($id, None);
};
($id:ident, $tile_constraint:expr) => {
let constraint: Option<(usize, usize)> = $tile_constraint;
if let Some((mr, nr)) = constraint {
if m % mr != 0 || n % nr != 0 {
return;
}
}
crit.bench_function(stringify!($id), |be| be.iter(|| $id(m, k, n, &a, &b, &mut c)));
};
}

b!(naive);
b!(ctile_1x1);
b!(tile_2x2);
b!(ctile_2x2);
b!(tile_4x4);
b!(ctile_4x4);
b!(cpacked_tile_4x4);
b!(tile_8x8);
b!(ctile_8x8);
b!(cpacked_tile_8x8);
b!(matrixmultiply);
#[cfg(feature = "blas")]
cblas(&mut c, m, k, n);
tract(&mut c, m, k, n);
tract_blaslike(&mut c, m, k, n, f32::datum_type());
tract_blaslike(&mut c, m, k, n, f16::datum_type());
b!(cblas);
b!(tract);
#[cfg(feature = "opencl")]
opencl(&mut c, m, k, n);
c.finish();
{
b!(opencl_gemm1);
b!(opencl_gemm_1_with_local_2x2, Some((2, 2)));
}
tract_blaslike(&mut crit, m, k, n, f32::datum_type());
tract_blaslike(&mut crit, m, k, n, f16::datum_type());
crit.finish();
}

fn big(c: &mut Criterion) {
Expand Down
24 changes: 9 additions & 15 deletions linalg/matmul-bench/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ extern crate blis_src;
extern crate cblas;

#[cfg(feature = "opencl")]
mod opencl;
pub mod opencl;

use tract_data::internal::*;
use tract_linalg::frame::mmm::FusedSpec;
Expand Down Expand Up @@ -408,12 +408,6 @@ pub fn cblas(m: usize, k: usize, n: usize, a: &[f32], b: &[f32], c: &mut [f32])
}
}

#[allow(unused_variables, unused_mut)]
#[cfg(feature = "opencl")]
pub fn opencl(m: usize, k: usize, n: usize, a: &[f32], b: &[f32], c: &mut [f32]) {
opencl::run(m, k, n, a, b, c)
}

pub fn tract(m: usize, k: usize, n: usize, a: &[f32], b: &[f32], c: &mut [f32]) {
unsafe {
let mmm = tract_linalg::ops()
Expand Down Expand Up @@ -464,9 +458,9 @@ pub fn tract(m: usize, k: usize, n: usize, a: &[f32], b: &[f32], c: &mut [f32])

#[cfg(test)]
mod test {
use tract_data::internal::DimLike;
use super::*;

fn pack_a(a: &[f32], m: usize, k: usize, r: usize) -> Vec<f32> {
pub fn pack_a(a: &[f32], m: usize, k: usize, r: usize) -> Vec<f32> {
let panels = m.divceil(r);
let mut pa = vec![0f32; m * k];
for p in 0..panels {
Expand All @@ -482,7 +476,7 @@ mod test {
pa
}

fn pack_b(b: &[f32], k: usize, n: usize, r: usize) -> Vec<f32> {
pub fn pack_b(b: &[f32], k: usize, n: usize, r: usize) -> Vec<f32> {
let panels = n.divceil(r);
let mut pb = vec![0f32; k * n];
for p in 0..panels {
Expand All @@ -497,6 +491,8 @@ mod test {
}
pb
}

#[macro_export]
macro_rules! t {
($id:ident) => {
t!($id, None);
Expand All @@ -521,10 +517,10 @@ mod test {
}
}
if let Some(r) = $pack {
a = pack_a(&*a, m, k, r);
b = pack_b(&*b, k, n, r);
a = $crate::test::pack_a(&*a, m, k, r);
b = $crate::test::pack_b(&*b, k, n, r);
}
crate::$id(m, k, n, &a, &b, &mut found);
$id(m, k, n, &a, &b, &mut found);
assert_eq!(found, expected);
}
}
Expand All @@ -545,6 +541,4 @@ mod test {
#[cfg(feature = "blas")]
t!(cblas);
t!(tract);
#[cfg(feature = "opencl")]
t!(opencl);
}
71 changes: 55 additions & 16 deletions linalg/matmul-bench/src/opencl.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::{ptr::null_mut, sync::Mutex};
use std::ptr::null_mut;

use opencl3::{
command_queue::{CommandQueue, CL_QUEUE_PROFILING_ENABLE},
Expand All @@ -12,7 +12,7 @@ use opencl3::{
types::{cl_float, CL_NON_BLOCKING},
};

struct Gpu {
pub struct Gpu {
context: Context,
queue: CommandQueue,
kernel: Kernel,
Expand All @@ -30,9 +30,13 @@ impl Gpu {
let device = Device::new(device);
let context = Context::from_device(&device).expect("Context::from_device failed");

let queue =
CommandQueue::create(&context, context.default_device(), CL_QUEUE_PROFILING_ENABLE)
.expect("CommandQueue::create failed");
let queue = CommandQueue::create_with_properties(
&context,
context.default_device(),
CL_QUEUE_PROFILING_ENABLE,
0,
)
.expect("CommandQueue::create failed");

let kernel_cl = r#"
__kernel void gemm_0(const int M, const int K, const int N,
Expand Down Expand Up @@ -150,6 +154,7 @@ impl Gpu {
a: &[f32],
b: &[f32],
c: &mut [f32],
local_sizes: Option<(usize, usize)>,
) -> Result<(), ClError> {
let mut a_cl =
Buffer::<cl_float>::create(&self.context, CL_MEM_READ_ONLY, m * k, null_mut())?;
Expand All @@ -162,17 +167,19 @@ impl Gpu {
let write_a = self.queue.enqueue_write_buffer(&mut a_cl, CL_NON_BLOCKING, 0, a, &[])?;
let write_b = self.queue.enqueue_write_buffer(&mut b_cl, CL_NON_BLOCKING, 0, b, &[])?;

let run = ExecuteKernel::new(&self.kernel)
.set_arg(&(m as i32))
let mut run = ExecuteKernel::new(&self.kernel);
run.set_arg(&(m as i32))
.set_arg(&(k as i32))
.set_arg(&(n as i32))
.set_arg(&a_cl)
.set_arg(&b_cl)
.set_arg(&c_cl)
.set_global_work_sizes(&[m / self.mr, n / self.nr])
.set_local_work_sizes(&[2, 2])
.set_event_wait_list(&[write_a.get(), write_b.get()])
.enqueue_nd_range(&self.queue)?;
.set_event_wait_list(&[write_a.get(), write_b.get()]);
if let Some((mr, nr)) = local_sizes {
run.set_local_work_sizes(&[mr, nr]);
}
let run = run.enqueue_nd_range(&self.queue).unwrap();

let read_c =
self.queue.enqueue_read_buffer(&mut c_cl, CL_NON_BLOCKING, 0, c, &[run.get()])?;
Expand All @@ -181,12 +188,44 @@ impl Gpu {
}
}

lazy_static::lazy_static! {
static ref GPU: Mutex<std::collections::HashMap::<&'static str, Gpu>> = Default::default();
#[allow(non_upper_case_globals)]
mod kernels {
pub use super::*;
use std::sync::Mutex;

macro_rules! kernel {
($id:ident, $mr: expr, $nr: expr) => {
lazy_static::lazy_static! {
pub static ref $id: Mutex<Gpu> = {
Mutex::new(Gpu::create(stringify!($id), $mr, $nr))
};
}
};
}

kernel!(gemm_1, 4, 4);
}

pub fn opencl_gemm1(m: usize, k: usize, n: usize, a: &[f32], b: &[f32], c: &mut [f32]) {
kernels::gemm_1.lock().unwrap().run(m, k, n, a, b, c, None).unwrap();
}

pub fn opencl_gemm_1_with_local_2x2(
m: usize,
k: usize,
n: usize,
a: &[f32],
b: &[f32],
c: &mut [f32],
) {
kernels::gemm_1.lock().unwrap().run(m, k, n, a, b, c, Some((2, 2))).unwrap();
}

pub fn run(m: usize, k: usize, n: usize, a: &[f32], b: &[f32], c: &mut [f32]) {
let mut gpus = GPU.lock().unwrap();
let gpu = gpus.entry("gemm_1").or_insert_with(|| Gpu::create("gemm_1", 4, 4));
gpu.run(m, k, n, a, b, c).unwrap();
#[cfg(test)]
mod test {
use super::*;
use crate::t;

t!(opencl_gemm1);
t!(opencl_gemm_1_with_local_2x2);
}

0 comments on commit 6f34d8a

Please sign in to comment.