Skip to content

Commit afb147e

Browse files
authored
Make constant memory opt-in, spill large statics to global memory (#217)
* Allow address spaces to propagate to LLVM. This looks like it was code that wasn't deleted after the refactor in decda87. * Spill large statics from constant to global memory. This isn't fully correct, as ideally we keep track of what we have put into constant memory and when it is filled up spill instead of only spilling when a static is big. But, this is materially better than what is there (which is a runtime error). An argument can be made to just _always_ use global memory and we don't have to worry about getting the packing right. Fixes #208. See also the debugging and discussion in #216. * Add `--use-constant-memory-space` flag, off by default. * Make it clear that `#[cuda_std::address_space(constant)]` still works.
1 parent 3b3e049 commit afb147e

File tree

3 files changed

+68
-14
lines changed

3 files changed

+68
-14
lines changed

crates/cuda_builder/src/lib.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,21 @@ pub struct CudaBuilder {
130130
///
131131
/// `true` by default.
132132
pub override_libm: bool,
133+
/// If `true`, the codegen will attempt to place `static` variables in CUDA's
134+
/// constant memory, which is fast but limited in size (~64KB total across all
135+
/// statics). The codegen avoids placing any single item too large, but it does not
136+
/// track cumulative size. Exceeding the limit may cause `IllegalAddress` runtime
137+
/// errors (CUDA error code: `700`).
138+
///
139+
/// The default is `false`, which places all statics in global memory. This avoids
140+
/// such errors but may reduce performance and use more global memory. When set to
141+
/// `false`, you can still annotate `static` variables with
142+
/// `#[cuda_std::address_space(constant)]` to place them in constant memory
143+
/// manually. This option only affects automatic placement.
144+
///
145+
/// Future versions may support smarter placement and user-controlled
146+
/// packing/spilling strategies.
147+
pub use_constant_memory_space: bool,
133148
/// Whether to generate any debug info and what level of info to generate.
134149
pub debug: DebugInfo,
135150
/// Additional arguments passed to cargo during `cargo build`.
@@ -155,6 +170,7 @@ impl CudaBuilder {
155170
emit: None,
156171
optix: false,
157172
override_libm: true,
173+
use_constant_memory_space: false,
158174
debug: DebugInfo::None,
159175
build_args: vec![],
160176
final_module_path: None,
@@ -284,6 +300,24 @@ impl CudaBuilder {
284300
self
285301
}
286302

303+
/// If `true`, the codegen will attempt to place `static` variables in CUDA's
304+
/// constant memory, which is fast but limited in size (~64KB total across all
305+
/// statics). The codegen avoids placing any single item too large, but it does not
306+
/// track cumulative size. Exceeding the limit may cause `IllegalAddress` runtime
307+
/// errors (CUDA error code: `700`).
308+
///
309+
/// If `false`, all statics are placed in global memory. This avoids such errors but
310+
/// may reduce performance and use more global memory. You can still annotate
311+
/// `static` variables with `#[cuda_std::address_space(constant)]` to place them in
312+
/// constant memory manually as this option only affects automatic placement.
313+
///
314+
/// Future versions may support smarter placement and user-controlled
315+
/// packing/spilling strategies.
316+
pub fn use_constant_memory_space(mut self, use_constant_memory_space: bool) -> Self {
317+
self.use_constant_memory_space = use_constant_memory_space;
318+
self
319+
}
320+
287321
/// An optional path where to dump LLVM IR of the final output the codegen will feed to libnvvm. Usually
288322
/// used for debugging.
289323
pub fn final_module_path(mut self, path: impl AsRef<Path>) -> Self {

crates/rustc_codegen_nvvm/src/builder.rs

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1154,18 +1154,7 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
11541154

11551155
impl<'ll> StaticBuilderMethods for Builder<'_, 'll, '_> {
11561156
fn get_static(&mut self, def_id: DefId) -> &'ll Value {
1157-
unsafe {
1158-
let mut g = self.cx.get_static(def_id);
1159-
let llty = self.val_ty(g);
1160-
let addrspace = AddressSpace(llvm::LLVMGetPointerAddressSpace(llty));
1161-
if addrspace != AddressSpace::DATA {
1162-
trace!("Remapping global address space of global {:?}", g);
1163-
let llty = llvm::LLVMGetElementType(llty);
1164-
let ty = self.type_ptr_to_ext(llty, AddressSpace::DATA);
1165-
g = llvm::LLVMBuildAddrSpaceCast(self.llbuilder, g, ty, unnamed());
1166-
}
1167-
g
1168-
}
1157+
self.cx.get_static(def_id)
11691158
}
11701159
}
11711160

crates/rustc_codegen_nvvm/src/context.rs

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ use rustc_errors::DiagMessage;
2222
use rustc_hash::FxHashMap;
2323
use rustc_middle::dep_graph::DepContext;
2424
use rustc_middle::ty::layout::{
25-
FnAbiError, FnAbiOf, FnAbiRequest, HasTyCtxt, HasTypingEnv, LayoutError,
25+
FnAbiError, FnAbiOf, FnAbiRequest, HasTyCtxt, HasTypingEnv, LayoutError, LayoutOf,
2626
};
2727
use rustc_middle::ty::layout::{FnAbiOfHelpers, LayoutOfHelpers};
2828
use rustc_middle::ty::{Ty, TypeVisitableExt};
@@ -40,6 +40,10 @@ use rustc_target::callconv::FnAbi;
4040
use rustc_target::spec::{HasTargetSpec, Target};
4141
use tracing::{debug, trace};
4242

43+
/// "There is a total of 64 KB constant memory on a device."
44+
/// <https://docs.nvidia.com/cuda/archive/12.8.1/pdf/CUDA_C_Best_Practices_Guide.pdf>
45+
const CONSTANT_MEMORY_SIZE_LIMIT_BYTES: u64 = 64 * 1024;
46+
4347
pub(crate) struct CodegenCx<'ll, 'tcx> {
4448
pub tcx: TyCtxt<'tcx>,
4549

@@ -267,7 +271,31 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
267271
}
268272

269273
if !is_mutable && self.type_is_freeze(ty) {
270-
AddressSpace(4)
274+
if !self.codegen_args.use_constant_memory_space {
275+
// We aren't using constant memory, so put the instance in global memory.
276+
AddressSpace(1)
277+
} else {
278+
// We are using constant memory, see if the instance will fit.
279+
//
280+
// FIXME(@LegNeato) ideally we keep track of what we have put into
281+
// constant memory and when it is filled up spill instead of only
282+
// spilling when a static is big. We'll probably want some packing
283+
// strategy controlled by the user...for example, if you have one large
284+
// static and many small ones, you might want the small ones to all be
285+
// in constant memory or just the big one depending on your workload.
286+
let layout = self.layout_of(ty);
287+
if layout.size.bytes() > CONSTANT_MEMORY_SIZE_LIMIT_BYTES {
288+
self.tcx.sess.dcx().warn(format!(
289+
"static `{}` exceeds the constant memory limit; placing in global memory (performance may be reduced)",
290+
instance
291+
));
292+
// Place instance in global memory if it is too big for constant memory.
293+
AddressSpace(1)
294+
} else {
295+
// Place instance in constant memory if it fits.
296+
AddressSpace(4)
297+
}
298+
}
271299
} else {
272300
AddressSpace::DATA
273301
}
@@ -519,6 +547,7 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
519547
pub struct CodegenArgs {
520548
pub nvvm_options: Vec<NvvmOption>,
521549
pub override_libm: bool,
550+
pub use_constant_memory_space: bool,
522551
pub final_module_path: Option<PathBuf>,
523552
}
524553

@@ -537,6 +566,8 @@ impl CodegenArgs {
537566
cg_args.nvvm_options.push(flag);
538567
} else if arg == "--override-libm" {
539568
cg_args.override_libm = true;
569+
} else if arg == "--use-constant-memory-space" {
570+
cg_args.use_constant_memory_space = true;
540571
} else if arg == "--final-module-path" {
541572
cg_args.final_module_path = Some(PathBuf::from(
542573
args.get(idx + 1).expect("No path for --final-module-path"),

0 commit comments

Comments
 (0)