-
Notifications
You must be signed in to change notification settings - Fork 9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use non-`const` libraries only on Musl systems with Julia v1.6
#31
base: master
Are you sure you want to change the base?
Conversation
172e024
to
415f381
Compare
The non-`const` library variable generates worse code because of the presence of a trampoline, but the non-`const` is really needed only for Musl-based systems.
Uhm, no, I don't think we actually test those features 🤔
In general, I'm leery of changing how things work by platform, if possible. Have we quantified how much of a performance impact this has? Our eventual goal is to replace the non-`const` […]
We didn't quantify a performance impact, but Valentin was worried about the generated LLVM code. Currently we have:
julia> code_llvm(SpecialFunctions._besselj, (Float64, ComplexF64, Int32); debuginfo=:none)
define void @julia__besselj_2508([2 x double]* noalias nocapture sret %0, double %1, [2 x double]* nocapture nonnull readonly align 8 dereferenceable(16) %2, i32 signext %3) {
top:
%4 = alloca {}*, align 8
%gcframe4 = alloca [4 x {}*], align 16
%gcframe4.sub = getelementptr inbounds [4 x {}*], [4 x {}*]* %gcframe4, i64 0, i64 0
%5 = bitcast [4 x {}*]* %gcframe4 to i8*
call void @llvm.memset.p0i8.i32(i8* nonnull align 16 dereferenceable(32) %5, i8 0, i32 32, i1 false)
%6 = alloca i64, align 16
%7 = bitcast i64* %6 to i8*
%8 = alloca i64, align 16
%9 = bitcast i64* %8 to i8*
%10 = alloca i32, align 8
%11 = bitcast i32* %10 to i8*
%12 = alloca i64, align 16
%13 = bitcast i64* %12 to i8*
%14 = alloca i64, align 16
%15 = bitcast i64* %14 to i8*
%16 = alloca i64, align 16
%17 = bitcast i64* %16 to i8*
%18 = alloca i32, align 8
%19 = bitcast i32* %18 to i8*
%20 = alloca i32, align 8
%21 = bitcast i32* %20 to i8*
%thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #9
%ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -32768
%22 = bitcast [4 x {}*]* %gcframe4 to i64*
store i64 8, i64* %22, align 16
%23 = getelementptr inbounds [4 x {}*], [4 x {}*]* %gcframe4, i64 0, i64 1
%24 = bitcast i8* %ptls_i8 to i64*
%25 = load i64, i64* %24, align 8
%26 = bitcast {}** %23 to i64*
store i64 %25, i64* %26, align 8
%27 = bitcast i8* %ptls_i8 to {}***
store {}** %gcframe4.sub, {}*** %27, align 8
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %7)
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %9)
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %11)
%28 = call noalias nonnull {}* @jl_gc_pool_alloc(i8* %ptls_i8, i32 1400, i32 16) #1
%29 = bitcast {}* %28 to i64*
%30 = getelementptr inbounds i64, i64* %29, i64 -1
store atomic i64 139940881979936, i64* %30 unordered, align 8
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %13)
%31 = bitcast [2 x double]* %2 to i64*
%32 = load i64, i64* %31, align 8
store i64 %32, i64* %12, align 16
%33 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 1
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %15)
%34 = bitcast double* %33 to i64*
%35 = load i64, i64* %34, align 8
store i64 %35, i64* %14, align 16
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %17)
%36 = bitcast i64* %16 to double*
store double %1, double* %36, align 16
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %19)
store i32 %3, i32* %18, align 8
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %21)
store i32 1, i32* %20, align 8
%37 = bitcast {}* %28 to i8*
%38 = load atomic void ()*, void ()** @libname_zbesj__2510 unordered, align 8
%.not = icmp eq void ()* %38, null
br i1 %.not, label %dlsym, label %ccall
L41: ; preds = %ccall, %ccall
%39 = load i64, i64* %6, align 16
%40 = load i64, i64* %8, align 16
%41 = bitcast [2 x double]* %0 to i64*
store i64 %39, i64* %41, align 8
%.sroa.2.0..sroa_idx1 = getelementptr inbounds [2 x double], [2 x double]* %0, i64 0, i64 1
%42 = bitcast double* %.sroa.2.0..sroa_idx1 to i64*
store i64 %40, i64* %42, align 8
%43 = load i64, i64* %26, align 8
store i64 %43, i64* %24, align 8
ret void
L45: ; preds = %ccall
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %7)
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %9)
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %11)
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %13)
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %15)
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %17)
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %19)
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %21)
store {}* %28, {}** %4, align 8
%44 = call nonnull {}* @jl_apply_generic({}* inttoptr (i64 139940936404736 to {}*), {}** nonnull %4, i32 1)
%45 = getelementptr inbounds [4 x {}*], [4 x {}*]* %gcframe4, i64 0, i64 2
store {}* %44, {}** %45, align 16
store {}* %44, {}** %4, align 8
%46 = call nonnull {}* @jl_apply_generic({}* inttoptr (i64 139938995061616 to {}*), {}** nonnull %4, i32 1)
call void @jl_throw({}* %46)
unreachable
dlsym: ; preds = %top
%47 = load atomic {}*, {}** inttoptr (i64 139938985903304 to {}**) unordered, align 8
%48 = getelementptr inbounds [4 x {}*], [4 x {}*]* %gcframe4, i64 0, i64 3
store {}* %28, {}** %48, align 8
%49 = getelementptr inbounds [4 x {}*], [4 x {}*]* %gcframe4, i64 0, i64 2
store {}* %47, {}** %49, align 16
%50 = call void ()* @jl_lazy_load_and_lookup({}* %47, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @_j_str1, i64 0, i64 0))
store atomic void ()* %50, void ()** @libname_zbesj__2510 release, align 8
br label %ccall
ccall: ; preds = %dlsym, %top
%51 = phi void ()* [ %38, %top ], [ %50, %dlsym ]
%52 = bitcast void ()* %51 to void (i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*)*
%53 = getelementptr inbounds [4 x {}*], [4 x {}*]* %gcframe4, i64 0, i64 3
store {}* %28, {}** %53, align 8
call void %52(i8* nonnull %13, i8* nonnull %15, i8* nonnull %17, i8* nonnull %19, i8* nonnull %21, i8* nonnull %7, i8* nonnull %9, i8* nonnull %11, i8* nonnull %37)
%54 = bitcast {}* %28 to i32*
%55 = load i32, i32* %54, align 4
switch i32 %55, label %L45 [
i32 0, label %L41
i32 3, label %L41
]
}
and in particular
julia> io = IOBuffer(); code_llvm(io, SpecialFunctions._besselj, (Float64, ComplexF64, Int32); debuginfo=:none)
julia> occursin("jl_lazy_load_and_lookup", String(take!(io)))
true
With this PR we have
julia> code_llvm(SpecialFunctions._besselj, (Float64, ComplexF64, Int32); debuginfo=:none)
define void @julia__besselj_343([2 x double]* noalias nocapture sret %0, double %1, [2 x double]* nocapture nonnull readonly align 8 dereferenceable(16) %2, i32 signext %3) {
top:
%4 = alloca {}*, align 8
%gcframe4 = alloca [3 x {}*], align 16
%gcframe4.sub = getelementptr inbounds [3 x {}*], [3 x {}*]* %gcframe4, i64 0, i64 0
%5 = bitcast [3 x {}*]* %gcframe4 to i8*
call void @llvm.memset.p0i8.i32(i8* nonnull align 16 dereferenceable(24) %5, i8 0, i32 24, i1 false)
%6 = alloca i64, align 16
%7 = bitcast i64* %6 to i8*
%8 = alloca i64, align 16
%9 = bitcast i64* %8 to i8*
%10 = alloca i32, align 8
%11 = bitcast i32* %10 to i8*
%12 = alloca i64, align 16
%13 = bitcast i64* %12 to i8*
%14 = alloca i64, align 16
%15 = bitcast i64* %14 to i8*
%16 = alloca i64, align 16
%17 = bitcast i64* %16 to i8*
%18 = alloca i32, align 8
%19 = bitcast i32* %18 to i8*
%20 = alloca i32, align 8
%21 = bitcast i32* %20 to i8*
%thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #9
%ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -32768
%22 = bitcast [3 x {}*]* %gcframe4 to i64*
store i64 4, i64* %22, align 16
%23 = getelementptr inbounds [3 x {}*], [3 x {}*]* %gcframe4, i64 0, i64 1
%24 = bitcast i8* %ptls_i8 to i64*
%25 = load i64, i64* %24, align 8
%26 = bitcast {}** %23 to i64*
store i64 %25, i64* %26, align 8
%27 = bitcast i8* %ptls_i8 to {}***
store {}** %gcframe4.sub, {}*** %27, align 8
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %7)
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %9)
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %11)
%28 = call noalias nonnull {}* @jl_gc_pool_alloc(i8* %ptls_i8, i32 1400, i32 16) #1
%29 = bitcast {}* %28 to i64*
%30 = getelementptr inbounds i64, i64* %29, i64 -1
store atomic i64 140483444341280, i64* %30 unordered, align 8
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %13)
%31 = bitcast [2 x double]* %2 to i64*
%32 = load i64, i64* %31, align 8
store i64 %32, i64* %12, align 16
%33 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 1
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %15)
%34 = bitcast double* %33 to i64*
%35 = load i64, i64* %34, align 8
store i64 %35, i64* %14, align 16
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %17)
%36 = bitcast i64* %16 to double*
store double %1, double* %36, align 16
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %19)
store i32 %3, i32* %18, align 8
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %21)
store i32 1, i32* %20, align 8
%37 = bitcast {}* %28 to i8*
%38 = getelementptr inbounds [3 x {}*], [3 x {}*]* %gcframe4, i64 0, i64 2
store {}* %28, {}** %38, align 16
call void inttoptr (i64 140482025756496 to void (i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*)*)(i8* nonnull %13, i8* nonnull %15, i8* nonnull %17, i8* nonnull %19, i8* nonnull %21, i8* nonnull %7, i8* nonnull %9, i8* nonnull %11, i8* nonnull %37)
%39 = bitcast {}* %28 to i32*
%40 = load i32, i32* %39, align 4
switch i32 %40, label %L45 [
i32 0, label %L41
i32 3, label %L41
]
L41: ; preds = %top, %top
%41 = load i64, i64* %6, align 16
%42 = load i64, i64* %8, align 16
%43 = bitcast [2 x double]* %0 to i64*
store i64 %41, i64* %43, align 8
%.sroa.2.0..sroa_idx1 = getelementptr inbounds [2 x double], [2 x double]* %0, i64 0, i64 1
%44 = bitcast double* %.sroa.2.0..sroa_idx1 to i64*
store i64 %42, i64* %44, align 8
%45 = load i64, i64* %26, align 8
store i64 %45, i64* %24, align 8
ret void
L45: ; preds = %top
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %7)
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %9)
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %11)
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %13)
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %15)
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %17)
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %19)
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %21)
store {}* %28, {}** %4, align 8
%46 = call nonnull {}* @jl_apply_generic({}* inttoptr (i64 140483498766080 to {}*), {}** nonnull %4, i32 1)
store {}* %46, {}** %38, align 16
store {}* %46, {}** %4, align 8
%47 = call nonnull {}* @jl_apply_generic({}* inttoptr (i64 140483381579824 to {}*), {}** nonnull %4, i32 1)
call void @jl_throw({}* %47)
unreachable
}
and in particular
julia> io = IOBuffer(); code_llvm(io, SpecialFunctions._besselj, (Float64, ComplexF64, Int32); debuginfo=:none)
julia> occursin("jl_lazy_load_and_lookup", String(take!(io)))
false
The non-`const` library variable generates worse code because of the presence of a trampoline, but the non-`const` is really needed only for Musl-based systems.

CC: @vchuravy