Skip to content

Commit

Permalink
fix byteswap for VLEN>256, and add initial CI
Browse files Browse the repository at this point in the history
  • Loading branch information
camel-cdr committed Jul 5, 2024
1 parent e433877 commit 3595610
Show file tree
Hide file tree
Showing 12 changed files with 111 additions and 36 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/bench-config.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#define HAS_E64 (__riscv_v_elen >= 64)
#define HAS_F16 1
#define MAX_MEM (4096*8)
#define NEXT(c) (c + c/3 + 3)
#define VALIDATE 1
#define MIN_REPEATS 2
#define MAX_REPEATS 2

#define STOP_CYCLES (1024*1024*500)
#define SCALE_mandelbrot(N) ((N)/10)
#define SCALE_mergelines(N) ((N)/10)
#define mandelbrot_ITER 100
3 changes: 3 additions & 0 deletions .github/workflows/rv32-config.mk
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
WARN=-Wall -Wextra -Wno-unused-function -Wno-unused-parameter
CC=clang-17
CFLAGS=--target=riscv32 -march=rv32gc_zve32f_zfh_zba_zbb_zbs -O3 ${WARN} -nostdlib -fno-builtin -nodefaultlibs -ffreestanding
9 changes: 9 additions & 0 deletions .github/workflows/rv32-run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/sh

qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,x-zvfh=true $@ && \
qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=256,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,x-zvfh=true $@ && \
qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=512,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,x-zvfh=true $@ && \
qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=1024,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,x-zvfh=true $@ &&\
qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=off,rvv_ma_all_1s=off,zfh=true,x-zvfh=true $@ && \
qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=1024,rvv_ta_all_1s=off,rvv_ma_all_1s=off,zfh=true,x-zvfh=true $@

3 changes: 3 additions & 0 deletions .github/workflows/rv64-config.mk
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
WARN=-Wall -Wextra -Wno-unused-function -Wno-unused-parameter
CC=clang-17
CFLAGS=--target=riscv64 -march=rv64gcv_zfh_zba_zbb_zbs -O3 ${WARN} -nostdlib -fno-builtin -nodefaultlibs -ffreestanding
8 changes: 8 additions & 0 deletions .github/workflows/rv64-run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/sh

qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,x-zvfh=true $@ && \
qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=256,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,x-zvfh=true $@ && \
qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=512,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,x-zvfh=true $@ && \
qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=1024,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,x-zvfh=true $@ && \
qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=off,rvv_ma_all_1s=off,zfh=true,x-zvfh=true $@ && \
qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=1024,rvv_ta_all_1s=off,rvv_ma_all_1s=off,zfh=true,x-zvfh=true $@
32 changes: 32 additions & 0 deletions .github/workflows/validate-bench.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
name: Validate bench

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
Tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Install packages
run: |
sudo sed 's/jammy/devel/g' -i /etc/apt/sources.list
sudo apt-get update -q -y
sudo apt-get install -y make qemu-user-static gcc-riscv64-linux-gnu clang-17
- name: Validate RV64
run: |
cp .github/workflows/rv64-config.mk ./config.mk
cp .github/workflows/rv64-run.sh ./run.sh
cp .github/workflows/bench-config.h ./bench/config.h
make -C bench run -j$(nproc)
make -C bench clean
- name: Validate RV32
run: |
cp .github/workflows/rv32-config.mk ./config.mk
cp .github/workflows/rv32-run.sh ./run.sh
cp .github/workflows/bench-config.h ./bench/config.h
make -C bench run -j$(nproc)
make -C bench clean
2 changes: 1 addition & 1 deletion bench/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ clean:
rm -f ${EXECS}

run: all
for i in ${EXECS}; do ../run.sh ./$$i; done
for i in ${EXECS}; do ../run.sh ./$$i || { printf "\n\n\033[0;31mFAILED\033[0m\n\n"; exit 1; } ; done

2 changes: 1 addition & 1 deletion bench/bench.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ bench_run(Bench *benches, size_t nBenches)
for (size_t n = 1; n < N; n = BENCH_NEXT(n)) {
ux si = 0, s0 = 0;

#if MAX_REPEATS > 4
#if VALIDATE
if (i != b->impls) {
URand seed = randState;
(void)b->func(i->func, n);
Expand Down
49 changes: 22 additions & 27 deletions bench/byteswap.S
Original file line number Diff line number Diff line change
@@ -1,15 +1,10 @@
/*
* TODO: This currently only works for VLEN<=256.
* I think rvv 1.0 should only vrgatherei16.vv here in the future.
*/

#ifdef MX

#if HAS_RVV_1_0
#if MX_N == 4 || MX_N == 2 || MX_N == 1

# a0 = ptr, a1 = len
.global MX(byteswap32_rvv_gather_)
MX(byteswap32_rvv_gather_):
vsetvli t0, x0, e8, MX(), ta, ma
.global MX(byteswap32_rvv_gatherei16_)
MX(byteswap32_rvv_gatherei16_):
vsetvli t0, x0, e16, MX2(), ta, ma
vid.v v0
vand.vi v8, v0, 3
vrsub.vi v8, v8, 3
Expand All @@ -21,7 +16,7 @@ MX(byteswap32_rvv_gather_):
vle32.v v8, (a0)
slli t1, t0, 2
vsetvli x0, t1, e8, MX(), ta, ma
vrgather.vv v16, v8, v0
vrgatherei16.vv v16, v8, v0
vsetvli x0, t0, e32, MX(), ta, ma
vse32.v v16, (a0)
sub a1, a1, t0
Expand All @@ -32,10 +27,10 @@ MX(byteswap32_rvv_gather_):

#if MX_N == 2

.macro byteswap32_rvv_m1_gathers n
.global byteswap32_rvv_m1_gathers_m\n
byteswap32_rvv_m1_gathers_m\n:
vsetvli t0, x0, e8, m1, ta, ma
.macro byteswap32_rvv_m1_gatherei16s n
.global byteswap32_rvv_m1_gatherei16s_m\n
byteswap32_rvv_m1_gatherei16s_m\n:
vsetvli t0, x0, e16, MX(), ta, ma
vid.v v0
vand.vi v8, v0, 3
vrsub.vi v8, v8, 3
Expand All @@ -46,17 +41,17 @@ MX(byteswap32_rvv_gather_):
vsetvli t0, a1, e32, m\n, ta, ma
vle32.v v8, (a0)
vsetvli t1, x0, e8, m1, ta, ma
vrgather.vv v16, v8, v0
vrgatherei16.vv v16, v8, v0
.ifge \n-2
vrgather.vv v17, v9, v0
vrgatherei16.vv v17, v9, v0
.ifge \n-4
vrgather.vv v18, v10, v0
vrgather.vv v19, v11, v0
vrgatherei16.vv v18, v10, v0
vrgatherei16.vv v19, v11, v0
.ifge \n-8
vrgather.vv v20, v12, v0
vrgather.vv v21, v13, v0
vrgather.vv v22, v14, v0
vrgather.vv v23, v15, v0
vrgatherei16.vv v20, v12, v0
vrgatherei16.vv v21, v13, v0
vrgatherei16.vv v22, v14, v0
vrgatherei16.vv v23, v15, v0
.endif
.endif
.endif
Expand All @@ -69,13 +64,13 @@ MX(byteswap32_rvv_gather_):
ret
.endm

byteswap32_rvv_m1_gathers 2
byteswap32_rvv_m1_gatherei16s 2
#endif
#if MX_N == 4
byteswap32_rvv_m1_gathers 4
byteswap32_rvv_m1_gatherei16s 4
#endif
#if MX_N == 8
byteswap32_rvv_m1_gathers 8
byteswap32_rvv_m1_gatherei16s 8
#endif


#endif
18 changes: 14 additions & 4 deletions bench/byteswap.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,24 @@ byteswap32_SWAR_rev8(uint32_t *ptr, size_t n)
#endif


/* we don't support these on XTheadVector */
#ifndef __riscv_vector
#define IMPLS_RVV(f)
#else
#define IMPLS_RVV(f) \
f(rvv_gatherei16_m1) \
f(rvv_gatherei16_m2) \
f(rvv_gatherei16_m4) \
f(rvv_m1_gatherei16s_m2) \
f(rvv_m1_gatherei16s_m4) \
f(rvv_m1_gatherei16s_m8)
#endif

#define IMPLS(f) \
f(scalar) \
f(scalar_autovec) \
REV8(f) \
MX(f, rvv_gather) \
f(rvv_m1_gathers_m2) \
f(rvv_m1_gathers_m4) \
f(rvv_m1_gathers_m8) \
IMPLS_RVV(f)

typedef void Func(uint32_t *ptr, size_t n);

Expand Down
3 changes: 3 additions & 0 deletions bench/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
/* stop repeats early afer this many cycles have elapsed */
#define STOP_CYCLES (1024*1024*500)

/* validate against reference implementation on the first repetition */
#define VALIDATE 1

/* custom scaling factors for benchmarks, these are used to make sure each
* benchmark approximately takes the same amount of time. */
Expand All @@ -24,3 +26,4 @@

/* benchmark specific configurations */
#define mandelbrot_ITER 100

6 changes: 3 additions & 3 deletions nolibc.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,15 @@ static void print_flush(void);
#define EXIT_SUCCESS 0

/* customize me */
static void
exit(int x) { __asm volatile("unimp\n"); }

static void
memwrite(void const *ptr, size_t len) { }

// static size_t /* only needed for vector-utf/bench.c */
// memread(void *ptr, size_t len) { }

static void
exit(int x) { __asm volatile("unimp\n"); }

int main(void);

void _start(void) {
Expand Down

0 comments on commit 3595610

Please sign in to comment.