diff --git a/.gitmodules b/.gitmodules index 9addb59..e69de29 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "bench/rvv-chacha-poly"] - path = bench/rvv-chacha-poly - url = https://github.com/edre/rvv-chacha-poly diff --git a/bench/chacha20.S b/bench/chacha20.S index 1e84dca..038b954 100644 --- a/bench/chacha20.S +++ b/bench/chacha20.S @@ -1,5 +1,5 @@ #ifndef MX #if __riscv_xlen != 32 -#include "rvv-chacha-poly/vchacha.s" +#include "../thirdparty/rvv-chacha-poly/vchacha.s" #endif #endif diff --git a/bench/chacha20.c b/bench/chacha20.c index 00ccf6e..ea5cb3c 100644 --- a/bench/chacha20.c +++ b/bench/chacha20.c @@ -1,6 +1,6 @@ #include "bench.h" #if __riscv_xlen != 32 -#include "../thirdparty/boring.h" +#include "../thirdparty/rvv-chacha-poly/boring.h" uint8_t *dest, *src; uint8_t key[32], nonce[12]; @@ -52,7 +52,7 @@ Bench benches[] = { }; BENCH_MAIN(benches) -#include "../thirdparty/boring.c" +#include "../thirdparty/rvv-chacha-poly/boring.c" #else void init(void) {} Impl impls[] = {}; diff --git a/bench/poly1305.S b/bench/poly1305.S index 8658971..3e8ac2d 100644 --- a/bench/poly1305.S +++ b/bench/poly1305.S @@ -1,5 +1,5 @@ #ifndef MX #if __riscv_xlen != 32 -#include "rvv-chacha-poly/vpoly.s" +#include "../thirdparty/rvv-chacha-poly/vpoly.s" #endif #endif diff --git a/bench/poly1305.c b/bench/poly1305.c index ef8fb1c..4a56c58 100644 --- a/bench/poly1305.c +++ b/bench/poly1305.c @@ -1,6 +1,6 @@ #include "bench.h" #if __riscv_xlen != 32 -#include "../thirdparty/boring.h" +#include "../thirdparty/rvv-chacha-poly/boring.h" uint8_t *src; uint8_t key[32], sig[16]; @@ -55,7 +55,7 @@ Bench benches[] = { }; BENCH_MAIN(benches) -#include "../thirdparty/boring.c" +#include "../thirdparty/rvv-chacha-poly/boring.c" #else void init(void) {} Impl impls[] = {}; diff --git a/bench/rvv-chacha-poly b/bench/rvv-chacha-poly deleted file mode 160000 index 7cffc88..0000000 --- a/bench/rvv-chacha-poly +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7cffc882d35f36a355e83b35d9815f86e0a5598d diff --git a/thirdparty/rvv-chacha-poly/CONTRIBUTING.md b/thirdparty/rvv-chacha-poly/CONTRIBUTING.md new file mode 100644 index 0000000..a121ba3 --- /dev/null +++ b/thirdparty/rvv-chacha-poly/CONTRIBUTING.md @@ -0,0 +1,20 @@ +# How to Contribute + +I can accept your patches and contributions to this project with the +following caveats from my employer: + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution; +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to to see +your current agreements on file or to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Community Guidelines + +Treat people with respect. diff --git a/thirdparty/rvv-chacha-poly/LICENSE b/thirdparty/rvv-chacha-poly/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/thirdparty/rvv-chacha-poly/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/thirdparty/rvv-chacha-poly/README.md b/thirdparty/rvv-chacha-poly/README.md new file mode 100644 index 0000000..d04a90e --- /dev/null +++ b/thirdparty/rvv-chacha-poly/README.md @@ -0,0 +1,15 @@ +NOTE: code from https://github.com/edre/rvv-chacha-poly + +RISC-V vector extension implementation of chacha20 and poly1305 +cryptographic primitives. + +Chacha20 and poly1305 are simple to vectorize without specialized +instructions. This project implements them in assembly, and verifies them +against the BoringSSL C implementation. As expected the executed instruction +count go down a lot, but I don't have real hardware to see if the runtime does +too. + +This is not an officially supported Google product. + +This is a proof of concept crypto library. Those words should sound very scary +together. Don't use this. diff --git a/thirdparty/boring.c b/thirdparty/rvv-chacha-poly/boring.c similarity index 100% rename from thirdparty/boring.c rename to thirdparty/rvv-chacha-poly/boring.c diff --git a/thirdparty/boring.h b/thirdparty/rvv-chacha-poly/boring.h similarity index 100% rename from thirdparty/boring.h rename to thirdparty/rvv-chacha-poly/boring.h diff --git a/thirdparty/rvv-chacha-poly/main.c b/thirdparty/rvv-chacha-poly/main.c new file mode 100644 index 0000000..265a9b8 --- /dev/null +++ b/thirdparty/rvv-chacha-poly/main.c @@ -0,0 +1,195 @@ +/* Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License") ; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#include +#include +#include +#include +#include "boring.h" + +void println_hex(uint8_t* data, int size) { + while (size > 0) { + printf("%02x", *data); + data++; + size--; + } + printf("\n"); +} + +// TODO: test the vector doesn't write past the end +// test function with multiple length inputs (optional printing) +// test non-block sized lengths + +extern uint64_t instruction_counter(); + +const char* pass_str = "\x1b[32mPASS\x1b[0m"; +const char* fail_str = "\x1b[31mFAIL\x1b[0m"; + +bool test_chacha(const uint8_t* data, size_t len, const uint8_t key[32], const uint8_t nonce[12], bool verbose) { + extern void vector_chacha20(uint8_t *out, const uint8_t *in, + size_t in_len, const uint8_t key[32], + const uint8_t nonce[12], uint32_t counter); + uint8_t* golden = malloc(len); + memset(golden, 0, len); + uint64_t start = instruction_counter(); + boring_chacha20(golden, data, len, key, nonce, 0); + uint64_t end = instruction_counter(); + uint64_t boring_count = end - start; + + uint8_t* vector = malloc(len + 4); + memset(vector, 0, len+4); + start = instruction_counter(); + vector_chacha20(vector, data, len, key, nonce, 0); + end = instruction_counter(); + + bool pass = memcmp(golden, vector, len) == 0; + + if (verbose || !pass) { + printf("golden: "); + println_hex(golden, 32); + printf("inst_count=%d, inst/byte=%.02f\n", boring_count, (float)(boring_count)/len); + printf("vector: "); + println_hex(vector, 32); + printf("inst_count=%d, inst/byte=%.02f\n", end - start, (float)(end - start)/len); + } + + uint32_t past_end = vector[len]; + if (past_end != 0) { + printf("vector wrote past end %08x\n", past_end); + pass = false; + } + + free(golden); + free(vector); + + return pass; +} + +void test_chachas(FILE* f) { + int len = 1024 - 11; + uint8_t* data = malloc(len); + uint32_t rand = 1; + for (int i = 0; i < len; i++) { + rand *= 101; + rand %= 16777213; // random prime + data[i] = (uint8_t)(rand); + } + uint8_t key[32] = "Setec astronomy;too many secrets"; + uint8_t nonce[12] = "BurnAfterUse"; + int counter = 0; + + bool pass = test_chacha(data, len, key, nonce, true); + + if (pass) { + for (int i = 1, len = 1; len < 1000; len += i++) { + fread(key, 32, 1, f); + fread(nonce, 12, 1, f); + if (!test_chacha(data, len, key, nonce, false)) { + printf("Failed with len=%d\n", len); + pass = false; + break; + } + } + } + + if (pass) { + printf("chacha %s\n", pass_str); + } else { + printf("chacha %s\n", fail_str); + } +} + +bool test_poly(const uint8_t* data, size_t len, const uint8_t key[32], bool verbose) { + extern uint64_t vector_poly1305(const uint8_t* in, size_t len, + const uint8_t key[32], uint8_t sig[16]); + + poly1305_state state; + uint8_t *sig = malloc(16); // gets corrupted if I define it on the stack? 
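+  /* Run the BoringSSL reference under the retired-instruction counter, then
+     the vector implementation, and compare the resulting 16-byte MACs. */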
+ uint64_t start = instruction_counter(); + boring_poly1305_init(&state, key); + boring_poly1305_update(&state, data, len); + boring_poly1305_finish(&state, sig); + uint64_t end = instruction_counter(); + uint64_t boring_count = end - start; + + uint8_t *sig2 = malloc(16); + start = instruction_counter(); + uint64_t mid = vector_poly1305(data, len, key, sig2); + end = instruction_counter(); + + bool pass = memcmp(sig, sig2, 16) == 0; + + if (verbose || !pass) { + printf("boring mac: "); + println_hex(sig, 16); + printf("inst_count=%d, inst/byte=%.02f\n", boring_count, (float)(boring_count)/len); + printf("vector mac: "); + println_hex(sig2, 16); + printf("precomputation=%d, processing=%d, inst/byte=%.02f\n", + mid - start, end - mid, (float)(end - mid)/len); + } + + free(sig); + free(sig2); + return pass; +} + +void test_polys(FILE* f) { + const int big_len = 1024; + uint8_t *zero = malloc(2000); + uint8_t *max_bits = malloc(big_len); + memset(max_bits, 0xff, big_len); + const uint8_t one[32] = {1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + const uint8_t key[32] = {1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 255, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + const uint8_t data[272] = "Setec astronomy;too many secrets"; + bool pass = test_poly(max_bits, big_len, max_bits, true); + + if (!pass) + goto end; + + // random test + const int max_len = 1000; + uint8_t *rand = malloc(max_len); + for (int len = 16; len <= max_len; len += 16) { + fread((uint8_t*)key, 32, 1, f); + fread((uint8_t*)rand, len, 1, f); + if (!test_poly(data, len, key, false)) { + printf("failed random input len=%d\n", len); + pass = false; + break; + } + } + free(rand); + + end: + if (pass) { + printf("poly %s\n", pass_str); + } else { + printf("poly %s\n", fail_str); + } + + free(zero); + free(max_bits); +} + +int main(int argc, uint8_t *argv[]) { + extern uint32_t vlmax_u32(); + printf("VLMAX in blocks: %d\n", vlmax_u32()); + FILE* rand = fopen("/dev/urandom", "r"); + test_chachas(rand); + printf("\n"); + test_polys(rand); + fclose(rand); +} diff --git a/thirdparty/rvv-chacha-poly/test.sh b/thirdparty/rvv-chacha-poly/test.sh new file mode 100755 index 0000000..6151fff --- /dev/null +++ b/thirdparty/rvv-chacha-poly/test.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License") ; +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Dependencies to be installed and on the PATH: +# https://github.com/riscv/riscv-gnu-toolchain +# https://github.com/riscv/riscv-isa-sim +# configure --prefix=$RISCV --with-varch=v512:e64 +# https://github.com/riscv/riscv-pk + +ISA=rv64gcv + +riscv64-unknown-elf-gcc -march=$ISA main.c boring.c vchacha.s vpoly.s -o main -O && + spike --isa=$ISA `which pk` main diff --git a/thirdparty/rvv-chacha-poly/vchacha.s b/thirdparty/rvv-chacha-poly/vchacha.s new file mode 100644 index 0000000..e09696d --- /dev/null +++ b/thirdparty/rvv-chacha-poly/vchacha.s @@ -0,0 +1,267 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License") ; +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +.global instruction_counter +.global vector_chacha20 +.global vlmax_u32 + +instruction_counter: + rdinstret a0 + ret + +vlmax_u32: + vsetvli a0, x0, e32, m1, ta, ma + ret + + +# Cell-based implementation strategy: +# v0-v15: Cell vectors. Each element is from a different block + +## Function initialization +# Using the same order as the boring chacha arguments: +# a0 = uint8_t *out +# a1 = uint8_t *in +# a2 = size_t in_len +# a3 = uint8_t key[32] +# a4 = uint8_t nonce[12] +# a5 = uint32_t counter +vector_chacha20: + # a2 = initial length in bytes + # t3 = remaining 64-byte blocks to mix + # t4 = remaining full blocks to read/write + # (if t3 and t4 are different by one, there is a partial block to manually xor) + # t1 = vl in 64-byte blocks + srli t4, a2, 6 + addi t0, a2, 63 + srli t3, t0, 6 +encrypt_blocks: + # initialize vector state + vsetvli t1, t3, e32, m1, ta, ma + # Load 128 bit constant + li t0, 0x61707865 # "expa" little endian + vmv.v.x v0, t0 + li t0, 0x3320646e # "nd 3" little endian + vmv.v.x v1, t0 + li t0, 0x79622d32 # "2-by" little endian + vmv.v.x v2, t0 + li t0, 0x6b206574 # "te k" little endian + vmv.v.x v3, t0 + # Load key + lw t0, 0(a3) + vmv.v.x v4, t0 + lw t0, 4(a3) + vmv.v.x v5, t0 + lw t0, 8(a3) + vmv.v.x v6, t0 + lw t0, 12(a3) + vmv.v.x v7, t0 + lw t0, 16(a3) + vmv.v.x v8, t0 + lw t0, 20(a3) + vmv.v.x v9, t0 + lw t0, 24(a3) + vmv.v.x v10, t0 + lw t0, 28(a3) + vmv.v.x v11, t0 + # Load counter, and increment for each element + vid.v v12 + vadd.vx v12, v12, a5 + # Load nonce + lw t0, 0(a4) + vmv.v.x v13, t0 + lw t0, 4(a4) + vmv.v.x v14, t0 + lw t0, 8(a4) + vmv.v.x v15, t0 + + li t2, 10 # loop counter +round_loop: + + .macro vrotl a i r +#if __riscv_zvbb + vror.vi \a, \a, 32-\i +#else + vsll.vi v16, \a, \i + vsrl.vi \a, \a, 32-\i + vor.vv \a, \a, v16 +#endif + .endm + + .macro quarterround a b c d + # a += b; d ^= a; d <<<= 16; + vadd.vv \a, \a, \b + vxor.vv \d, \d, \a + vrotl \d, 16, t6 + # c += d; b ^= c; b <<<= 12; + vadd.vv \c, \c, \d + vxor.vv \b, \b, \c + vrotl \b, 12, t7 + # a += b; d ^= a; d <<<= 8; + vadd.vv \a, \a, \b + vxor.vv \d, \d, \a + vrotl \d, 8, t8 + # c += d; b ^= c; b <<<= 7; + vadd.vv \c, \c, \d + vxor.vv \b, \b, \c + vrotl \b, 7, t9 + .endm + + # Mix columns. + quarterround v0, v4, v8, v12 + quarterround v1, v5, v9, v13 + quarterround v2, v6, v10, v14 + quarterround v3, v7, v11, v15 + # Mix diagonals. 
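+  # The diagonal pass pairs cell 0 with cells 5, 10 and 15, cell 1 with 6, 11
+  # and 12, and so on, exactly as in the scalar ChaCha20 double round; each
+  # vector element is still an independent 64-byte block.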
+ quarterround v0, v5, v10, v15 + quarterround v1, v6, v11, v12 + quarterround v2, v7, v8, v13 + quarterround v3, v4, v9, v14 + + addi t2, t2, -1 + bnez t2, round_loop + + # Add in initial block values. + # 128 bit constant + li t0, 0x61707865 # "expa" little endian + vadd.vx v0, v0, t0 + li t0, 0x3320646e # "nd 3" little endian + vadd.vx v1, v1, t0 + li t0, 0x79622d32 # "2-by" little endian + vadd.vx v2, v2, t0 + li t0, 0x6b206574 # "te k" little endian + vadd.vx v3, v3, t0 + # Add key + lw t0, 0(a3) + vadd.vx v4, v4, t0 + lw t0, 4(a3) + vadd.vx v5, v5, t0 + lw t0, 8(a3) + vadd.vx v6, v6, t0 + lw t0, 12(a3) + vadd.vx v7, v7, t0 + lw t0, 16(a3) + vadd.vx v8, v8, t0 + lw t0, 20(a3) + vadd.vx v9, v9, t0 + lw t0, 24(a3) + vadd.vx v10, v10, t0 + lw t0, 28(a3) + vadd.vx v11, v11, t0 + # Add counter + vid.v v16 + vadd.vv v12, v12, v16 + vadd.vx v12, v12, a5 + # Load nonce + lw t0, 0(a4) + vadd.vx v13, v13, t0 + lw t0, 4(a4) + vadd.vx v14, v14, t0 + lw t0, 8(a4) + vadd.vx v15, v15, t0 + + # load in vector lanes with two strided segment loads + # in case this is the final block, reset vl to full blocks + vsetvli t5, t4, e32, m1, ta, ma + li t0, 64 + vlsseg8e32.v v16, (a1), t0 + add a1, a1, 32 + vlsseg8e32.v v24, (a1), t0 + add a1, a1, -32 + + # xor in state + vxor.vv v16, v16, v0 + vxor.vv v17, v17, v1 + vxor.vv v18, v18, v2 + vxor.vv v19, v19, v3 + vxor.vv v20, v20, v4 + vxor.vv v21, v21, v5 + vxor.vv v22, v22, v6 + vxor.vv v23, v23, v7 + vxor.vv v24, v24, v8 + vxor.vv v25, v25, v9 + vxor.vv v26, v26, v10 + vxor.vv v27, v27, v11 + vxor.vv v28, v28, v12 + vxor.vv v29, v29, v13 + vxor.vv v30, v30, v14 + vxor.vv v31, v31, v15 + + # write back out with 2 strided segment stores + vssseg8e32.v v16, (a0), t0 + add a0, a0, 32 + vssseg8e32.v v24, (a0), t0 + add a0, a0, -32 + + # update counters/pointers + slli t0, t5, 6 # current VL in bytes + add a0, a0, t0 # advance output pointer + add a1, a1, t0 # advance input pointer + sub a2, a2, t0 # decrement remaining bytes + sub t3, t3, t1 # decrement remaining blocks + sub t4, t4, t1 # decrement remaining blocks + # TODO: crash if counter overflows + add a5, a5, t1 # increment counter + + # loop again if we have remaining blocks + bne x0, t3, encrypt_blocks + + # we're done if there are no more remaining bytes from a partial block + beq zero, a2, return + + # to get the remaining partial block, we transfer the nth element of + # all the state vectors into contiguous stack memory with vsseg, then + # read them with byte-granularity vl + + # reconstruct vl for all computed blocks + add t0, t3, t1 + vsetvli t0, t0, e32, m1, ta, ma + add t0, t0, -1 + + #vse.v v4, (a0) + #j return + + # use a masked vsseg instead of sliding everything down? + # both options seem like they might touch a lot of vector state... 
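+  # Slide the last computed block's word down to element 0 of v16-v31, store
+  # those single elements contiguously to the stack with vl=1 segment stores,
+  # then XOR the partial block against the input using byte-granularity vle8/vse8.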
+ vslidedown.vx v16, v0, t0 + vslidedown.vx v17, v1, t0 + vslidedown.vx v18, v2, t0 + vslidedown.vx v19, v3, t0 + vslidedown.vx v20, v4, t0 + vslidedown.vx v21, v5, t0 + vslidedown.vx v22, v6, t0 + vslidedown.vx v23, v7, t0 + vslidedown.vx v24, v8, t0 + vslidedown.vx v25, v9, t0 + vslidedown.vx v26, v10, t0 + vslidedown.vx v27, v11, t0 + vslidedown.vx v28, v12, t0 + vslidedown.vx v29, v13, t0 + vslidedown.vx v30, v14, t0 + vslidedown.vx v31, v15, t0 + li t0, 1 + vsetvli zero, t0, e32, m1, ta, ma + addi t0, sp, -64 + addi t1, sp, -32 + vsseg8e32.v v16, (t0) + vsseg8e32.v v24, (t1) + + vsetvli a2, a2, e8, m8, ta, ma + vle8.v v0, (a1) + vle8.v v8, (t0) + vxor.vv v0, v0, v8 + vse8.v v0, (a0) + + +return: + ret diff --git a/thirdparty/rvv-chacha-poly/vpoly.s b/thirdparty/rvv-chacha-poly/vpoly.s new file mode 100644 index 0000000..a446b34 --- /dev/null +++ b/thirdparty/rvv-chacha-poly/vpoly.s @@ -0,0 +1,497 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License") ; +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +.global vector_poly1305 +# poly1305 +# Based on the obvious SIMD algorithm, described as Goll-Gueron here: +# https://eprint.iacr.org/2019/842.pdf +# Assumes VLEN is a power of 2, and that intermediate vsetvl will always return the max. +# Hash is defined simply, for 32-byte key split between 16-byte s and r: +# s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r mod 2¹³⁰ - 5 +# Performant implementations represent 130 bit numbers as 5 26-bit numbers. +# Precomputation step: +# Compute vector [r, r², r³, r⁴, ...] ( 5 32-bit vectors) +# Compute scalar r^VLMAX (5 32-bit registers) +# This can be done in 2*log2(VLMAX) multiplications: +# i = 1; m = r; v = r +# while i < VLMAX: +# v *= m (masking out the last i elements) +# m *= m +# i <<= 1 +# Vector loop: +# load segment (from the end) into 4 32-bit vectors +# spread into standard 5 32-bit vector format +# vector multiply into polynomial vector +# vector add into sum so far +# vector-scalar multiply polynomial vector with r^VLMAX +# Extract: +# vector sum reduce polynomial vector into scalar +# add to s +# extract 16-byte hash + +# TODO: implement this with C intrinsics in rvv_vector.h, as register +# allocation is actually annoying. + +# Generic 130-bit multiply/mod code +# Reads 5-limbed inputs from a and b, writes result to a +# Uses 5 e64,m2 d registers for accumulation +.macro vec_mul130 x a0 a1 a2 a3 a4 b0 b1 b2 b3 b4 b51 b52 b53 b54 d0 d1 d2 d3 d4 carry tmp v mask="" + # Helpful diagram from http://loup-vaillant.fr/tutorials/poly1305-design + # a4 a3 a2 a1 a0 + # × b4 b3 b2 b1 b0 + # --------------------------------------- + # a4×b0 a3×b0 a2×b0 a1×b0 a0×b0 + # + a3×b1 a2×b1 a1×b1 a0×b1 5×a4×b1 + # + a2×b2 a1×b2 a0×b2 5×a4×b2 5×a3×b2 + # + a1×b3 a0×b3 5×a4×b3 5×a3×b3 5×a2×b3 + # + a0×b4 5×a4×b4 5×a3×b4 5×a2×b4 5×a1×b4 + # --------------------------------------- + # d4 d3 d2 d1 d0 + + # would it be more/less performant to do this by rows instead of columns? 
+ # vectors pipelining without requiring stalls etc + # d0 column + vwmulu.\v \d0, \a0, \b0 \mask + vwmaccu.\v \d0, \b51, \a4 \mask + vwmaccu.\v \d0, \b52, \a3 \mask + vwmaccu.\v \d0, \b53, \a2 \mask + vwmaccu.\v \d0, \b54, \a1 \mask + + # d1 column + vwmulu.\v \d1, \a1, \b0 \mask + vwmaccu.\v \d1, \b1, \a0 \mask + vwmaccu.\v \d1, \b52, \a4 \mask + vwmaccu.\v \d1, \b53, \a3 \mask + vwmaccu.\v \d1, \b54, \a2 \mask + + # d2 column + vwmulu.\v \d2, \a2, \b0 \mask + vwmaccu.\v \d2, \b1, \a1 \mask + vwmaccu.\v \d2, \b2, \a0 \mask + vwmaccu.\v \d2, \b53, \a4 \mask + vwmaccu.\v \d2, \b54, \a3 \mask + + # d3 column + vwmulu.\v \d3, \a3, \b0 \mask + vwmaccu.\v \d3, \b1, \a2 \mask + vwmaccu.\v \d3, \b2, \a1 \mask + vwmaccu.\v \d3, \b3, \a0 \mask + vwmaccu.\v \d3, \b54, \a4 \mask + + # d4 column + vwmulu.\v \d4, \a4, \b0 \mask + vwmaccu.\v \d4, \b1, \a3 \mask + vwmaccu.\v \d4, \b2, \a2 \mask + vwmaccu.\v \d4, \b3, \a1 \mask + vwmaccu.\v \d4, \b4, \a0 \mask + + # Carry propagation + # logic copied from https://github.com/floodyberry/poly1305-donna + li t0, 0x3ffffff + .macro carry_prop\x a d + vwaddu.wv \d, \d, \carry \mask + vnsrl.wi \carry, \d, 26 \mask + vnsrl.wi \a, \d, 0 \mask + vand.vx \a, \a, t0 \mask + .endm + + vmv.v.i \carry, 0 + carry_prop\x \a0, \d0 + carry_prop\x \a1, \d1 + carry_prop\x \a2, \d2 + carry_prop\x \a3, \d3 + carry_prop\x \a4, \d4 + + # wraparound carry continue + vsll.vi \tmp, \carry, 2 \mask + vadd.vv \a0, \a0, \tmp \mask + vadd.vv \a0, \a0, \carry \mask + # boring stops carrying here, but that fails random tests + vsrl.vi \carry, \a0, 26 \mask + vand.vx \a0, \a0, t0 \mask + vadd.vv \a1, \a1, \carry \mask + + .endm + +# Scalar 130-bit a0-4 = a0-4 * a0-4 +.macro scalar_mul130 x a0 a1 a2 a3 a4 a51 a52 a53 a54 d0 d1 d2 d3 d4 carry tmp + # d0 column + mul \d0, \a1, \a54 + mul \tmp, \a2, \a53 + add \d0, \d0, \tmp + slli \d0, \d0, 1 + mul \tmp, \a0, \a0 + add \d0, \d0, \tmp + + # d1 column + mul \d1, \a1, \a0 + mul \tmp, \a2, \a54 + add \d1, \d1, \tmp + slli \d1, \d1, 1 + mul \tmp, \a53, \a3 + add \d1, \d1, \tmp + + # d2 column + mul \d2, \a2, \a0 + mul \tmp, \a53, \a4 + add \d2, \d2, \tmp + slli \d2, \d2, 1 + mul \tmp, \a1, \a1 + add \d2, \d2, \tmp + + # d3 column + mul \d3, \a3, \a0 + mul \tmp, \a1, \a2 + add \d3, \d3, \tmp + slli \d3, \d3, 1 + mul \tmp, \a54, \a4 + add \d3, \d3, \tmp + + # d4 column + mul \d4, \a4, \a0 + mul \tmp, \a1, \a3 + add \d4, \d4, \tmp + slli \d4, \d4, 1 + mul \tmp, \a2, \a2 + add \d4, \d4, \tmp + + # Carry propagation + # logic copied from https://github.com/floodyberry/poly1305-donna + li \tmp, 0x3ffffff + .macro carry_prop_scalar\x a d + add \d, \d, \carry + srli \carry, \d, 26 + and \a, \d, \tmp + .endm + + li \carry, 0 + carry_prop_scalar\x \a0, \d0 + carry_prop_scalar\x \a1, \d1 + carry_prop_scalar\x \a2, \d2 + carry_prop_scalar\x \a3, \d3 + carry_prop_scalar\x \a4, \d4 + + # wraparound carry continue + slli \tmp, \carry, 2 + add \a0, \a0, \tmp + add \a0, \a0, \carry + # carry as much as the other mul code + srli \carry, \a0, 26 + li \tmp, 0x3ffffff + and \a0, \a0, \tmp + add \a1, \a1, \carry + + # Store a*5 registers for next time + slli \a51, \a1, 2 + add \a51, \a51, \a1 + slli \a52, \a2, 2 + add \a52, \a52, \a2 + slli \a53, \a3, 2 + add \a53, \a53, \a3 + slli \a54, \a4, 2 + add \a54, \a54, \a4 + + .endm + +# Argument mappings +# a0: const uint8_t* in +# a1: size_t len +# a2: const uint8_t[32] key +# a3: uint8_t[16] sig +# Register mappings (https://en.wikichip.org/wiki/risc-v/registers) +# r^vlmax: s0, s1, s2, s3, s4 +# [r^vlmax, r^(vlmax-1), 
... r^2, r]: v6, v7, v8, v9, v10 +# current accumulated vector state: v1, v2, v3, v4, v5 +vector_poly1305: + # save registers + sd s0, -8(sp) + sd s1, -16(sp) + sd s2, -24(sp) + sd s3, -32(sp) + sd s4, -40(sp) + sd s5, -48(sp) + sd s6, -56(sp) + sd s7, -64(sp) + sd s8, -72(sp) + sd s9, -80(sp) + sd s11, -88(sp) + + # make sure input is a multiple of blocksize + andi t0, a1, 0xf + beq t0, zero, continue + li t0, 0x3713 # magic error number + sw t0, (a3) + j return +continue: + + # load R and spread to 5 26-bit limbs: s0-4 + ld t0, 0(a2) + ld t1, 8(a2) + li t5, 0x0ffffffc0fffffff + and t0, t0, t5 + li t5, 0x0ffffffc0ffffffc + and t1, t1, t5 + li t5, 0x3ffffff + and s0, t0, t5 + srli s1, t0, 26 + and s1, s1, t5 + srli s2, t0, 52 + slli t0, t1, 12 + or s2, s2, t0 + and s2, s2, t5 + srli s3, t1, 14 + and s3, s3, t5 + srli s4, t1, 40 + + # pre-multiplied-by-5 scalars + slli t4, s3, 2 + add t4, t4, s3 + slli t5, s4, 2 + add t5, t5, s4 + + # a5 is vlmax-1 for e32m1 + li t0, -1 + vsetvli a5, t0, e32, m1, ta, mu + addi a5, a5, -1 # vlmax-1 + # initialize vector to r^1 + vmv.v.x v6, s0 + vmv.v.x v7, s1 + vmv.v.x v8, s2 + vmv.v.x v9, s3 + vmv.v.x v10, s4 + + # Do first iteration manually, as we can masked set r^2 instead of doing a second multiplication + # a4 is current exp + li a4, 1 + # set alternating mask pattern + vid.v v1 + vrsub.vx v1, v1, a5 + vand.vx v1, v1, a4 + vmseq.vx v0, v1, a4 + slli a4, a4, 1 + + # scalar-scalar 130bit mul: s0-4 = s0-4 * s0-4 + scalar_mul130 1 s0 s1 s2 s3 s4 t2 t3 t4 t5 s5 s6 s7 s8 s9 t0 t1 + + vmv.v.i v11, 0 # no vmv with mask, so vor with 0 + vor.vx v6, v11, s0, v0.t + vor.vx v7, v11, s1, v0.t + vor.vx v8, v11, s2, v0.t + vor.vx v9, v11, s3, v0.t + vor.vx v10, v11, s4, v0.t + +precomp: + # compute mask (v0) + # exp-1: 7,6,5,4,3,2,1,0 (a5) + # r^1: 1,0,1,0,1,0,1,0 + # r^2: 1,1,0,0,1,1,0,0 + # r^4: 1,1,1,1,0,0,0,0 + vid.v v1 + vrsub.vx v1, v1, a5 + vand.vx v1, v1, a4 + vmseq.vx v0, v1, a4 + + # vector-scalar masked 130bit mul: v6-10 = v6-10 * s0-4 + vec_mul130 vxm v6 v7 v8 v9 v10 s0 s1 s2 s3 s4 t2 t3 t4 t5 v12 v14 v16 v18 v20 v11 v22 vx ",v0.t" + + # scalar-scalar 130bit mul: s0-4 = s0-4 * s0-4 + scalar_mul130 2 s0 s1 s2 s3 s4 t2 t3 t4 t5 s5 s6 s7 s8 s9 t0 t1 + + # end of precomp loop: + slli a4, a4, 1 # double exponent + blt a4, a5, precomp + + # store post-precomputation instruction counter + rdinstret s11 + + + # From v11-14, separate out into 5 26-bit limbs: v20-v24 + .macro vec_split5 + li t0, 0x3ffffff + vand.vx v20, v11, t0 + vsrl.vi v11, v11, 26 + vsll.vi v31, v12, 6 + vor.vv v11, v11, v31 + vand.vx v21, v11, t0 + vsrl.vi v12, v12, 20 + vsll.vi v31, v13, 12 + vor.vv v12, v12, v31 + vand.vx v22, v12, t0 + vsrl.vi v13, v13, 14 + vsll.vi v31, v14, 18 + vor.vv v13, v13, v31 + vand.vx v23, v13, t0 + vsrl.vi v24, v14, 8 + .endm + + # set up state as initial leading zero step + vmv.v.i v1, 0 + vmv.v.i v2, 0 + vmv.v.i v3, 0 + vmv.v.i v4, 0 + vmv.v.i v5, 0 + # a1: bytes left + # a4: blocks left + srli a4, a1, 4 + # t1: blocks in initial step + # use a full vector here, if blocks are a multiple of vector size + addi a4, a4, -1 + and t1, a4, a5 + addi a4, a4, 1 + addi t1, t1, 1 + + vsetvli t1, t1, e32, m1, ta, ma + vlseg4e32.v v11, (a0) + # increment pointer + slli t0, t1, 4 + add a0, a0, t0 + sub a1, a1, t0 + vec_split5 + # add leading bit + # TODO: don't run vector version if we can't even fill the first vector + li t0, 1<<24 + vor.vx v24, v24, t0 + + li t0, -1 + vsetvli a5, t0, e32, m1, ta, ma + sub t0, a5, t1 + slli a5, a5, 4 # block size in bytes + vslideup.vx v1, 
v20, t0 + vslideup.vx v2, v21, t0 + vslideup.vx v3, v22, t0 + vslideup.vx v4, v23, t0 + vslideup.vx v5, v24, t0 + + +vector_loop: + beq a1, zero, end_vector_loop + + # multiply by r^vlmax + vec_mul130 vx v1 v2 v3 v4 v5 s0 s1 s2 s3 s4 t2 t3 t4 t5 v12 v14 v16 v18 v20 v11 v22 vx + + # load in new data: v11-v14 + vlseg4e32.v v11, (a0) + add a0, a0, a5 + sub a1, a1, a5 + vec_split5 + # add leading bit + # TODO: support final non-full block correctly + li t0, 1<<24 + vor.vx v24, v24, t0 + + # add into state + vadd.vv v1, v1, v20 + vadd.vv v2, v2, v21 + vadd.vv v3, v3, v22 + vadd.vv v4, v4, v23 + vadd.vv v5, v5, v24 + + j vector_loop +end_vector_loop: + + # multiply in [r^vlmax, r^(vlmax-1),... r^2, r] + vsll.vi v27, v7, 2 + vadd.vv v27, v27, v7 + vsll.vi v28, v8, 2 + vadd.vv v28, v28, v8 + vsll.vi v29, v9, 2 + vadd.vv v29, v29, v9 + vsll.vi v30, v10, 2 + vadd.vv v30, v30, v10 + vec_mul130 vv v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 v27 v28 v29 v30 v12 v14 v16 v18 v20 v11 v22 vv + + # vector reduction, into widened sum in case vector is huge + vmv.v.i v6, 0 + vmv.v.i v7, 0 + vmv.v.i v8, 0 + vmv.v.i v9, 0 + vmv.v.i v10, 0 + vwredsum.vs v6, v1, v6 + vwredsum.vs v7, v2, v7 + vwredsum.vs v8, v3, v8 + vwredsum.vs v9, v4, v9 + vwredsum.vs v10, v5, v10 + # extract to scalars + li t0, 1 + vsetvli zero, t0, e64, m1, ta, ma + vmv.x.s s0, v6 + vmv.x.s s1, v7 + vmv.x.s s2, v8 + vmv.x.s s3, v9 + vmv.x.s s4, v10 + + # carry through + # t0=carry t1=mask + li t0, 0 + li t1, 0x3ffffff + .macro carry_scalar s + add \s, \s, t0 + srli t0, \s, 26 + and \s, \s, t1 + .endm + + carry_scalar s0 + carry_scalar s1 + carry_scalar s2 + carry_scalar s3 + carry_scalar s4 + # carry *= 5 + slli t2, t0, 2 + add t0, t0, t2 + carry_scalar s0 + carry_scalar s1 + carry_scalar s2 + carry_scalar s3 + carry_scalar s4 + # any remaining stuff to carry has to be in the 2 bits we don't care about, right? + bne t0, zero, return + + # collapse into contiguous 128 bits (s0,s2) + slli t0, s1, 26 + or s0, s0, t0 + slli t0, s2, 52 + or s0, s0, t0 + srli s2, s2, 12 + slli t0, s3, 14 + or s2, s2, t0 + slli t0, s4, 40 + or s2, s2, t0 + + # add in other half of key (after the carry it seems) + ld t0, 16(a2) + ld t1, 24(a2) + add s0, s0, t0 + sltu t0, s0, t0 + add s2, s2, t0 + add s2, s2, t1 + + # write final signature + sd s0, 0(a3) + sd s2, 8(a3) + +return: + # restore registers + mv a0, s11 + ld s0, -8(sp) + ld s1, -16(sp) + ld s2, -24(sp) + ld s3, -32(sp) + ld s4, -40(sp) + ld s5, -48(sp) + ld s6, -56(sp) + ld s7, -64(sp) + ld s8, -72(sp) + ld s9, -80(sp) + ld s11, -88(sp) + ret
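
For reference, the quarterround macro in vchacha.s is the standard ChaCha20
quarter round applied to whole vectors, with each vector element carrying one
state word of a different 64-byte block. A minimal scalar C sketch of the
operation being vectorized (hypothetical helper names, not part of the patch):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, int n) {
    return (x << n) | (x >> (32 - n));
}

/* One ChaCha20 quarter round on four 32-bit state words; vchacha.s performs
   the same add/xor/rotate steps on whole vectors v0-v15, so VL blocks advance
   per invocation. */
static void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
    *a += *b; *d ^= *a; *d = rotl32(*d, 16);
    *c += *d; *b ^= *c; *b = rotl32(*b, 12);
    *a += *b; *d ^= *a; *d = rotl32(*d, 8);
    *c += *d; *b ^= *c; *b = rotl32(*b, 7);
}

Similarly, vpoly.s keeps 130-bit poly1305 values as five 26-bit limbs and
reduces mod 2^130 - 5 by folding the carry out of the top limb back into limb
0 multiplied by 5, as done at the end of both vec_mul130 and scalar_mul130. A
small C sketch of that carry/reduce step (limb layout taken from the assembly;
the helper name is ours, not from the patch):

/* Propagate 26-bit carries across the five limbs, then fold the overflow
   above 2^130 back in times 5, since 2^130 is congruent to 5 mod 2^130 - 5. */
static void carry_reduce(uint64_t h[5]) {
    uint64_t carry = 0;
    for (int i = 0; i < 5; i++) {
        h[i] += carry;
        carry = h[i] >> 26;
        h[i] &= 0x3ffffff;
    }
    h[0] += carry * 5;   /* wraparound carry */
    h[1] += h[0] >> 26;  /* one extra partial carry, matching the asm */
    h[0] &= 0x3ffffff;
}

The vector path in vec_mul130 performs the same propagation with vnsrl/vand
across whole vectors of limbs, and the scalar path repeats it on s0-s4 before
the final collapse into the 128-bit tag.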