diff --git a/.gitmodules b/.gitmodules
index 9addb59..e69de29 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "bench/rvv-chacha-poly"]
- path = bench/rvv-chacha-poly
- url = https://github.com/edre/rvv-chacha-poly
diff --git a/bench/chacha20.S b/bench/chacha20.S
index 1e84dca..038b954 100644
--- a/bench/chacha20.S
+++ b/bench/chacha20.S
@@ -1,5 +1,5 @@
#ifndef MX
#if __riscv_xlen != 32
-#include "rvv-chacha-poly/vchacha.s"
+#include "../thirdparty/rvv-chacha-poly/vchacha.s"
#endif
#endif
diff --git a/bench/chacha20.c b/bench/chacha20.c
index 00ccf6e..ea5cb3c 100644
--- a/bench/chacha20.c
+++ b/bench/chacha20.c
@@ -1,6 +1,6 @@
#include "bench.h"
#if __riscv_xlen != 32
-#include "../thirdparty/boring.h"
+#include "../thirdparty/rvv-chacha-poly/boring.h"
uint8_t *dest, *src;
uint8_t key[32], nonce[12];
@@ -52,7 +52,7 @@ Bench benches[] = {
}; BENCH_MAIN(benches)
-#include "../thirdparty/boring.c"
+#include "../thirdparty/rvv-chacha-poly/boring.c"
#else
void init(void) {}
Impl impls[] = {};
diff --git a/bench/poly1305.S b/bench/poly1305.S
index 8658971..3e8ac2d 100644
--- a/bench/poly1305.S
+++ b/bench/poly1305.S
@@ -1,5 +1,5 @@
#ifndef MX
#if __riscv_xlen != 32
-#include "rvv-chacha-poly/vpoly.s"
+#include "../thirdparty/rvv-chacha-poly/vpoly.s"
#endif
#endif
diff --git a/bench/poly1305.c b/bench/poly1305.c
index ef8fb1c..4a56c58 100644
--- a/bench/poly1305.c
+++ b/bench/poly1305.c
@@ -1,6 +1,6 @@
#include "bench.h"
#if __riscv_xlen != 32
-#include "../thirdparty/boring.h"
+#include "../thirdparty/rvv-chacha-poly/boring.h"
uint8_t *src;
uint8_t key[32], sig[16];
@@ -55,7 +55,7 @@ Bench benches[] = {
}; BENCH_MAIN(benches)
-#include "../thirdparty/boring.c"
+#include "../thirdparty/rvv-chacha-poly/boring.c"
#else
void init(void) {}
Impl impls[] = {};
diff --git a/bench/rvv-chacha-poly b/bench/rvv-chacha-poly
deleted file mode 160000
index 7cffc88..0000000
--- a/bench/rvv-chacha-poly
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 7cffc882d35f36a355e83b35d9815f86e0a5598d
diff --git a/thirdparty/rvv-chacha-poly/CONTRIBUTING.md b/thirdparty/rvv-chacha-poly/CONTRIBUTING.md
new file mode 100644
index 0000000..a121ba3
--- /dev/null
+++ b/thirdparty/rvv-chacha-poly/CONTRIBUTING.md
@@ -0,0 +1,20 @@
+# How to Contribute
+
+I can accept your patches and contributions to this project with the
+following caveats from my employer:
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Community Guidelines
+
+Treat people with respect.
diff --git a/thirdparty/rvv-chacha-poly/LICENSE b/thirdparty/rvv-chacha-poly/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/thirdparty/rvv-chacha-poly/LICENSE
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/thirdparty/rvv-chacha-poly/README.md b/thirdparty/rvv-chacha-poly/README.md
new file mode 100644
index 0000000..d04a90e
--- /dev/null
+++ b/thirdparty/rvv-chacha-poly/README.md
@@ -0,0 +1,15 @@
+NOTE: code from https://github.com/edre/rvv-chacha-poly
+
+RISC-V vector extension implementation of chacha20 and poly1305
+cryptographic primitives.
+
+Chacha20 and poly1305 are simple to vectorize without specialized
+instructions. This project implements them in assembly, and verifies them
+against the BoringSSL C implementation. As expected, the executed instruction
+count goes down a lot, but I don't have real hardware to check whether the
+runtime does too.
+
+This is not an officially supported Google product.
+
+This is a proof of concept crypto library. Those words should sound very scary
+together. Don't use this.
diff --git a/thirdparty/boring.c b/thirdparty/rvv-chacha-poly/boring.c
similarity index 100%
rename from thirdparty/boring.c
rename to thirdparty/rvv-chacha-poly/boring.c
diff --git a/thirdparty/boring.h b/thirdparty/rvv-chacha-poly/boring.h
similarity index 100%
rename from thirdparty/boring.h
rename to thirdparty/rvv-chacha-poly/boring.h
diff --git a/thirdparty/rvv-chacha-poly/main.c b/thirdparty/rvv-chacha-poly/main.c
new file mode 100644
index 0000000..265a9b8
--- /dev/null
+++ b/thirdparty/rvv-chacha-poly/main.c
@@ -0,0 +1,195 @@
+/* Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdlib.h>
+#include "boring.h"
+
+void println_hex(uint8_t* data, int size) {
+ while (size > 0) {
+ printf("%02x", *data);
+ data++;
+ size--;
+ }
+ printf("\n");
+}
+
+// TODO: test the vector doesn't write past the end
+// test function with multiple length inputs (optional printing)
+// test non-block sized lengths
+
+extern uint64_t instruction_counter();
+
+const char* pass_str = "\x1b[32mPASS\x1b[0m";
+const char* fail_str = "\x1b[31mFAIL\x1b[0m";
+
+bool test_chacha(const uint8_t* data, size_t len, const uint8_t key[32], const uint8_t nonce[12], bool verbose) {
+ extern void vector_chacha20(uint8_t *out, const uint8_t *in,
+ size_t in_len, const uint8_t key[32],
+ const uint8_t nonce[12], uint32_t counter);
+ uint8_t* golden = malloc(len);
+ memset(golden, 0, len);
+ uint64_t start = instruction_counter();
+ boring_chacha20(golden, data, len, key, nonce, 0);
+ uint64_t end = instruction_counter();
+ uint64_t boring_count = end - start;
+
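+ // allocate a few spare bytes so the past-the-end check below can catch overwrites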
+ uint8_t* vector = malloc(len + 4);
+ memset(vector, 0, len+4);
+ start = instruction_counter();
+ vector_chacha20(vector, data, len, key, nonce, 0);
+ end = instruction_counter();
+
+ bool pass = memcmp(golden, vector, len) == 0;
+
+ if (verbose || !pass) {
+ printf("golden: ");
+ println_hex(golden, 32);
+ printf("inst_count=%d, inst/byte=%.02f\n", boring_count, (float)(boring_count)/len);
+ printf("vector: ");
+ println_hex(vector, 32);
+ printf("inst_count=%d, inst/byte=%.02f\n", end - start, (float)(end - start)/len);
+ }
+
+ uint32_t past_end = vector[len];
+ if (past_end != 0) {
+ printf("vector wrote past end %08x\n", past_end);
+ pass = false;
+ }
+
+ free(golden);
+ free(vector);
+
+ return pass;
+}
+
+void test_chachas(FILE* f) {
+ int len = 1024 - 11;
+ uint8_t* data = malloc(len);
+ uint32_t rand = 1;
+ for (int i = 0; i < len; i++) {
+ rand *= 101;
+ rand %= 16777213; // random prime
+ data[i] = (uint8_t)(rand);
+ }
+ uint8_t key[32] = "Setec astronomy;too many secrets";
+ uint8_t nonce[12] = "BurnAfterUse";
+ int counter = 0;
+
+ bool pass = test_chacha(data, len, key, nonce, true);
+
+ if (pass) {
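+ // sweep irregular lengths (1, 2, 4, 7, 11, ...) with fresh random keys and nonces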
+ for (int i = 1, len = 1; len < 1000; len += i++) {
+ fread(key, 32, 1, f);
+ fread(nonce, 12, 1, f);
+ if (!test_chacha(data, len, key, nonce, false)) {
+ printf("Failed with len=%d\n", len);
+ pass = false;
+ break;
+ }
+ }
+ }
+
+ if (pass) {
+ printf("chacha %s\n", pass_str);
+ } else {
+ printf("chacha %s\n", fail_str);
+ }
+}
+
+bool test_poly(const uint8_t* data, size_t len, const uint8_t key[32], bool verbose) {
+ extern uint64_t vector_poly1305(const uint8_t* in, size_t len,
+ const uint8_t key[32], uint8_t sig[16]);
+
+ poly1305_state state;
+ uint8_t *sig = malloc(16); // gets corrupted if I define it on the stack?
+ uint64_t start = instruction_counter();
+ boring_poly1305_init(&state, key);
+ boring_poly1305_update(&state, data, len);
+ boring_poly1305_finish(&state, sig);
+ uint64_t end = instruction_counter();
+ uint64_t boring_count = end - start;
+
+ uint8_t *sig2 = malloc(16);
+ start = instruction_counter();
+ uint64_t mid = vector_poly1305(data, len, key, sig2);
+ end = instruction_counter();
+
+ bool pass = memcmp(sig, sig2, 16) == 0;
+
+ if (verbose || !pass) {
+ printf("boring mac: ");
+ println_hex(sig, 16);
+ printf("inst_count=%d, inst/byte=%.02f\n", boring_count, (float)(boring_count)/len);
+ printf("vector mac: ");
+ println_hex(sig2, 16);
+ printf("precomputation=%d, processing=%d, inst/byte=%.02f\n",
+ mid - start, end - mid, (float)(end - mid)/len);
+ }
+
+ free(sig);
+ free(sig2);
+ return pass;
+}
+
+void test_polys(FILE* f) {
+ const int big_len = 1024;
+ uint8_t *zero = malloc(2000);
+ uint8_t *max_bits = malloc(big_len);
+ memset(max_bits, 0xff, big_len);
+ const uint8_t one[32] = {1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+ const uint8_t key[32] = {1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 255,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+ const uint8_t data[272] = "Setec astronomy;too many secrets";
+ bool pass = test_poly(max_bits, big_len, max_bits, true);
+
+ if (!pass)
+ goto end;
+
+ // random test
+ const int max_len = 1000;
+ uint8_t *rand = malloc(max_len);
+ for (int len = 16; len <= max_len; len += 16) {
+ fread((uint8_t*)key, 32, 1, f);
+ fread((uint8_t*)rand, len, 1, f);
+ if (!test_poly(rand, len, key, false)) {
+ printf("failed random input len=%d\n", len);
+ pass = false;
+ break;
+ }
+ }
+ free(rand);
+
+ end:
+ if (pass) {
+ printf("poly %s\n", pass_str);
+ } else {
+ printf("poly %s\n", fail_str);
+ }
+
+ free(zero);
+ free(max_bits);
+}
+
+int main(int argc, char *argv[]) {
+ extern uint32_t vlmax_u32();
+ printf("VLMAX in blocks: %d\n", vlmax_u32());
+ FILE* rand = fopen("/dev/urandom", "r");
+ test_chachas(rand);
+ printf("\n");
+ test_polys(rand);
+ fclose(rand);
+}
diff --git a/thirdparty/rvv-chacha-poly/test.sh b/thirdparty/rvv-chacha-poly/test.sh
new file mode 100755
index 0000000..6151fff
--- /dev/null
+++ b/thirdparty/rvv-chacha-poly/test.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Dependencies to be installed and on the PATH:
+# https://github.com/riscv/riscv-gnu-toolchain
+# https://github.com/riscv/riscv-isa-sim
+# configure --prefix=$RISCV --with-varch=v512:e64
+# https://github.com/riscv/riscv-pk
+
+ISA=rv64gcv
+
+riscv64-unknown-elf-gcc -march=$ISA main.c boring.c vchacha.s vpoly.s -o main -O &&
+ spike --isa=$ISA `which pk` main
diff --git a/thirdparty/rvv-chacha-poly/vchacha.s b/thirdparty/rvv-chacha-poly/vchacha.s
new file mode 100644
index 0000000..e09696d
--- /dev/null
+++ b/thirdparty/rvv-chacha-poly/vchacha.s
@@ -0,0 +1,267 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+.global instruction_counter
+.global vector_chacha20
+.global vlmax_u32
+
+instruction_counter:
+ rdinstret a0
+ ret
+
+vlmax_u32:
+ vsetvli a0, x0, e32, m1, ta, ma
+ ret
+
+
+# Cell-based implementation strategy:
+# v0-v15: Cell vectors. Each element is from a different block
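+# (so lane i of v0-v15 holds the full 16-word ChaCha state of block i, and each
+# vector instruction in the round loop advances every in-flight block at once)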
+
+## Function initialization
+# Using the same order as the boring chacha arguments:
+# a0 = uint8_t *out
+# a1 = uint8_t *in
+# a2 = size_t in_len
+# a3 = uint8_t key[32]
+# a4 = uint8_t nonce[12]
+# a5 = uint32_t counter
+vector_chacha20:
+ # a2 = initial length in bytes
+ # t3 = remaining 64-byte blocks to mix
+ # t4 = remaining full blocks to read/write
+ # (if t3 and t4 are different by one, there is a partial block to manually xor)
+ # t1 = vl in 64-byte blocks
+ srli t4, a2, 6
+ addi t0, a2, 63
+ srli t3, t0, 6
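+ # (t4 = floor(len/64) full blocks; t3 = ceil(len/64) blocks, via the add-63 trick)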
+encrypt_blocks:
+ # initialize vector state
+ vsetvli t1, t3, e32, m1, ta, ma
+ # Load 128 bit constant
+ li t0, 0x61707865 # "expa" little endian
+ vmv.v.x v0, t0
+ li t0, 0x3320646e # "nd 3" little endian
+ vmv.v.x v1, t0
+ li t0, 0x79622d32 # "2-by" little endian
+ vmv.v.x v2, t0
+ li t0, 0x6b206574 # "te k" little endian
+ vmv.v.x v3, t0
+ # Load key
+ lw t0, 0(a3)
+ vmv.v.x v4, t0
+ lw t0, 4(a3)
+ vmv.v.x v5, t0
+ lw t0, 8(a3)
+ vmv.v.x v6, t0
+ lw t0, 12(a3)
+ vmv.v.x v7, t0
+ lw t0, 16(a3)
+ vmv.v.x v8, t0
+ lw t0, 20(a3)
+ vmv.v.x v9, t0
+ lw t0, 24(a3)
+ vmv.v.x v10, t0
+ lw t0, 28(a3)
+ vmv.v.x v11, t0
+ # Load counter, and increment for each element
+ vid.v v12
+ vadd.vx v12, v12, a5
+ # Load nonce
+ lw t0, 0(a4)
+ vmv.v.x v13, t0
+ lw t0, 4(a4)
+ vmv.v.x v14, t0
+ lw t0, 8(a4)
+ vmv.v.x v15, t0
+
+ li t2, 10 # loop counter
+round_loop:
+
+ .macro vrotl a i r
+#if __riscv_zvbb
+ vror.vi \a, \a, 32-\i
+#else
+ vsll.vi v16, \a, \i
+ vsrl.vi \a, \a, 32-\i
+ vor.vv \a, \a, v16
+#endif
+ .endm
+
+ .macro quarterround a b c d
+ # a += b; d ^= a; d <<<= 16;
+ vadd.vv \a, \a, \b
+ vxor.vv \d, \d, \a
+ vrotl \d, 16, t6
+ # c += d; b ^= c; b <<<= 12;
+ vadd.vv \c, \c, \d
+ vxor.vv \b, \b, \c
+ vrotl \b, 12, t7
+ # a += b; d ^= a; d <<<= 8;
+ vadd.vv \a, \a, \b
+ vxor.vv \d, \d, \a
+ vrotl \d, 8, t8
+ # c += d; b ^= c; b <<<= 7;
+ vadd.vv \c, \c, \d
+ vxor.vv \b, \b, \c
+ vrotl \b, 7, t9
+ .endm
+
+ # Mix columns.
+ quarterround v0, v4, v8, v12
+ quarterround v1, v5, v9, v13
+ quarterround v2, v6, v10, v14
+ quarterround v3, v7, v11, v15
+ # Mix diagonals.
+ quarterround v0, v5, v10, v15
+ quarterround v1, v6, v11, v12
+ quarterround v2, v7, v8, v13
+ quarterround v3, v4, v9, v14
+
+ addi t2, t2, -1
+ bnez t2, round_loop
+
+ # Add in initial block values.
+ # 128 bit constant
+ li t0, 0x61707865 # "expa" little endian
+ vadd.vx v0, v0, t0
+ li t0, 0x3320646e # "nd 3" little endian
+ vadd.vx v1, v1, t0
+ li t0, 0x79622d32 # "2-by" little endian
+ vadd.vx v2, v2, t0
+ li t0, 0x6b206574 # "te k" little endian
+ vadd.vx v3, v3, t0
+ # Add key
+ lw t0, 0(a3)
+ vadd.vx v4, v4, t0
+ lw t0, 4(a3)
+ vadd.vx v5, v5, t0
+ lw t0, 8(a3)
+ vadd.vx v6, v6, t0
+ lw t0, 12(a3)
+ vadd.vx v7, v7, t0
+ lw t0, 16(a3)
+ vadd.vx v8, v8, t0
+ lw t0, 20(a3)
+ vadd.vx v9, v9, t0
+ lw t0, 24(a3)
+ vadd.vx v10, v10, t0
+ lw t0, 28(a3)
+ vadd.vx v11, v11, t0
+ # Add counter
+ vid.v v16
+ vadd.vv v12, v12, v16
+ vadd.vx v12, v12, a5
+ # Load nonce
+ lw t0, 0(a4)
+ vadd.vx v13, v13, t0
+ lw t0, 4(a4)
+ vadd.vx v14, v14, t0
+ lw t0, 8(a4)
+ vadd.vx v15, v15, t0
+
+ # load in vector lanes with two strided segment loads
+ # in case this is the final block, reset vl to full blocks
+ vsetvli t5, t4, e32, m1, ta, ma
+ li t0, 64
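+ # with a 64-byte stride, each vlsseg8e32 gathers 8 consecutive 32-bit words per
+ # block: words 0-7 of every block land in v16-v23, and the second load (base+32)
+ # puts words 8-15 in v24-v31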
+ vlsseg8e32.v v16, (a1), t0
+ add a1, a1, 32
+ vlsseg8e32.v v24, (a1), t0
+ add a1, a1, -32
+
+ # xor in state
+ vxor.vv v16, v16, v0
+ vxor.vv v17, v17, v1
+ vxor.vv v18, v18, v2
+ vxor.vv v19, v19, v3
+ vxor.vv v20, v20, v4
+ vxor.vv v21, v21, v5
+ vxor.vv v22, v22, v6
+ vxor.vv v23, v23, v7
+ vxor.vv v24, v24, v8
+ vxor.vv v25, v25, v9
+ vxor.vv v26, v26, v10
+ vxor.vv v27, v27, v11
+ vxor.vv v28, v28, v12
+ vxor.vv v29, v29, v13
+ vxor.vv v30, v30, v14
+ vxor.vv v31, v31, v15
+
+ # write back out with 2 strided segment stores
+ vssseg8e32.v v16, (a0), t0
+ add a0, a0, 32
+ vssseg8e32.v v24, (a0), t0
+ add a0, a0, -32
+
+ # update counters/pointers
+ slli t0, t5, 6 # current VL in bytes
+ add a0, a0, t0 # advance output pointer
+ add a1, a1, t0 # advance input pointer
+ sub a2, a2, t0 # decrement remaining bytes
+ sub t3, t3, t1 # decrement remaining blocks
+ sub t4, t4, t1 # decrement remaining blocks
+ # TODO: crash if counter overflows
+ add a5, a5, t1 # increment counter
+
+ # loop again if we have remaining blocks
+ bne x0, t3, encrypt_blocks
+
+ # we're done if there are no more remaining bytes from a partial block
+ beq zero, a2, return
+
+ # to get the remaining partial block, we transfer the nth element of
+ # all the state vectors into contiguous stack memory with vsseg, then
+ # read them with byte-granularity vl
+
+ # reconstruct vl for all computed blocks
+ add t0, t3, t1
+ vsetvli t0, t0, e32, m1, ta, ma
+ add t0, t0, -1
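+ # t0 now indexes the lane holding the final, partially-consumed block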
+
+ #vse.v v4, (a0)
+ #j return
+
+ # use a masked vsseg instead of sliding everything down?
+ # both options seem like they might touch a lot of vector state...
+ vslidedown.vx v16, v0, t0
+ vslidedown.vx v17, v1, t0
+ vslidedown.vx v18, v2, t0
+ vslidedown.vx v19, v3, t0
+ vslidedown.vx v20, v4, t0
+ vslidedown.vx v21, v5, t0
+ vslidedown.vx v22, v6, t0
+ vslidedown.vx v23, v7, t0
+ vslidedown.vx v24, v8, t0
+ vslidedown.vx v25, v9, t0
+ vslidedown.vx v26, v10, t0
+ vslidedown.vx v27, v11, t0
+ vslidedown.vx v28, v12, t0
+ vslidedown.vx v29, v13, t0
+ vslidedown.vx v30, v14, t0
+ vslidedown.vx v31, v15, t0
+ li t0, 1
+ vsetvli zero, t0, e32, m1, ta, ma
+ addi t0, sp, -64
+ addi t1, sp, -32
+ vsseg8e32.v v16, (t0)
+ vsseg8e32.v v24, (t1)
+
+ vsetvli a2, a2, e8, m8, ta, ma
+ vle8.v v0, (a1)
+ vle8.v v8, (t0)
+ vxor.vv v0, v0, v8
+ vse8.v v0, (a0)
+
+
+return:
+ ret
diff --git a/thirdparty/rvv-chacha-poly/vpoly.s b/thirdparty/rvv-chacha-poly/vpoly.s
new file mode 100644
index 0000000..a446b34
--- /dev/null
+++ b/thirdparty/rvv-chacha-poly/vpoly.s
@@ -0,0 +1,497 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+.global vector_poly1305
+# poly1305
+# Based on the obvious SIMD algorithm, described as Goll-Gueron here:
+# https://eprint.iacr.org/2019/842.pdf
+# Assumes VLEN is a power of 2, and that intermediate vsetvl will always return the max.
+# Hash is defined simply, for 32-byte key split between 16-byte s and r:
+# s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r mod 2¹³⁰ - 5
+# Performant implementations represent 130 bit numbers as 5 26-bit numbers.
+# Precomputation step:
+# Compute vector [r, r², r³, r⁴, ...] ( 5 32-bit vectors)
+# Compute scalar r^VLMAX (5 32-bit registers)
+# This can be done in 2*log2(VLMAX) multiplications:
+# i = 1; m = r; v = r
+# while i < VLMAX:
+# v *= m (masking out the last i elements)
+# m *= m
+# i <<= 1
+# Vector loop:
+# load segment (from the end) into 4 32-bit vectors
+# spread into standard 5 32-bit vector format
+# vector multiply into polynomial vector
+# vector add into sum so far
+# vector-scalar multiply polynomial vector with r^VLMAX
+# Extract:
+# vector sum reduce polynomial vector into scalar
+# add to s
+# extract 16-byte hash
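+#
+# Worked example (assuming VLMAX=4 and 8 blocks m1..m8): the lanes first hold
+# [m1, m2, m3, m4]; one loop iteration gives [m1*r^4+m5, ..., m4*r^4+m8]; the
+# final multiply by [r^4, r^3, r^2, r] and sum-reduce yields
+# m1*r^8 + m2*r^7 + ... + m8*r, matching the definition above.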
+
+# TODO: implement this with C intrinsics in rvv_vector.h, as register
+# allocation is actually annoying.
+
+# Generic 130-bit multiply/mod code
+# Reads 5-limbed inputs from a and b, writes result to a
+# Uses 5 e64,m2 d registers for accumulation
+.macro vec_mul130 x a0 a1 a2 a3 a4 b0 b1 b2 b3 b4 b51 b52 b53 b54 d0 d1 d2 d3 d4 carry tmp v mask=""
+ # Helpful diagram from http://loup-vaillant.fr/tutorials/poly1305-design
+ # a4 a3 a2 a1 a0
+ # × b4 b3 b2 b1 b0
+ # ---------------------------------------
+ # a4×b0 a3×b0 a2×b0 a1×b0 a0×b0
+ # + a3×b1 a2×b1 a1×b1 a0×b1 5×a4×b1
+ # + a2×b2 a1×b2 a0×b2 5×a4×b2 5×a3×b2
+ # + a1×b3 a0×b3 5×a4×b3 5×a3×b3 5×a2×b3
+ # + a0×b4 5×a4×b4 5×a3×b4 5×a2×b4 5×a1×b4
+ # ---------------------------------------
+ # d4 d3 d2 d1 d0
+
+ # would it be more/less performant to do this by rows instead of columns?
+ # vectors pipelining without requiring stalls etc
+ # d0 column
+ vwmulu.\v \d0, \a0, \b0 \mask
+ vwmaccu.\v \d0, \b51, \a4 \mask
+ vwmaccu.\v \d0, \b52, \a3 \mask
+ vwmaccu.\v \d0, \b53, \a2 \mask
+ vwmaccu.\v \d0, \b54, \a1 \mask
+
+ # d1 column
+ vwmulu.\v \d1, \a1, \b0 \mask
+ vwmaccu.\v \d1, \b1, \a0 \mask
+ vwmaccu.\v \d1, \b52, \a4 \mask
+ vwmaccu.\v \d1, \b53, \a3 \mask
+ vwmaccu.\v \d1, \b54, \a2 \mask
+
+ # d2 column
+ vwmulu.\v \d2, \a2, \b0 \mask
+ vwmaccu.\v \d2, \b1, \a1 \mask
+ vwmaccu.\v \d2, \b2, \a0 \mask
+ vwmaccu.\v \d2, \b53, \a4 \mask
+ vwmaccu.\v \d2, \b54, \a3 \mask
+
+ # d3 column
+ vwmulu.\v \d3, \a3, \b0 \mask
+ vwmaccu.\v \d3, \b1, \a2 \mask
+ vwmaccu.\v \d3, \b2, \a1 \mask
+ vwmaccu.\v \d3, \b3, \a0 \mask
+ vwmaccu.\v \d3, \b54, \a4 \mask
+
+ # d4 column
+ vwmulu.\v \d4, \a4, \b0 \mask
+ vwmaccu.\v \d4, \b1, \a3 \mask
+ vwmaccu.\v \d4, \b2, \a2 \mask
+ vwmaccu.\v \d4, \b3, \a1 \mask
+ vwmaccu.\v \d4, \b4, \a0 \mask
+
+ # Carry propagation
+ # logic copied from https://github.com/floodyberry/poly1305-donna
+ li t0, 0x3ffffff
+ .macro carry_prop\x a d
+ vwaddu.wv \d, \d, \carry \mask
+ vnsrl.wi \carry, \d, 26 \mask
+ vnsrl.wi \a, \d, 0 \mask
+ vand.vx \a, \a, t0 \mask
+ .endm
+
+ vmv.v.i \carry, 0
+ carry_prop\x \a0, \d0
+ carry_prop\x \a1, \d1
+ carry_prop\x \a2, \d2
+ carry_prop\x \a3, \d3
+ carry_prop\x \a4, \d4
+
+ # wraparound carry continue
+ vsll.vi \tmp, \carry, 2 \mask
+ vadd.vv \a0, \a0, \tmp \mask
+ vadd.vv \a0, \a0, \carry \mask
+ # boring stops carrying here, but that fails random tests
+ vsrl.vi \carry, \a0, 26 \mask
+ vand.vx \a0, \a0, t0 \mask
+ vadd.vv \a1, \a1, \carry \mask
+
+ .endm
+
+# Scalar 130-bit a0-4 = a0-4 * a0-4
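+# (a squaring, so the schoolbook cross terms pair up and are doubled,
+# e.g. d0 = a0*a0 + 2*(a1*(5*a4) + a2*(5*a3)))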
+.macro scalar_mul130 x a0 a1 a2 a3 a4 a51 a52 a53 a54 d0 d1 d2 d3 d4 carry tmp
+ # d0 column
+ mul \d0, \a1, \a54
+ mul \tmp, \a2, \a53
+ add \d0, \d0, \tmp
+ slli \d0, \d0, 1
+ mul \tmp, \a0, \a0
+ add \d0, \d0, \tmp
+
+ # d1 column
+ mul \d1, \a1, \a0
+ mul \tmp, \a2, \a54
+ add \d1, \d1, \tmp
+ slli \d1, \d1, 1
+ mul \tmp, \a53, \a3
+ add \d1, \d1, \tmp
+
+ # d2 column
+ mul \d2, \a2, \a0
+ mul \tmp, \a53, \a4
+ add \d2, \d2, \tmp
+ slli \d2, \d2, 1
+ mul \tmp, \a1, \a1
+ add \d2, \d2, \tmp
+
+ # d3 column
+ mul \d3, \a3, \a0
+ mul \tmp, \a1, \a2
+ add \d3, \d3, \tmp
+ slli \d3, \d3, 1
+ mul \tmp, \a54, \a4
+ add \d3, \d3, \tmp
+
+ # d4 column
+ mul \d4, \a4, \a0
+ mul \tmp, \a1, \a3
+ add \d4, \d4, \tmp
+ slli \d4, \d4, 1
+ mul \tmp, \a2, \a2
+ add \d4, \d4, \tmp
+
+ # Carry propagation
+ # logic copied from https://github.com/floodyberry/poly1305-donna
+ li \tmp, 0x3ffffff
+ .macro carry_prop_scalar\x a d
+ add \d, \d, \carry
+ srli \carry, \d, 26
+ and \a, \d, \tmp
+ .endm
+
+ li \carry, 0
+ carry_prop_scalar\x \a0, \d0
+ carry_prop_scalar\x \a1, \d1
+ carry_prop_scalar\x \a2, \d2
+ carry_prop_scalar\x \a3, \d3
+ carry_prop_scalar\x \a4, \d4
+
+ # wraparound carry continue
+ slli \tmp, \carry, 2
+ add \a0, \a0, \tmp
+ add \a0, \a0, \carry
+ # carry as much as the other mul code
+ srli \carry, \a0, 26
+ li \tmp, 0x3ffffff
+ and \a0, \a0, \tmp
+ add \a1, \a1, \carry
+
+ # Store a*5 registers for next time
+ slli \a51, \a1, 2
+ add \a51, \a51, \a1
+ slli \a52, \a2, 2
+ add \a52, \a52, \a2
+ slli \a53, \a3, 2
+ add \a53, \a53, \a3
+ slli \a54, \a4, 2
+ add \a54, \a54, \a4
+
+ .endm
+
+# Argument mappings
+# a0: const uint8_t* in
+# a1: size_t len
+# a2: const uint8_t[32] key
+# a3: uint8_t[16] sig
+# Register mappings (https://en.wikichip.org/wiki/risc-v/registers)
+# r^vlmax: s0, s1, s2, s3, s4
+# [r^vlmax, r^(vlmax-1), ... r^2, r]: v6, v7, v8, v9, v10
+# current accumulated vector state: v1, v2, v3, v4, v5
+vector_poly1305:
+ # save registers
+ sd s0, -8(sp)
+ sd s1, -16(sp)
+ sd s2, -24(sp)
+ sd s3, -32(sp)
+ sd s4, -40(sp)
+ sd s5, -48(sp)
+ sd s6, -56(sp)
+ sd s7, -64(sp)
+ sd s8, -72(sp)
+ sd s9, -80(sp)
+ sd s11, -88(sp)
+
+ # make sure input is a multiple of blocksize
+ andi t0, a1, 0xf
+ beq t0, zero, continue
+ li t0, 0x3713 # magic error number
+ sw t0, (a3)
+ j return
+continue:
+
+ # load R and spread to 5 26-bit limbs: s0-4
+ ld t0, 0(a2)
+ ld t1, 8(a2)
+ li t5, 0x0ffffffc0fffffff
+ and t0, t0, t5
+ li t5, 0x0ffffffc0ffffffc
+ and t1, t1, t5
+ li t5, 0x3ffffff
+ and s0, t0, t5
+ srli s1, t0, 26
+ and s1, s1, t5
+ srli s2, t0, 52
+ slli t0, t1, 12
+ or s2, s2, t0
+ and s2, s2, t5
+ srli s3, t1, 14
+ and s3, s3, t5
+ srli s4, t1, 40
+
+ # pre-multiplied-by-5 scalars
+ slli t4, s3, 2
+ add t4, t4, s3
+ slli t5, s4, 2
+ add t5, t5, s4
+
+ # a5 is vlmax-1 for e32m1
+ li t0, -1
+ vsetvli a5, t0, e32, m1, ta, mu
+ addi a5, a5, -1 # vlmax-1
+ # initialize vector to r^1
+ vmv.v.x v6, s0
+ vmv.v.x v7, s1
+ vmv.v.x v8, s2
+ vmv.v.x v9, s3
+ vmv.v.x v10, s4
+
+ # Do first iteration manually, as we can masked set r^2 instead of doing a second multiplication
+ # a4 is current exp
+ li a4, 1
+ # set alternating mask pattern
+ vid.v v1
+ vrsub.vx v1, v1, a5
+ vand.vx v1, v1, a4
+ vmseq.vx v0, v1, a4
+ slli a4, a4, 1
+
+ # scalar-scalar 130bit mul: s0-4 = s0-4 * s0-4
+ scalar_mul130 1 s0 s1 s2 s3 s4 t2 t3 t4 t5 s5 s6 s7 s8 s9 t0 t1
+
+ vmv.v.i v11, 0 # no vmv with mask, so vor with 0
+ vor.vx v6, v11, s0, v0.t
+ vor.vx v7, v11, s1, v0.t
+ vor.vx v8, v11, s2, v0.t
+ vor.vx v9, v11, s3, v0.t
+ vor.vx v10, v11, s4, v0.t
+
+precomp:
+ # compute mask (v0)
+ # exp-1: 7,6,5,4,3,2,1,0 (a5)
+ # r^1: 1,0,1,0,1,0,1,0
+ # r^2: 1,1,0,0,1,1,0,0
+ # r^4: 1,1,1,1,0,0,0,0
+ vid.v v1
+ vrsub.vx v1, v1, a5
+ vand.vx v1, v1, a4
+ vmseq.vx v0, v1, a4
+
+ # vector-scalar masked 130bit mul: v6-10 = v6-10 * s0-4
+ vec_mul130 vxm v6 v7 v8 v9 v10 s0 s1 s2 s3 s4 t2 t3 t4 t5 v12 v14 v16 v18 v20 v11 v22 vx ",v0.t"
+
+ # scalar-scalar 130bit mul: s0-4 = s0-4 * s0-4
+ scalar_mul130 2 s0 s1 s2 s3 s4 t2 t3 t4 t5 s5 s6 s7 s8 s9 t0 t1
+
+ # end of precomp loop:
+ slli a4, a4, 1 # double exponent
+ blt a4, a5, precomp
+
+ # store post-precomputation instruction counter
+ rdinstret s11
+
+
+ # From v11-14, separate out into 5 26-bit limbs: v20-v24
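+ # (v11-v14 hold one 128-bit block per lane as four 32-bit words from vlseg4e32;
+ # limb k takes bits 26k..26k+25, the top limb holding only 24 bits until the
+ # leading bit is OR'd in below)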
+ .macro vec_split5
+ li t0, 0x3ffffff
+ vand.vx v20, v11, t0
+ vsrl.vi v11, v11, 26
+ vsll.vi v31, v12, 6
+ vor.vv v11, v11, v31
+ vand.vx v21, v11, t0
+ vsrl.vi v12, v12, 20
+ vsll.vi v31, v13, 12
+ vor.vv v12, v12, v31
+ vand.vx v22, v12, t0
+ vsrl.vi v13, v13, 14
+ vsll.vi v31, v14, 18
+ vor.vv v13, v13, v31
+ vand.vx v23, v13, t0
+ vsrl.vi v24, v14, 8
+ .endm
+
+ # set up state as initial leading zero step
+ vmv.v.i v1, 0
+ vmv.v.i v2, 0
+ vmv.v.i v3, 0
+ vmv.v.i v4, 0
+ vmv.v.i v5, 0
+ # a1: bytes left
+ # a4: blocks left
+ srli a4, a1, 4
+ # t1: blocks in initial step
+ # use a full vector here, if blocks are a multiple of vector size
+ addi a4, a4, -1
+ and t1, a4, a5
+ addi a4, a4, 1
+ addi t1, t1, 1
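+ # i.e. t1 = ((blocks-1) mod VLMAX) + 1, relying on VLMAX being a power of two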
+
+ vsetvli t1, t1, e32, m1, ta, ma
+ vlseg4e32.v v11, (a0)
+ # increment pointer
+ slli t0, t1, 4
+ add a0, a0, t0
+ sub a1, a1, t0
+ vec_split5
+ # add leading bit
+ # TODO: don't run vector version if we can't even fill the first vector
+ li t0, 1<<24
+ vor.vx v24, v24, t0
+
+ li t0, -1
+ vsetvli a5, t0, e32, m1, ta, ma
+ sub t0, a5, t1
+ slli a5, a5, 4 # block size in bytes
+ vslideup.vx v1, v20, t0
+ vslideup.vx v2, v21, t0
+ vslideup.vx v3, v22, t0
+ vslideup.vx v4, v23, t0
+ vslideup.vx v5, v24, t0
+
+
+vector_loop:
+ beq a1, zero, end_vector_loop
+
+ # multiply by r^vlmax
+ vec_mul130 vx v1 v2 v3 v4 v5 s0 s1 s2 s3 s4 t2 t3 t4 t5 v12 v14 v16 v18 v20 v11 v22 vx
+
+ # load in new data: v11-v14
+ vlseg4e32.v v11, (a0)
+ add a0, a0, a5
+ sub a1, a1, a5
+ vec_split5
+ # add leading bit
+ # TODO: support final non-full block correctly
+ li t0, 1<<24
+ vor.vx v24, v24, t0
+
+ # add into state
+ vadd.vv v1, v1, v20
+ vadd.vv v2, v2, v21
+ vadd.vv v3, v3, v22
+ vadd.vv v4, v4, v23
+ vadd.vv v5, v5, v24
+
+ j vector_loop
+end_vector_loop:
+
+ # multiply in [r^vlmax, r^(vlmax-1),... r^2, r]
+ vsll.vi v27, v7, 2
+ vadd.vv v27, v27, v7
+ vsll.vi v28, v8, 2
+ vadd.vv v28, v28, v8
+ vsll.vi v29, v9, 2
+ vadd.vv v29, v29, v9
+ vsll.vi v30, v10, 2
+ vadd.vv v30, v30, v10
+ vec_mul130 vv v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 v27 v28 v29 v30 v12 v14 v16 v18 v20 v11 v22 vv
+
+ # vector reduction, into widened sum in case vector is huge
+ vmv.v.i v6, 0
+ vmv.v.i v7, 0
+ vmv.v.i v8, 0
+ vmv.v.i v9, 0
+ vmv.v.i v10, 0
+ vwredsum.vs v6, v1, v6
+ vwredsum.vs v7, v2, v7
+ vwredsum.vs v8, v3, v8
+ vwredsum.vs v9, v4, v9
+ vwredsum.vs v10, v5, v10
+ # extract to scalars
+ li t0, 1
+ vsetvli zero, t0, e64, m1, ta, ma
+ vmv.x.s s0, v6
+ vmv.x.s s1, v7
+ vmv.x.s s2, v8
+ vmv.x.s s3, v9
+ vmv.x.s s4, v10
+
+ # carry through
+ # t0=carry t1=mask
+ li t0, 0
+ li t1, 0x3ffffff
+ .macro carry_scalar s
+ add \s, \s, t0
+ srli t0, \s, 26
+ and \s, \s, t1
+ .endm
+
+ carry_scalar s0
+ carry_scalar s1
+ carry_scalar s2
+ carry_scalar s3
+ carry_scalar s4
+ # carry *= 5
+ slli t2, t0, 2
+ add t0, t0, t2
+ carry_scalar s0
+ carry_scalar s1
+ carry_scalar s2
+ carry_scalar s3
+ carry_scalar s4
+ # any remaining stuff to carry has to be in the 2 bits we don't care about, right?
+ bne t0, zero, return
+
+ # collapse into contiguous 128 bits (s0,s2)
+ slli t0, s1, 26
+ or s0, s0, t0
+ slli t0, s2, 52
+ or s0, s0, t0
+ srli s2, s2, 12
+ slli t0, s3, 14
+ or s2, s2, t0
+ slli t0, s4, 40
+ or s2, s2, t0
+
+ # add in other half of key (after the carry it seems)
+ ld t0, 16(a2)
+ ld t1, 24(a2)
+ add s0, s0, t0
+ sltu t0, s0, t0
+ add s2, s2, t0
+ add s2, s2, t1
+
+ # write final signature
+ sd s0, 0(a3)
+ sd s2, 8(a3)
+
+return:
+ # restore registers
+ mv a0, s11
+ ld s0, -8(sp)
+ ld s1, -16(sp)
+ ld s2, -24(sp)
+ ld s3, -32(sp)
+ ld s4, -40(sp)
+ ld s5, -48(sp)
+ ld s6, -56(sp)
+ ld s7, -64(sp)
+ ld s8, -72(sp)
+ ld s9, -80(sp)
+ ld s11, -88(sp)
+ ret