diff --git a/.gitmodules b/.gitmodules
index 9addb59..e69de29 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "bench/rvv-chacha-poly"]
- path = bench/rvv-chacha-poly
- url = https://github.com/edre/rvv-chacha-poly
diff --git a/bench/chacha20.S b/bench/chacha20.S
index 1e84dca..038b954 100644
--- a/bench/chacha20.S
+++ b/bench/chacha20.S
@@ -1,5 +1,5 @@
#ifndef MX
#if __riscv_xlen != 32
-#include "rvv-chacha-poly/vchacha.s"
+#include "../thirdparty/rvv-chacha-poly/vchacha.s"
#endif
#endif
diff --git a/bench/chacha20.c b/bench/chacha20.c
index 00ccf6e..ea5cb3c 100644
--- a/bench/chacha20.c
+++ b/bench/chacha20.c
@@ -1,6 +1,6 @@
#include "bench.h"
#if __riscv_xlen != 32
-#include "../thirdparty/boring.h"
+#include "../thirdparty/rvv-chacha-poly/boring.h"
uint8_t *dest, *src;
uint8_t key[32], nonce[12];
@@ -52,7 +52,7 @@ Bench benches[] = {
}; BENCH_MAIN(benches)
-#include "../thirdparty/boring.c"
+#include "../thirdparty/rvv-chacha-poly/boring.c"
#else
void init(void) {}
Impl impls[] = {};
diff --git a/bench/poly1305.S b/bench/poly1305.S
index 8658971..3e8ac2d 100644
--- a/bench/poly1305.S
+++ b/bench/poly1305.S
@@ -1,5 +1,5 @@
#ifndef MX
#if __riscv_xlen != 32
-#include "rvv-chacha-poly/vpoly.s"
+#include "../thirdparty/rvv-chacha-poly/vpoly.s"
#endif
#endif
diff --git a/bench/poly1305.c b/bench/poly1305.c
index ef8fb1c..4a56c58 100644
--- a/bench/poly1305.c
+++ b/bench/poly1305.c
@@ -1,6 +1,6 @@
#include "bench.h"
#if __riscv_xlen != 32
-#include "../thirdparty/boring.h"
+#include "../thirdparty/rvv-chacha-poly/boring.h"
uint8_t *src;
uint8_t key[32], sig[16];
@@ -55,7 +55,7 @@ Bench benches[] = {
}; BENCH_MAIN(benches)
-#include "../thirdparty/boring.c"
+#include "../thirdparty/rvv-chacha-poly/boring.c"
#else
void init(void) {}
Impl impls[] = {};
diff --git a/bench/rvv-chacha-poly b/bench/rvv-chacha-poly
deleted file mode 160000
index 7cffc88..0000000
--- a/bench/rvv-chacha-poly
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 7cffc882d35f36a355e83b35d9815f86e0a5598d
diff --git a/thirdparty/rvv-chacha-poly/CONTRIBUTING.md b/thirdparty/rvv-chacha-poly/CONTRIBUTING.md
new file mode 100644
index 0000000..a121ba3
--- /dev/null
+++ b/thirdparty/rvv-chacha-poly/CONTRIBUTING.md
@@ -0,0 +1,20 @@
+# How to Contribute
+
+I can accept your patches and contributions to this project with the
+following caveats from my employer:
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Community Guidelines
+
+Treat people with respect.
diff --git a/thirdparty/rvv-chacha-poly/LICENSE b/thirdparty/rvv-chacha-poly/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/thirdparty/rvv-chacha-poly/LICENSE
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/thirdparty/rvv-chacha-poly/README.md b/thirdparty/rvv-chacha-poly/README.md
new file mode 100644
index 0000000..d04a90e
--- /dev/null
+++ b/thirdparty/rvv-chacha-poly/README.md
@@ -0,0 +1,15 @@
+NOTE: code from https://github.com/edre/rvv-chacha-poly
+
+RISC-V vector extension implementation of chacha20 and poly1305
+cryptographic primitives.
+
+Chacha20 and poly1305 are simple to vectorize without specialized
+instructions. This project implements them in assembly, and verifies them
+against the BoringSSL C implementation. As expected, the executed instruction
+count goes down a lot, but I don't have real hardware to check whether the
+runtime does too.
+
+This is not an officially supported Google product.
+
+This is a proof of concept crypto library. Those words should sound very scary
+together. Don't use this.
diff --git a/thirdparty/boring.c b/thirdparty/rvv-chacha-poly/boring.c
similarity index 100%
rename from thirdparty/boring.c
rename to thirdparty/rvv-chacha-poly/boring.c
diff --git a/thirdparty/boring.h b/thirdparty/rvv-chacha-poly/boring.h
similarity index 100%
rename from thirdparty/boring.h
rename to thirdparty/rvv-chacha-poly/boring.h
diff --git a/thirdparty/rvv-chacha-poly/main.c b/thirdparty/rvv-chacha-poly/main.c
new file mode 100644
index 0000000..265a9b8
--- /dev/null
+++ b/thirdparty/rvv-chacha-poly/main.c
@@ -0,0 +1,195 @@
+/* Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdlib.h>
+#include "boring.h"
+
+void println_hex(uint8_t* data, int size) {
+ while (size > 0) {
+ printf("%02x", *data);
+ data++;
+ size--;
+ }
+ printf("\n");
+}
+
+// TODO: test the vector doesn't write past the end
+// test function with multiple length inputs (optional printing)
+// test non-block sized lengths
+
+extern uint64_t instruction_counter();
+
+const char* pass_str = "\x1b[32mPASS\x1b[0m";
+const char* fail_str = "\x1b[31mFAIL\x1b[0m";
+
+bool test_chacha(const uint8_t* data, size_t len, const uint8_t key[32], const uint8_t nonce[12], bool verbose) {
+ extern void vector_chacha20(uint8_t *out, const uint8_t *in,
+ size_t in_len, const uint8_t key[32],
+ const uint8_t nonce[12], uint32_t counter);
+ uint8_t* golden = malloc(len);
+ memset(golden, 0, len);
+ uint64_t start = instruction_counter();
+ boring_chacha20(golden, data, len, key, nonce, 0);
+ uint64_t end = instruction_counter();
+ uint64_t boring_count = end - start;
+
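+ // allocate a few spare bytes so the past-the-end check below can catch overwrites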
+ uint8_t* vector = malloc(len + 4);
+ memset(vector, 0, len+4);
+ start = instruction_counter();
+ vector_chacha20(vector, data, len, key, nonce, 0);
+ end = instruction_counter();
+
+ bool pass = memcmp(golden, vector, len) == 0;
+
+ if (verbose || !pass) {
+ printf("golden: ");
+ println_hex(golden, 32);
+ printf("inst_count=%d, inst/byte=%.02f\n", boring_count, (float)(boring_count)/len);
+ printf("vector: ");
+ println_hex(vector, 32);
+ printf("inst_count=%d, inst/byte=%.02f\n", end - start, (float)(end - start)/len);
+ }
+
+ uint32_t past_end = vector[len];
+ if (past_end != 0) {
+ printf("vector wrote past end %08x\n", past_end);
+ pass = false;
+ }
+
+ free(golden);
+ free(vector);
+
+ return pass;
+}
+
+void test_chachas(FILE* f) {
+ int len = 1024 - 11;
+ uint8_t* data = malloc(len);
+ uint32_t rand = 1;
+ for (int i = 0; i < len; i++) {
+ rand *= 101;
+ rand %= 16777213; // random prime
+ data[i] = (uint8_t)(rand);
+ }
+ uint8_t key[32] = "Setec astronomy;too many secrets";
+ uint8_t nonce[12] = "BurnAfterUse";
+ int counter = 0;
+
+ bool pass = test_chacha(data, len, key, nonce, true);
+
+ if (pass) {
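+ // sweep irregular lengths (1, 2, 4, 7, 11, ...) with fresh random keys and nonces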
+ for (int i = 1, len = 1; len < 1000; len += i++) {
+ fread(key, 32, 1, f);
+ fread(nonce, 12, 1, f);
+ if (!test_chacha(data, len, key, nonce, false)) {
+ printf("Failed with len=%d\n", len);
+ pass = false;
+ break;
+ }
+ }
+ }
+
+ if (pass) {
+ printf("chacha %s\n", pass_str);
+ } else {
+ printf("chacha %s\n", fail_str);
+ }
+}
+
+bool test_poly(const uint8_t* data, size_t len, const uint8_t key[32], bool verbose) {
+ extern uint64_t vector_poly1305(const uint8_t* in, size_t len,
+ const uint8_t key[32], uint8_t sig[16]);
+
+ poly1305_state state;
+ uint8_t *sig = malloc(16); // gets corrupted if I define it on the stack?
+ uint64_t start = instruction_counter();
+ boring_poly1305_init(&state, key);
+ boring_poly1305_update(&state, data, len);
+ boring_poly1305_finish(&state, sig);
+ uint64_t end = instruction_counter();
+ uint64_t boring_count = end - start;
+
+ uint8_t *sig2 = malloc(16);
+ start = instruction_counter();
+ uint64_t mid = vector_poly1305(data, len, key, sig2);
+ end = instruction_counter();
+
+ bool pass = memcmp(sig, sig2, 16) == 0;
+
+ if (verbose || !pass) {
+ printf("boring mac: ");
+ println_hex(sig, 16);
+ printf("inst_count=%d, inst/byte=%.02f\n", boring_count, (float)(boring_count)/len);
+ printf("vector mac: ");
+ println_hex(sig2, 16);
+ printf("precomputation=%d, processing=%d, inst/byte=%.02f\n",
+ mid - start, end - mid, (float)(end - mid)/len);
+ }
+
+ free(sig);
+ free(sig2);
+ return pass;
+}
+
+void test_polys(FILE* f) {
+ const int big_len = 1024;
+ uint8_t *zero = malloc(2000);
+ uint8_t *max_bits = malloc(big_len);
+ memset(max_bits, 0xff, big_len);
+ const uint8_t one[32] = {1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+ const uint8_t key[32] = {1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 255,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+ const uint8_t data[272] = "Setec astronomy;too many secrets";
+ bool pass = test_poly(max_bits, big_len, max_bits, true);
+
+ if (!pass)
+ goto end;
+
+ // random test
+ const int max_len = 1000;
+ uint8_t *rand = malloc(max_len);
+ for (int len = 16; len <= max_len; len += 16) {
+ fread((uint8_t*)key, 32, 1, f);
+ fread((uint8_t*)rand, len, 1, f);
+ if (!test_poly(rand, len, key, false)) {
+ printf("failed random input len=%d\n", len);
+ pass = false;
+ break;
+ }
+ }
+ free(rand);
+
+ end:
+ if (pass) {
+ printf("poly %s\n", pass_str);
+ } else {
+ printf("poly %s\n", fail_str);
+ }
+
+ free(zero);
+ free(max_bits);
+}
+
+int main(int argc, char *argv[]) {
+ extern uint32_t vlmax_u32();
+ printf("VLMAX in blocks: %d\n", vlmax_u32());
+ FILE* rand = fopen("/dev/urandom", "r");
+ test_chachas(rand);
+ printf("\n");
+ test_polys(rand);
+ fclose(rand);
+}
diff --git a/thirdparty/rvv-chacha-poly/test.sh b/thirdparty/rvv-chacha-poly/test.sh
new file mode 100755
index 0000000..6151fff
--- /dev/null
+++ b/thirdparty/rvv-chacha-poly/test.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Dependencies to be installed and on the PATH:
+# https://github.com/riscv/riscv-gnu-toolchain
+# https://github.com/riscv/riscv-isa-sim
+# configure --prefix=$RISCV --with-varch=v512:e64
+# https://github.com/riscv/riscv-pk
+
+ISA=rv64gcv
+
+riscv64-unknown-elf-gcc -march=$ISA main.c boring.c vchacha.s vpoly.s -o main -O &&
+ spike --isa=$ISA `which pk` main
diff --git a/thirdparty/rvv-chacha-poly/vchacha.s b/thirdparty/rvv-chacha-poly/vchacha.s
new file mode 100644
index 0000000..e09696d
--- /dev/null
+++ b/thirdparty/rvv-chacha-poly/vchacha.s
@@ -0,0 +1,267 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+.global instruction_counter
+.global vector_chacha20
+.global vlmax_u32
+
+instruction_counter:
+ rdinstret a0
+ ret
+
+vlmax_u32:
+ vsetvli a0, x0, e32, m1, ta, ma
+ ret
+
+
+# Cell-based implementation strategy:
+# v0-v15: Cell vectors. Each element is from a different block
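+# (so lane i of v0-v15 holds the full 16-word ChaCha state of block i, and each
+# vector instruction in the round loop advances every in-flight block at once)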
+
+## Function initialization
+# Using the same order as the boring chacha arguments:
+# a0 = uint8_t *out
+# a1 = uint8_t *in
+# a2 = size_t in_len
+# a3 = uint8_t key[32]
+# a4 = uint8_t nonce[12]
+# a5 = uint32_t counter
+vector_chacha20:
+ # a2 = initial length in bytes
+ # t3 = remaining 64-byte blocks to mix
+ # t4 = remaining full blocks to read/write
+ # (if t3 and t4 are different by one, there is a partial block to manually xor)
+ # t1 = vl in 64-byte blocks
+ srli t4, a2, 6
+ addi t0, a2, 63
+ srli t3, t0, 6
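+ # (t4 = floor(len/64) full blocks; t3 = ceil(len/64) blocks, via the add-63 trick)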
+encrypt_blocks:
+ # initialize vector state
+ vsetvli t1, t3, e32, m1, ta, ma
+ # Load 128 bit constant
+ li t0, 0x61707865 # "expa" little endian
+ vmv.v.x v0, t0
+ li t0, 0x3320646e # "nd 3" little endian
+ vmv.v.x v1, t0
+ li t0, 0x79622d32 # "2-by" little endian
+ vmv.v.x v2, t0
+ li t0, 0x6b206574 # "te k" little endian
+ vmv.v.x v3, t0
+ # Load key
+ lw t0, 0(a3)
+ vmv.v.x v4, t0
+ lw t0, 4(a3)
+ vmv.v.x v5, t0
+ lw t0, 8(a3)
+ vmv.v.x v6, t0
+ lw t0, 12(a3)
+ vmv.v.x v7, t0
+ lw t0, 16(a3)
+ vmv.v.x v8, t0
+ lw t0, 20(a3)
+ vmv.v.x v9, t0
+ lw t0, 24(a3)
+ vmv.v.x v10, t0
+ lw t0, 28(a3)
+ vmv.v.x v11, t0
+ # Load counter, and increment for each element
+ vid.v v12
+ vadd.vx v12, v12, a5
+ # Load nonce
+ lw t0, 0(a4)
+ vmv.v.x v13, t0
+ lw t0, 4(a4)
+ vmv.v.x v14, t0
+ lw t0, 8(a4)
+ vmv.v.x v15, t0
+
+ li t2, 10 # loop counter
+round_loop:
+
+ .macro vrotl a i r
+#if __riscv_zvbb
+ vror.vi \a, \a, 32-\i
+#else
+ vsll.vi v16, \a, \i
+ vsrl.vi \a, \a, 32-\i
+ vor.vv \a, \a, v16
+#endif
+ .endm
+
+ .macro quarterround a b c d
+ # a += b; d ^= a; d <<<= 16;
+ vadd.vv \a, \a, \b
+ vxor.vv \d, \d, \a
+ vrotl \d, 16, t6
+ # c += d; b ^= c; b <<<= 12;
+ vadd.vv \c, \c, \d
+ vxor.vv \b, \b, \c
+ vrotl \b, 12, t7
+ # a += b; d ^= a; d <<<= 8;
+ vadd.vv \a, \a, \b
+ vxor.vv \d, \d, \a
+ vrotl \d, 8, t8
+ # c += d; b ^= c; b <<<= 7;
+ vadd.vv \c, \c, \d
+ vxor.vv \b, \b, \c
+ vrotl \b, 7, t9
+ .endm
+
+ # Mix columns.
+ quarterround v0, v4, v8, v12
+ quarterround v1, v5, v9, v13
+ quarterround v2, v6, v10, v14
+ quarterround v3, v7, v11, v15
+ # Mix diagonals.
+ quarterround v0, v5, v10, v15
+ quarterround v1, v6, v11, v12
+ quarterround v2, v7, v8, v13
+ quarterround v3, v4, v9, v14
+
+ addi t2, t2, -1
+ bnez t2, round_loop
+
+ # Add in initial block values.
+ # 128 bit constant
+ li t0, 0x61707865 # "expa" little endian
+ vadd.vx v0, v0, t0
+ li t0, 0x3320646e # "nd 3" little endian
+ vadd.vx v1, v1, t0
+ li t0, 0x79622d32 # "2-by" little endian
+ vadd.vx v2, v2, t0
+ li t0, 0x6b206574 # "te k" little endian
+ vadd.vx v3, v3, t0
+ # Add key
+ lw t0, 0(a3)
+ vadd.vx v4, v4, t0
+ lw t0, 4(a3)
+ vadd.vx v5, v5, t0
+ lw t0, 8(a3)
+ vadd.vx v6, v6, t0
+ lw t0, 12(a3)
+ vadd.vx v7, v7, t0
+ lw t0, 16(a3)
+ vadd.vx v8, v8, t0
+ lw t0, 20(a3)
+ vadd.vx v9, v9, t0
+ lw t0, 24(a3)
+ vadd.vx v10, v10, t0
+ lw t0, 28(a3)
+ vadd.vx v11, v11, t0
+ # Add counter
+ vid.v v16
+ vadd.vv v12, v12, v16
+ vadd.vx v12, v12, a5
+ # Load nonce
+ lw t0, 0(a4)
+ vadd.vx v13, v13, t0
+ lw t0, 4(a4)
+ vadd.vx v14, v14, t0
+ lw t0, 8(a4)
+ vadd.vx v15, v15, t0
+
+ # load in vector lanes with two strided segment loads
+ # in case this is the final block, reset vl to full blocks
+ vsetvli t5, t4, e32, m1, ta, ma
+ li t0, 64
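+ # with a 64-byte stride, each vlsseg8e32 gathers 8 consecutive 32-bit words per
+ # block: words 0-7 of every block land in v16-v23, and the second load (base+32)
+ # puts words 8-15 in v24-v31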
+ vlsseg8e32.v v16, (a1), t0
+ add a1, a1, 32
+ vlsseg8e32.v v24, (a1), t0
+ add a1, a1, -32
+
+ # xor in state
+ vxor.vv v16, v16, v0
+ vxor.vv v17, v17, v1
+ vxor.vv v18, v18, v2
+ vxor.vv v19, v19, v3
+ vxor.vv v20, v20, v4
+ vxor.vv v21, v21, v5
+ vxor.vv v22, v22, v6
+ vxor.vv v23, v23, v7
+ vxor.vv v24, v24, v8
+ vxor.vv v25, v25, v9
+ vxor.vv v26, v26, v10
+ vxor.vv v27, v27, v11
+ vxor.vv v28, v28, v12
+ vxor.vv v29, v29, v13
+ vxor.vv v30, v30, v14
+ vxor.vv v31, v31, v15
+
+ # write back out with 2 strided segment stores
+ vssseg8e32.v v16, (a0), t0
+ add a0, a0, 32
+ vssseg8e32.v v24, (a0), t0
+ add a0, a0, -32
+
+ # update counters/pointers
+ slli t0, t5, 6 # current VL in bytes
+ add a0, a0, t0 # advance output pointer
+ add a1, a1, t0 # advance input pointer
+ sub a2, a2, t0 # decrement remaining bytes
+ sub t3, t3, t1 # decrement remaining blocks
+ sub t4, t4, t1 # decrement remaining blocks
+ # TODO: crash if counter overflows
+ add a5, a5, t1 # increment counter
+
+ # loop again if we have remaining blocks
+ bne x0, t3, encrypt_blocks
+
+ # we're done if there are no more remaining bytes from a partial block
+ beq zero, a2, return
+
+ # to get the remaining partial block, we transfer the nth element of
+ # all the state vectors into contiguous stack memory with vsseg, then
+ # read them with byte-granularity vl
+
+ # reconstruct vl for all computed blocks
+ add t0, t3, t1
+ vsetvli t0, t0, e32, m1, ta, ma
+ add t0, t0, -1
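+ # t0 now indexes the lane holding the final, partially-consumed block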
+
+ #vse.v v4, (a0)
+ #j return
+
+ # use a masked vsseg instead of sliding everything down?
+ # both options seem like they might touch a lot of vector state...
+ vslidedown.vx v16, v0, t0
+ vslidedown.vx v17, v1, t0
+ vslidedown.vx v18, v2, t0
+ vslidedown.vx v19, v3, t0
+ vslidedown.vx v20, v4, t0
+ vslidedown.vx v21, v5, t0
+ vslidedown.vx v22, v6, t0
+ vslidedown.vx v23, v7, t0
+ vslidedown.vx v24, v8, t0
+ vslidedown.vx v25, v9, t0
+ vslidedown.vx v26, v10, t0
+ vslidedown.vx v27, v11, t0
+ vslidedown.vx v28, v12, t0
+ vslidedown.vx v29, v13, t0
+ vslidedown.vx v30, v14, t0
+ vslidedown.vx v31, v15, t0
+ li t0, 1
+ vsetvli zero, t0, e32, m1, ta, ma
+ addi t0, sp, -64
+ addi t1, sp, -32
+ vsseg8e32.v v16, (t0)
+ vsseg8e32.v v24, (t1)
+
+ vsetvli a2, a2, e8, m8, ta, ma
+ vle8.v v0, (a1)
+ vle8.v v8, (t0)
+ vxor.vv v0, v0, v8
+ vse8.v v0, (a0)
+
+
+return:
+ ret
diff --git a/thirdparty/rvv-chacha-poly/vpoly.s b/thirdparty/rvv-chacha-poly/vpoly.s
new file mode 100644
index 0000000..a446b34
--- /dev/null
+++ b/thirdparty/rvv-chacha-poly/vpoly.s
@@ -0,0 +1,497 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+.global vector_poly1305
+# poly1305
+# Based on the obvious SIMD algorithm, described as Goll-Gueron here:
+# https://eprint.iacr.org/2019/842.pdf
+# Assumes VLEN is a power of 2, and that intermediate vsetvl will always return the max.
+# Hash is defined simply, for 32-byte key split between 16-byte s and r:
+# s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r mod 2¹³⁰ - 5
+# Performant implementations represent 130 bit numbers as 5 26-bit numbers.
+# Precomputation step:
+# Compute vector [r, r², r³, r⁴, ...] ( 5 32-bit vectors)
+# Compute scalar r^VLMAX (5 32-bit registers)
+# This can be done in 2*log2(VLMAX) multiplications:
+# i = 1; m = r; v = r
+# while i < VLMAX:
+# v *= m (masking out the last i elements)
+# m *= m
+# i <<= 1
+# Vector loop:
+# load segment (from the end) into 4 32-bit vectors
+# spread into standard 5 32-bit vector format
+# vector multiply into polynomial vector
+# vector add into sum so far
+# vector-scalar multiply polynomial vector with r^VLMAX
+# Extract:
+# vector sum reduce polynomial vector into scalar
+# add to s
+# extract 16-byte hash
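+#
+# Worked example (assuming VLMAX=4 and 8 blocks m1..m8): the lanes first hold
+# [m1, m2, m3, m4]; one loop iteration gives [m1*r^4+m5, ..., m4*r^4+m8]; the
+# final multiply by [r^4, r^3, r^2, r] and sum-reduce yields
+# m1*r^8 + m2*r^7 + ... + m8*r, matching the definition above.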
+
+# TODO: implement this with C intrinsics in rvv_vector.h, as register
+# allocation is actually annoying.
+
+# Generic 130-bit multiply/mod code
+# Reads 5-limbed inputs from a and b, writes result to a
+# Uses 5 e64,m2 d registers for accumulation
+.macro vec_mul130 x a0 a1 a2 a3 a4 b0 b1 b2 b3 b4 b51 b52 b53 b54 d0 d1 d2 d3 d4 carry tmp v mask=""
+ # Helpful diagram from http://loup-vaillant.fr/tutorials/poly1305-design
+ # a4 a3 a2 a1 a0
+ # × b4 b3 b2 b1 b0
+ # ---------------------------------------
+ # a4×b0 a3×b0 a2×b0 a1×b0 a0×b0
+ # + a3×b1 a2×b1 a1×b1 a0×b1 5×a4×b1
+ # + a2×b2 a1×b2 a0×b2 5×a4×b2 5×a3×b2
+ # + a1×b3 a0×b3 5×a4×b3 5×a3×b3 5×a2×b3
+ # + a0×b4 5×a4×b4 5×a3×b4 5×a2×b4 5×a1×b4
+ # ---------------------------------------
+ # d4 d3 d2 d1 d0
+
+ # would it be more/less performant to do this by rows instead of columns?
+ # vectors pipelining without requiring stalls etc
+ # d0 column
+ vwmulu.\v \d0, \a0, \b0 \mask
+ vwmaccu.\v \d0, \b51, \a4 \mask
+ vwmaccu.\v \d0, \b52, \a3 \mask
+ vwmaccu.\v \d0, \b53, \a2 \mask
+ vwmaccu.\v \d0, \b54, \a1 \mask
+
+ # d1 column
+ vwmulu.\v \d1, \a1, \b0 \mask
+ vwmaccu.\v \d1, \b1, \a0 \mask
+ vwmaccu.\v \d1, \b52, \a4 \mask
+ vwmaccu.\v \d1, \b53, \a3 \mask
+ vwmaccu.\v \d1, \b54, \a2 \mask
+
+ # d2 column
+ vwmulu.\v \d2, \a2, \b0 \mask
+ vwmaccu.\v \d2, \b1, \a1 \mask
+ vwmaccu.\v \d2, \b2, \a0 \mask
+ vwmaccu.\v \d2, \b53, \a4 \mask
+ vwmaccu.\v \d2, \b54, \a3 \mask
+
+ # d3 column
+ vwmulu.\v \d3, \a3, \b0 \mask
+ vwmaccu.\v \d3, \b1, \a2 \mask
+ vwmaccu.\v \d3, \b2, \a1 \mask
+ vwmaccu.\v \d3, \b3, \a0 \mask
+ vwmaccu.\v \d3, \b54, \a4 \mask
+
+ # d4 column
+ vwmulu.\v \d4, \a4, \b0 \mask
+ vwmaccu.\v \d4, \b1, \a3 \mask
+ vwmaccu.\v \d4, \b2, \a2 \mask
+ vwmaccu.\v \d4, \b3, \a1 \mask
+ vwmaccu.\v \d4, \b4, \a0 \mask
+
+ # Carry propagation
+ # logic copied from https://github.com/floodyberry/poly1305-donna
+ li t0, 0x3ffffff
+ .macro carry_prop\x a d
+ vwaddu.wv \d, \d, \carry \mask
+ vnsrl.wi \carry, \d, 26 \mask
+ vnsrl.wi \a, \d, 0 \mask
+ vand.vx \a, \a, t0 \mask
+ .endm
+
+ vmv.v.i \carry, 0
+ carry_prop\x \a0, \d0
+ carry_prop\x \a1, \d1
+ carry_prop\x \a2, \d2
+ carry_prop\x \a3, \d3
+ carry_prop\x \a4, \d4
+
+ # wraparound carry continue
+ vsll.vi \tmp, \carry, 2 \mask
+ vadd.vv \a0, \a0, \tmp \mask
+ vadd.vv \a0, \a0, \carry \mask
+ # boring stops carrying here, but that fails random tests
+ vsrl.vi \carry, \a0, 26 \mask
+ vand.vx \a0, \a0, t0 \mask
+ vadd.vv \a1, \a1, \carry \mask
+
+ .endm
+
+# Scalar 130-bit a0-4 = a0-4 * a0-4
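+# (a squaring, so the schoolbook cross terms pair up and are doubled,
+# e.g. d0 = a0*a0 + 2*(a1*(5*a4) + a2*(5*a3)))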
+.macro scalar_mul130 x a0 a1 a2 a3 a4 a51 a52 a53 a54 d0 d1 d2 d3 d4 carry tmp
+ # d0 column
+ mul \d0, \a1, \a54
+ mul \tmp, \a2, \a53
+ add \d0, \d0, \tmp
+ slli \d0, \d0, 1
+ mul \tmp, \a0, \a0
+ add \d0, \d0, \tmp
+
+ # d1 column
+ mul \d1, \a1, \a0
+ mul \tmp, \a2, \a54
+ add \d1, \d1, \tmp
+ slli \d1, \d1, 1
+ mul \tmp, \a53, \a3
+ add \d1, \d1, \tmp
+
+ # d2 column
+ mul \d2, \a2, \a0
+ mul \tmp, \a53, \a4
+ add \d2, \d2, \tmp
+ slli \d2, \d2, 1
+ mul \tmp, \a1, \a1
+ add \d2, \d2, \tmp
+
+ # d3 column
+ mul \d3, \a3, \a0
+ mul \tmp, \a1, \a2
+ add \d3, \d3, \tmp
+ slli \d3, \d3, 1
+ mul \tmp, \a54, \a4
+ add \d3, \d3, \tmp
+
+ # d4 column
+ mul \d4, \a4, \a0
+ mul \tmp, \a1, \a3
+ add \d4, \d4, \tmp
+ slli \d4, \d4, 1
+ mul \tmp, \a2, \a2
+ add \d4, \d4, \tmp
+
+ # Carry propagation
+ # logic copied from https://github.com/floodyberry/poly1305-donna
+ li \tmp, 0x3ffffff
+ .macro carry_prop_scalar\x a d
+ add \d, \d, \carry
+ srli \carry, \d, 26
+ and \a, \d, \tmp
+ .endm
+
+ li \carry, 0
+ carry_prop_scalar\x \a0, \d0
+ carry_prop_scalar\x \a1, \d1
+ carry_prop_scalar\x \a2, \d2
+ carry_prop_scalar\x \a3, \d3
+ carry_prop_scalar\x \a4, \d4
+
+ # wraparound carry continue
+ slli \tmp, \carry, 2
+ add \a0, \a0, \tmp
+ add \a0, \a0, \carry
+ # carry as much as the other mul code
+ srli \carry, \a0, 26
+ li \tmp, 0x3ffffff
+ and \a0, \a0, \tmp
+ add \a1, \a1, \carry
+
+ # Store a*5 registers for next time
+ slli \a51, \a1, 2
+ add \a51, \a51, \a1
+ slli \a52, \a2, 2
+ add \a52, \a52, \a2
+ slli \a53, \a3, 2
+ add \a53, \a53, \a3
+ slli \a54, \a4, 2
+ add \a54, \a54, \a4
+
+ .endm
+
+# Argument mappings
+# a0: const uint8_t* in
+# a1: size_t len
+# a2: const uint8_t[32] key
+# a3: uint8_t[16] sig
+# Register mappings (https://en.wikichip.org/wiki/risc-v/registers)
+# r^vlmax: s0, s1, s2, s3, s4
+# [r^vlmax, r^(vlmax-1), ... r^2, r]: v6, v7, v8, v9, v10
+# current accumulated vector state: v1, v2, v3, v4, v5
+vector_poly1305:
+ # save registers
+ sd s0, -8(sp)
+ sd s1, -16(sp)
+ sd s2, -24(sp)
+ sd s3, -32(sp)
+ sd s4, -40(sp)
+ sd s5, -48(sp)
+ sd s6, -56(sp)
+ sd s7, -64(sp)
+ sd s8, -72(sp)
+ sd s9, -80(sp)
+ sd s11, -88(sp)
+
+ # make sure input is a multiple of blocksize
+ andi t0, a1, 0xf
+ beq t0, zero, continue
+ li t0, 0x3713 # magic error number
+ sw t0, (a3)
+ j return
+continue:
+
+ # load R and spread to 5 26-bit limbs: s0-4
+ ld t0, 0(a2)
+ ld t1, 8(a2)
+ li t5, 0x0ffffffc0fffffff
+ and t0, t0, t5
+ li t5, 0x0ffffffc0ffffffc
+ and t1, t1, t5
+ li t5, 0x3ffffff
+ and s0, t0, t5
+ srli s1, t0, 26
+ and s1, s1, t5
+ srli s2, t0, 52
+ slli t0, t1, 12
+ or s2, s2, t0
+ and s2, s2, t5
+ srli s3, t1, 14
+ and s3, s3, t5
+ srli s4, t1, 40
+
+ # pre-multiplied-by-5 scalars
+ slli t4, s3, 2
+ add t4, t4, s3
+ slli t5, s4, 2
+ add t5, t5, s4
+
+ # a5 is vlmax-1 for e32m1
+ li t0, -1
+ vsetvli a5, t0, e32, m1, ta, mu
+ addi a5, a5, -1 # vlmax-1
+ # initialize vector to r^1
+ vmv.v.x v6, s0
+ vmv.v.x v7, s1
+ vmv.v.x v8, s2
+ vmv.v.x v9, s3
+ vmv.v.x v10, s4
+
+ # Do first iteration manually, as we can masked set r^2 instead of doing a second multiplication
+ # a4 is current exp
+ li a4, 1
+ # set alternating mask pattern
+ vid.v v1
+ vrsub.vx v1, v1, a5
+ vand.vx v1, v1, a4
+ vmseq.vx v0, v1, a4
+ slli a4, a4, 1
+
+ # scalar-scalar 130bit mul: s0-4 = s0-4 * s0-4
+ scalar_mul130 1 s0 s1 s2 s3 s4 t2 t3 t4 t5 s5 s6 s7 s8 s9 t0 t1
+
+ vmv.v.i v11, 0 # no vmv with mask, so vor with 0
+ vor.vx v6, v11, s0, v0.t
+ vor.vx v7, v11, s1, v0.t
+ vor.vx v8, v11, s2, v0.t
+ vor.vx v9, v11, s3, v0.t
+ vor.vx v10, v11, s4, v0.t
+
+precomp:
+ # compute mask (v0)
+ # exp-1: 7,6,5,4,3,2,1,0 (a5)
+ # r^1: 1,0,1,0,1,0,1,0
+ # r^2: 1,1,0,0,1,1,0,0
+ # r^4: 1,1,1,1,0,0,0,0
+ vid.v v1
+ vrsub.vx v1, v1, a5
+ vand.vx v1, v1, a4
+ vmseq.vx v0, v1, a4
+
+ # vector-scalar masked 130bit mul: v6-10 = v6-10 * s0-4
+ vec_mul130 vxm v6 v7 v8 v9 v10 s0 s1 s2 s3 s4 t2 t3 t4 t5 v12 v14 v16 v18 v20 v11 v22 vx ",v0.t"
+
+ # scalar-scalar 130bit mul: s0-4 = s0-4 * s0-4
+ scalar_mul130 2 s0 s1 s2 s3 s4 t2 t3 t4 t5 s5 s6 s7 s8 s9 t0 t1
+
+ # end of precomp loop:
+ slli a4, a4, 1 # double exponent
+ blt a4, a5, precomp
+
+ # store post-precomputation instruction counter
+ rdinstret s11
+
+
+ # From v11-14, separate out into 5 26-bit limbs: v20-v24
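+ # (v11-v14 hold one 128-bit block per lane as four 32-bit words from vlseg4e32;
+ # limb k takes bits 26k..26k+25, the top limb holding only 24 bits until the
+ # leading bit is OR'd in below)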
+ .macro vec_split5
+ li t0, 0x3ffffff
+ vand.vx v20, v11, t0
+ vsrl.vi v11, v11, 26
+ vsll.vi v31, v12, 6
+ vor.vv v11, v11, v31
+ vand.vx v21, v11, t0
+ vsrl.vi v12, v12, 20
+ vsll.vi v31, v13, 12
+ vor.vv v12, v12, v31
+ vand.vx v22, v12, t0
+ vsrl.vi v13, v13, 14
+ vsll.vi v31, v14, 18
+ vor.vv v13, v13, v31
+ vand.vx v23, v13, t0
+ vsrl.vi v24, v14, 8
+ .endm
+
+ # set up state as initial leading zero step
+ vmv.v.i v1, 0
+ vmv.v.i v2, 0
+ vmv.v.i v3, 0
+ vmv.v.i v4, 0
+ vmv.v.i v5, 0
+ # a1: bytes left
+ # a4: blocks left
+ srli a4, a1, 4
+ # t1: blocks in initial step
+ # use a full vector here, if blocks are a multiple of vector size
+ addi a4, a4, -1
+ and t1, a4, a5
+ addi a4, a4, 1
+ addi t1, t1, 1
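+ # i.e. t1 = ((blocks-1) mod VLMAX) + 1, relying on VLMAX being a power of two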
+
+ vsetvli t1, t1, e32, m1, ta, ma
+ vlseg4e32.v v11, (a0)
+ # increment pointer
+ slli t0, t1, 4
+ add a0, a0, t0
+ sub a1, a1, t0
+ vec_split5
+ # add leading bit
+ # TODO: don't run vector version if we can't even fill the first vector
+ li t0, 1<<24
+ vor.vx v24, v24, t0
+
+ li t0, -1
+ vsetvli a5, t0, e32, m1, ta, ma
+ sub t0, a5, t1
+ slli a5, a5, 4 # block size in bytes
+ vslideup.vx v1, v20, t0
+ vslideup.vx v2, v21, t0
+ vslideup.vx v3, v22, t0
+ vslideup.vx v4, v23, t0
+ vslideup.vx v5, v24, t0
+
+
+vector_loop:
+ beq a1, zero, end_vector_loop
+
+ # multiply by r^vlmax
+ vec_mul130 vx v1 v2 v3 v4 v5 s0 s1 s2 s3 s4 t2 t3 t4 t5 v12 v14 v16 v18 v20 v11 v22 vx
+
+ # load in new data: v11-v14
+ vlseg4e32.v v11, (a0)
+ add a0, a0, a5
+ sub a1, a1, a5
+ vec_split5
+ # add leading bit
+ # TODO: support final non-full block correctly
+ li t0, 1<<24
+ vor.vx v24, v24, t0
+
+ # add into state
+ vadd.vv v1, v1, v20
+ vadd.vv v2, v2, v21
+ vadd.vv v3, v3, v22
+ vadd.vv v4, v4, v23
+ vadd.vv v5, v5, v24
+
+ j vector_loop
+end_vector_loop:
+
+ # multiply in [r^vlmax, r^(vlmax-1),... r^2, r]
+ vsll.vi v27, v7, 2
+ vadd.vv v27, v27, v7
+ vsll.vi v28, v8, 2
+ vadd.vv v28, v28, v8
+ vsll.vi v29, v9, 2
+ vadd.vv v29, v29, v9
+ vsll.vi v30, v10, 2
+ vadd.vv v30, v30, v10
+ vec_mul130 vv v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 v27 v28 v29 v30 v12 v14 v16 v18 v20 v11 v22 vv
+
+ # vector reduction, into widened sum in case vector is huge
+ vmv.v.i v6, 0
+ vmv.v.i v7, 0
+ vmv.v.i v8, 0
+ vmv.v.i v9, 0
+ vmv.v.i v10, 0
+ vwredsum.vs v6, v1, v6
+ vwredsum.vs v7, v2, v7
+ vwredsum.vs v8, v3, v8
+ vwredsum.vs v9, v4, v9
+ vwredsum.vs v10, v5, v10
+ # extract to scalars
+ li t0, 1
+ vsetvli zero, t0, e64, m1, ta, ma
+ vmv.x.s s0, v6
+ vmv.x.s s1, v7
+ vmv.x.s s2, v8
+ vmv.x.s s3, v9
+ vmv.x.s s4, v10
+
+ # carry through
+ # t0=carry t1=mask
+ li t0, 0
+ li t1, 0x3ffffff
+ .macro carry_scalar s
+ add \s, \s, t0
+ srli t0, \s, 26
+ and \s, \s, t1
+ .endm
+
+ carry_scalar s0
+ carry_scalar s1
+ carry_scalar s2
+ carry_scalar s3
+ carry_scalar s4
+ # carry *= 5
+ slli t2, t0, 2
+ add t0, t0, t2
+ carry_scalar s0
+ carry_scalar s1
+ carry_scalar s2
+ carry_scalar s3
+ carry_scalar s4
+ # any remaining stuff to carry has to be in the 2 bits we don't care about, right?
+ bne t0, zero, return
+
+ # collapse into contiguous 128 bits (s0,s2)
+ slli t0, s1, 26
+ or s0, s0, t0
+ slli t0, s2, 52
+ or s0, s0, t0
+ srli s2, s2, 12
+ slli t0, s3, 14
+ or s2, s2, t0
+ slli t0, s4, 40
+ or s2, s2, t0
+
+ # add in other half of key (after the carry it seems)
+ ld t0, 16(a2)
+ ld t1, 24(a2)
+ add s0, s0, t0
+ sltu t0, s0, t0
+ add s2, s2, t0
+ add s2, s2, t1
+
+ # write final signature
+ sd s0, 0(a3)
+ sd s2, 8(a3)
+
+return:
+ # restore registers
+ mv a0, s11
+ ld s0, -8(sp)
+ ld s1, -16(sp)
+ ld s2, -24(sp)
+ ld s3, -32(sp)
+ ld s4, -40(sp)
+ ld s5, -48(sp)
+ ld s6, -56(sp)
+ ld s7, -64(sp)
+ ld s8, -72(sp)
+ ld s9, -80(sp)
+ ld s11, -88(sp)
+ ret