From e1de2b3f5b7065e50a4e39e13e90c33366e9c15d Mon Sep 17 00:00:00 2001 From: Mikhail Bezoyan Date: Thu, 17 Oct 2024 13:51:51 +0000 Subject: [PATCH] scrooge: Optimize serialization performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problem TReusableMemoryTransport uses a ByteArrayInputStream with synchronised methods, the transport is used from a single thread and synchronization is not really required but it adds some overhead especially on newer JVMs without biased locking. Additionally this introduces additional level of indirection that JIT needs to optimize plus if we controlled the underlying buffer we could use methods from Unsafe to write primitives to the buffer without using an intermediate byte array and without using System.arraycopy which could be expensive for small arrays. Solution Introduce a single threaded implementation of MemoryTransport that directly manages the underlying byte array and also use direct memory writes for primitive types. JDK21 ``` Benchmark (size) Mode Cnt Score Error Units TransportBenchmark.encodeAirlineBaseline N/A thrpt 5 8975.741 ± 73.402 ops/s TransportBenchmark.encodeAirlineByteArrayTransport N/A thrpt 5 14632.672 ± 18.125 ops/s TransportBenchmark.encodeAirlineUnsafeTransport N/A thrpt 5 15742.416 ± 203.313 ops/s TransportBenchmark.encodeDoubleArrayBaseline 5 thrpt 5 3165165.229 ± 865.324 ops/s TransportBenchmark.encodeDoubleArrayBaseline 10 thrpt 5 1996723.513 ± 813.442 ops/s TransportBenchmark.encodeDoubleArrayBaseline 50 thrpt 5 517236.667 ± 9625.233 ops/s TransportBenchmark.encodeDoubleArrayBaseline 500 thrpt 5 55744.128 ± 31.492 ops/s TransportBenchmark.encodeDoubleArrayByteArrayTransport 5 thrpt 5 13034582.982 ± 113618.553 ops/s TransportBenchmark.encodeDoubleArrayByteArrayTransport 10 thrpt 5 7965754.309 ± 11327.671 ops/s TransportBenchmark.encodeDoubleArrayByteArrayTransport 50 thrpt 5 1976067.803 ± 8051.731 ops/s TransportBenchmark.encodeDoubleArrayByteArrayTransport 500 thrpt 5 212641.665 ± 279.586 ops/s TransportBenchmark.encodeDoubleArrayUnsafeTransport 5 thrpt 5 34390166.003 ± 42283.346 ops/s TransportBenchmark.encodeDoubleArrayUnsafeTransport 10 thrpt 5 26869677.811 ± 761342.592 ops/s TransportBenchmark.encodeDoubleArrayUnsafeTransport 50 thrpt 5 8028375.541 ± 75541.740 ops/s TransportBenchmark.encodeDoubleArrayUnsafeTransport 500 thrpt 5 1107122.950 ± 7175.220 ops/s ``` TwitterJDK 11 ``` Benchmark (size) Mode Cnt Score Error Units TransportBenchmark.encodeAirlineBaseline N/A thrpt 5 11765.483 ± 192.296 ops/s TransportBenchmark.encodeAirlineByteArrayTransport N/A thrpt 5 14030.813 ± 49.984 ops/s TransportBenchmark.encodeAirlineUnsafeTransport N/A thrpt 5 15385.809 ± 428.500 ops/s TransportBenchmark.encodeDoubleArrayBaseline 5 thrpt 5 7024298.511 ± 82939.013 ops/s TransportBenchmark.encodeDoubleArrayBaseline 10 thrpt 5 4436471.652 ± 76548.770 ops/s TransportBenchmark.encodeDoubleArrayBaseline 50 thrpt 5 1136314.914 ± 6571.789 ops/s TransportBenchmark.encodeDoubleArrayBaseline 500 thrpt 5 131403.572 ± 1452.554 ops/s TransportBenchmark.encodeDoubleArrayByteArrayTransport 5 thrpt 5 10450745.663 ± 23594.889 ops/s TransportBenchmark.encodeDoubleArrayByteArrayTransport 10 thrpt 5 6542692.999 ± 63437.149 ops/s TransportBenchmark.encodeDoubleArrayByteArrayTransport 50 thrpt 5 1710071.757 ± 10593.509 ops/s TransportBenchmark.encodeDoubleArrayByteArrayTransport 500 thrpt 5 187700.152 ± 969.060 ops/s TransportBenchmark.encodeDoubleArrayUnsafeTransport 5 thrpt 5 25352502.715 ± 65770.557 ops/s TransportBenchmark.encodeDoubleArrayUnsafeTransport 10 thrpt 5 19030775.867 ± 76415.965 ops/s TransportBenchmark.encodeDoubleArrayUnsafeTransport 50 thrpt 5 6069935.386 ± 19759.800 ops/s TransportBenchmark.encodeDoubleArrayUnsafeTransport 500 thrpt 5 698704.098 ± 1804.536 ops/s ``` TwitterJDK with -XX:-UseBiasedLocking ``` Benchmark (size) Mode Cnt Score Error Units TransportBenchmark.encodeAirlineBaseline N/A thrpt 5 8908.851 ± 94.576 ops/s TransportBenchmark.encodeAirlineByteArrayTransport N/A thrpt 5 13334.376 ± 208.466 ops/s TransportBenchmark.encodeAirlineUnsafeTransport N/A thrpt 5 14206.590 ± 120.613 ops/s TransportBenchmark.encodeDoubleArrayBaseline 5 thrpt 5 3305481.887 ± 9922.319 ops/s TransportBenchmark.encodeDoubleArrayBaseline 10 thrpt 5 2152912.133 ± 3121.711 ops/s TransportBenchmark.encodeDoubleArrayBaseline 50 thrpt 5 605487.993 ± 4952.450 ops/s TransportBenchmark.encodeDoubleArrayBaseline 500 thrpt 5 67344.671 ± 51.999 ops/s TransportBenchmark.encodeDoubleArrayByteArrayTransport 5 thrpt 5 9273549.548 ± 5494.655 ops/s TransportBenchmark.encodeDoubleArrayByteArrayTransport 10 thrpt 5 6435802.855 ± 6062.881 ops/s TransportBenchmark.encodeDoubleArrayByteArrayTransport 50 thrpt 5 1712034.742 ± 2193.393 ops/s TransportBenchmark.encodeDoubleArrayByteArrayTransport 500 thrpt 5 187845.588 ± 703.777 ops/s TransportBenchmark.encodeDoubleArrayUnsafeTransport 5 thrpt 5 25114440.253 ± 272593.238 ops/s TransportBenchmark.encodeDoubleArrayUnsafeTransport 10 thrpt 5 19643037.550 ± 207643.475 ops/s TransportBenchmark.encodeDoubleArrayUnsafeTransport 50 thrpt 5 6659417.813 ± 51079.641 ops/s TransportBenchmark.encodeDoubleArrayUnsafeTransport 500 thrpt 5 698239.188 ± 3356.311 ops/s ``` Differential Revision: https://phabricator.twitter.biz/D1176860 --- .../finagle/thrift/ClientFunction.scala | 2 +- .../ThriftRequestSerializer.scala | 17 +++++++++---- .../finagle/thrift/service/ThriftCodec.scala | 24 ++++++++++++++----- 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/finagle-thrift/src/main/scala/com/twitter/finagle/thrift/ClientFunction.scala b/finagle-thrift/src/main/scala/com/twitter/finagle/thrift/ClientFunction.scala index e6d9cb6c18d..4080aebc2a8 100644 --- a/finagle-thrift/src/main/scala/com/twitter/finagle/thrift/ClientFunction.scala +++ b/finagle-thrift/src/main/scala/com/twitter/finagle/thrift/ClientFunction.scala @@ -104,7 +104,7 @@ object ClientFunction { tlReusableBuffer: TReusableBuffer, protocolFactory: TProtocolFactory ): ThriftClientRequest = { - val memoryBuffer = tlReusableBuffer.get() + val memoryBuffer = tlReusableBuffer.take() try { val oprot = protocolFactory.getProtocol(memoryBuffer) diff --git a/finagle-thrift/src/main/scala/com/twitter/finagle/thrift/exp/partitioning/ThriftRequestSerializer.scala b/finagle-thrift/src/main/scala/com/twitter/finagle/thrift/exp/partitioning/ThriftRequestSerializer.scala index c0cc2e280f3..5f5b6718ecc 100644 --- a/finagle-thrift/src/main/scala/com/twitter/finagle/thrift/exp/partitioning/ThriftRequestSerializer.scala +++ b/finagle-thrift/src/main/scala/com/twitter/finagle/thrift/exp/partitioning/ThriftRequestSerializer.scala @@ -2,11 +2,16 @@ package com.twitter.finagle.thrift.exp.partitioning import com.twitter.finagle.Stack import com.twitter.finagle.Thrift.param -import com.twitter.finagle.thrift.{RichClientParam, ThriftClientRequest} -import com.twitter.scrooge.{TReusableBuffer, ThriftStruct, ThriftStructIface} +import com.twitter.finagle.thrift.RichClientParam +import com.twitter.finagle.thrift.ThriftClientRequest +import com.twitter.scrooge.TReusableBuffer +import com.twitter.scrooge.ThriftStruct +import com.twitter.scrooge.ThriftStructIface import java.util import org.apache.thrift.TBase -import org.apache.thrift.protocol.{TMessage, TMessageType, TProtocolFactory} +import org.apache.thrift.protocol.TMessage +import org.apache.thrift.protocol.TMessageType +import org.apache.thrift.protocol.TProtocolFactory /** * Used by ThriftPartitioningService for message fan-out. @@ -34,7 +39,7 @@ private[partitioning] class ThriftRequestSerializer(params: Stack.Params) { args: ThriftStructIface, oneWay: Boolean ): ThriftClientRequest = { - val memoryBuffer = thriftReusableBuffer.get() + val memoryBuffer = thriftReusableBuffer.take() try { val oprot = protocolFactory.getProtocol(memoryBuffer) oprot.writeMessageBegin(new TMessage(methodName, TMessageType.CALL, 0)) @@ -50,6 +55,8 @@ private[partitioning] class ThriftRequestSerializer(params: Stack.Params) { oprot.getTransport().flush() val bytes = util.Arrays.copyOfRange(memoryBuffer.getArray(), 0, memoryBuffer.length()) new ThriftClientRequest(bytes, oneWay) - } finally thriftReusableBuffer.reset() + } finally { + thriftReusableBuffer.reset() + } } } diff --git a/finagle-thrift/src/main/scala/com/twitter/finagle/thrift/service/ThriftCodec.scala b/finagle-thrift/src/main/scala/com/twitter/finagle/thrift/service/ThriftCodec.scala index a93659ba781..9cf6030596a 100644 --- a/finagle-thrift/src/main/scala/com/twitter/finagle/thrift/service/ThriftCodec.scala +++ b/finagle-thrift/src/main/scala/com/twitter/finagle/thrift/service/ThriftCodec.scala @@ -1,13 +1,25 @@ package com.twitter.finagle.thrift.service import com.twitter.finagle.context.Contexts -import com.twitter.finagle.thrift.{ClientDeserializeCtx, ThriftClientRequest, maxReusableBufferSize} -import com.twitter.finagle.{Filter, Service, SourcedException} -import com.twitter.scrooge.{TReusableBuffer, ThriftMethod, ThriftStruct, ThriftStructCodec} -import com.twitter.util.{Future, Return, Throw, Try} +import com.twitter.finagle.thrift.ClientDeserializeCtx +import com.twitter.finagle.thrift.ThriftClientRequest +import com.twitter.finagle.thrift.maxReusableBufferSize +import com.twitter.finagle.Filter +import com.twitter.finagle.Service +import com.twitter.finagle.SourcedException +import com.twitter.scrooge.TReusableBuffer +import com.twitter.scrooge.ThriftMethod +import com.twitter.scrooge.ThriftStruct +import com.twitter.scrooge.ThriftStructCodec +import com.twitter.util.Future +import com.twitter.util.Return +import com.twitter.util.Throw +import com.twitter.util.Try import java.util.Arrays import org.apache.thrift.TApplicationException -import org.apache.thrift.protocol.{TMessage, TMessageType, TProtocolFactory} +import org.apache.thrift.protocol.TMessage +import org.apache.thrift.protocol.TMessageType +import org.apache.thrift.protocol.TProtocolFactory import org.apache.thrift.transport.TMemoryInputTransport object ThriftCodec { @@ -63,7 +75,7 @@ object ThriftCodec { pf: TProtocolFactory, oneway: Boolean ): ThriftClientRequest = { - val buf = tlReusableBuffer.get() + val buf = tlReusableBuffer.take() val oprot = pf.getProtocol(buf) oprot.writeMessageBegin(new TMessage(methodName, TMessageType.CALL, 0))